0.2.3 #99

Merged
sonny merged 112 commits from development into master 2020-05-23 16:58:42 +02:00
3 changed files with 177 additions and 33 deletions
Showing only changes of commit e3840342b3

View file

@@ -1,6 +1,7 @@
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import timedelta
 from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
 from django.db.models.fields import CharField, TextField
@@ -184,6 +185,9 @@ class FeedCollector(Collector):
 class FeedDuplicateHandler:
+    duplicate_fields = ("url", "title", "body", "rule")
+    time_slot_minutes = 10

     def __init__(self, rule):
         self.queryset = rule.posts.all()
@@ -199,39 +203,44 @@ class FeedDuplicateHandler:
     def check(self, instances):
         for instance in instances:
             if instance.remote_identifier in self.existing_identifiers:
-                existing_post = self.handle_duplicate(instance)
+                existing_post = self.handle_duplicate_identifier(instance)
                 yield existing_post
                 continue
-            elif not instance.remote_identifier and self.in_database(instance):
-                continue
+            elif self.in_database(instance):
+                existing_post = self.get_duplicate_in_database(instance)
+                if self.in_time_slot(instance, existing_post):
+                    yield self.update_existing_post(instance, existing_post)
+                    continue
             yield instance

     def in_database(self, post):
-        values = {
-            "url": post.url,
-            "title": post.title,
-            "body": post.body,
-            "publication_date": post.publication_date,
-        }
+        values = {field: getattr(post, field, None) for field in self.duplicate_fields}
-        for existing_post in self.queryset.order_by("-publication_date")[:500]:
+        for existing_post in self.queryset.filter(**values):
             if self.is_duplicate(existing_post, values):
                 return True

+    def in_time_slot(self, instance, existing_post):
+        time_delta_slot = timedelta(minutes=self.time_slot_minutes)
+        time_difference = instance.publication_date - existing_post.publication_date
+        if time_difference <= time_delta_slot:
+            return True
+
     def is_duplicate(self, existing_post, values):
-        for key, value in values.items():
-            existing_value = getattr(existing_post, key, None)
-            if existing_value != value:
-                return False
-        return True
+        return all(
+            getattr(existing_post, field, None) == value
+            for field, value in values.items()
+        )

-    def handle_duplicate(self, instance):
+    def handle_duplicate_identifier(self, instance):
         try:
-            existing_instance = self.queryset.get(
+            existing_post = self.queryset.get(
                 remote_identifier=instance.remote_identifier
             )
         except ObjectDoesNotExist:
@@ -240,17 +249,43 @@ class FeedDuplicateHandler:
             )
             return instance
         except MultipleObjectsReturned:
-            existing_instances = self.queryset.filter(
+            existing_posts = self.queryset.filter(
                 remote_identifier=instance.remote_identifier
             ).order_by("-publication_date")
-            existing_instance = existing_instances.last()
-            existing_instances.exclude(pk=existing_instance.pk).delete()
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()
+        updated_post = self.update_existing_post(instance, existing_post)
+        return updated_post
+
+    def get_duplicate_in_database(self, instance):
+        query_values = {
+            field: getattr(instance, field, None) for field in self.duplicate_fields
+        }
+        try:
+            existing_post = self.queryset.get(**query_values)
+        except ObjectDoesNotExist:
+            logger.error(
+                f"Duplicate handler tried retrieving post {instance.remote_identifier} but failed doing so."
+            )
+            return instance
+        except MultipleObjectsReturned:
+            existing_posts = self.queryset.filter(**query_values).order_by(
+                "-publication_date"
+            )
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()
+        return existing_post
+
     def update_existing_post(self, instance, existing_post):
         for field in instance._meta.get_fields():
-            getattr(existing_instance, field.name, object())
+            getattr(existing_post, field.name, object())
             new_value = getattr(instance, field.name, object())
             if new_value and field.name != "id":
-                setattr(existing_instance, field.name, new_value)
+                setattr(existing_post, field.name, new_value)
-        return existing_instance
+        return existing_post
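Read together, the new methods implement one rule: a post whose remote_identifier is unknown is still folded into an existing post when it agrees on all duplicate_fields and was published within time_slot_minutes of it; otherwise it is yielded as a new post. A minimal sketch of that time-slot comparison outside Django (StubPost and the sample dates are hypothetical; only the 10-minute slot comes from the diff):

    from dataclasses import dataclass
    from datetime import datetime, timedelta

    TIME_SLOT_MINUTES = 10  # mirrors FeedDuplicateHandler.time_slot_minutes

    @dataclass
    class StubPost:
        # Hypothetical stand-in for the Django Post model; only the field
        # the time-slot check actually touches.
        publication_date: datetime

    def in_time_slot(instance, existing_post):
        # Same comparison as FeedDuplicateHandler.in_time_slot: the incoming
        # post counts as a duplicate only if published within the slot.
        time_difference = instance.publication_date - existing_post.publication_date
        return time_difference <= timedelta(minutes=TIME_SLOT_MINUTES)

    existing = StubPost(datetime(2019, 10, 30, 12, 30))
    assert in_time_slot(StubPost(datetime(2019, 10, 30, 12, 38)), existing)      # 8 min -> merged
    assert not in_time_slot(StubPost(datetime(2019, 10, 30, 12, 42)), existing)  # 12 min -> kept as new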

View file

@@ -1,8 +1,13 @@
+from datetime import timedelta
 from django.test import TestCase
 from django.utils import timezone
+from freezegun import freeze_time
 from newsreader.news.collection.feed import FeedDuplicateHandler
 from newsreader.news.collection.tests.factories import CollectionRuleFactory
+from newsreader.news.core.models import Post
 from newsreader.news.core.tests.factories import PostFactory
@@ -25,16 +30,57 @@ class FeedDuplicateHandlerTestCase(TestCase):
             posts_gen = duplicate_handler.check([new_post])
             posts = list(posts_gen)

         self.assertEquals(len(posts), 1)
+        post = posts[0]
+        existing_post.refresh_from_db()
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.read, False)
+
+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_with_different_remote_identifiers(self):
+        rule = CollectionRuleFactory()
+        publication_date = timezone.now()
+        existing_post = PostFactory.create(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
+
+        self.assertEquals(len(posts), 1)
+        existing_post.refresh_from_db()
+        post = posts[0]
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertTrue(post.publication_date != existing_post.publication_date)
+        self.assertTrue(post.title != existing_post.title)
+        self.assertEquals(post.read, False)

     def test_duplicate_entries_in_recent_database(self):
         PostFactory.create_batch(size=10)
         publication_date = timezone.now()
         rule = CollectionRuleFactory()
@@ -43,7 +89,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
-            remote_identifier=None,
+            remote_identifier="jabbadabadoe",
             rule=rule,
         )
         new_post = PostFactory.build(
@@ -59,10 +105,19 @@
             posts_gen = duplicate_handler.check([new_post])
             posts = list(posts_gen)

-        self.assertEquals(len(posts), 0)
+        self.assertEquals(len(posts), 1)
+        existing_post.refresh_from_db()
+        post = posts[0]
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)

     def test_multiple_existing_entries_with_identifier(self):
         timezone.now()
         rule = CollectionRuleFactory()
         PostFactory.create_batch(
@@ -80,4 +135,56 @@ class FeedDuplicateHandlerTestCase(TestCase):
             posts = list(posts_gen)

         self.assertEquals(len(posts), 1)
-        self.assertEquals(posts[0].title, new_post.title)
+        self.assertEquals(
+            Post.objects.filter(
+                remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+            ).count(),
+            1,
+        )
+        post = posts[0]
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.read, False)
+
+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_outside_time_slot(self):
+        publication_date = timezone.now()
+        rule = CollectionRuleFactory()
+        existing_post = PostFactory.create(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date,
+            remote_identifier="jabbadabadoe",
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date + timedelta(minutes=12),
+            remote_identifier=None,
+            rule=rule,
+        )
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
+
+        self.assertEquals(len(posts), 1)
+        existing_post.refresh_from_db()
+        post = posts[0]
+        self.assertEquals(post.pk, None)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)
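For reference, every test above drives the handler through the same pattern; a short usage sketch, where fetched_posts and the surrounding collector wiring are assumptions, not part of this diff:

    # `rule` is a collection rule with a `posts` relation; `fetched_posts`
    # is a hypothetical list of unsaved Post instances parsed from a feed.
    with FeedDuplicateHandler(rule) as duplicate_handler:
        posts_gen = duplicate_handler.check(fetched_posts)
        # Materialize inside the context: check() is a generator that yields
        # updated existing posts for duplicates and untouched instances otherwise.
        posts = list(posts_gen)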

View file

@@ -2,6 +2,7 @@ from datetime import datetime
 from django.utils import timezone
+import pytz
 import requests
 from requests.exceptions import RequestException
@@ -15,7 +16,8 @@ def build_publication_date(dt, tz):
         published_parsed = timezone.make_aware(naive_datetime, timezone=tz)
     except (TypeError, ValueError):
         return None, False
-    return published_parsed, True
+    return published_parsed.astimezone(pytz.utc), True

 def fetch(url):
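The changed return in build_publication_date normalizes every aware datetime to UTC instead of leaving it in the feed's own zone. A small sketch of the difference, assuming pytz and Django are installed (the sample zone and timestamp are arbitrary):

    from datetime import datetime

    import pytz
    from django.utils import timezone

    tz = pytz.timezone("Europe/Amsterdam")
    naive_datetime = datetime(2020, 5, 23, 16, 58)
    published_parsed = timezone.make_aware(naive_datetime, timezone=tz)

    print(published_parsed)                       # 2020-05-23 16:58:00+02:00 (zone-local, old behaviour)
    print(published_parsed.astimezone(pytz.utc))  # 2020-05-23 14:58:00+00:00 (UTC, new behaviour)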