diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py
index 46a7a3b..b14f375 100644
--- a/src/newsreader/news/collection/feed.py
+++ b/src/newsreader/news/collection/feed.py
@@ -1,6 +1,7 @@
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import timedelta
 
 from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
 from django.db.models.fields import CharField, TextField
@@ -184,6 +185,9 @@ class FeedCollector(Collector):
 
 
 class FeedDuplicateHandler:
+    duplicate_fields = ("url", "title", "body", "rule")
+    time_slot_minutes = 10
+
     def __init__(self, rule):
         self.queryset = rule.posts.all()
 
@@ -199,39 +203,44 @@
     def check(self, instances):
         for instance in instances:
             if instance.remote_identifier in self.existing_identifiers:
-                existing_post = self.handle_duplicate(instance)
+                existing_post = self.handle_duplicate_identifier(instance)
                 yield existing_post
                 continue
-            elif not instance.remote_identifier and self.in_database(instance):
-                continue
+            elif self.in_database(instance):
+                existing_post = self.get_duplicate_in_database(instance)
+
+                if self.in_time_slot(instance, existing_post):
+                    yield self.update_existing_post(instance, existing_post)
+                    continue
 
             yield instance
 
 
     def in_database(self, post):
-        values = {
-            "url": post.url,
-            "title": post.title,
-            "body": post.body,
-            "publication_date": post.publication_date,
-        }
+        values = {field: getattr(post, field, None) for field in self.duplicate_fields}
 
-        for existing_post in self.queryset.order_by("-publication_date")[:500]:
+        for existing_post in self.queryset.filter(**values):
             if self.is_duplicate(existing_post, values):
                 return True
 
 
+    def in_time_slot(self, instance, existing_post):
+        time_delta_slot = timedelta(minutes=self.time_slot_minutes)
+
+        time_difference = instance.publication_date - existing_post.publication_date
+
+        if time_difference <= time_delta_slot:
+            return True
+
     def is_duplicate(self, existing_post, values):
-        for key, value in values.items():
-            existing_value = getattr(existing_post, key, None)
-            if existing_value != value:
-                return False
+        return all(
+            getattr(existing_post, field, None) == value
+            for field, value in values.items()
+        )
 
-        return True
-
-    def handle_duplicate(self, instance):
+    def handle_duplicate_identifier(self, instance):
         try:
-            existing_instance = self.queryset.get(
+            existing_post = self.queryset.get(
                 remote_identifier=instance.remote_identifier
             )
         except ObjectDoesNotExist:
@@ -240,17 +249,43 @@
             )
             return instance
         except MultipleObjectsReturned:
-            existing_instances = self.queryset.filter(
+            existing_posts = self.queryset.filter(
                 remote_identifier=instance.remote_identifier
             ).order_by("-publication_date")
-            existing_instance = existing_instances.last()
-            existing_instances.exclude(pk=existing_instance.pk).delete()
-
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()
+
+        updated_post = self.update_existing_post(instance, existing_post)
+
+        return updated_post
+
+    def get_duplicate_in_database(self, instance):
+        query_values = {
+            field: getattr(instance, field, None) for field in self.duplicate_fields
+        }
+
+        try:
+            existing_post = self.queryset.get(**query_values)
+        except ObjectDoesNotExist:
+            logger.error(
+                f"Duplicate handler tried retrieving post {instance.remote_identifier} but failed doing so."
+            )
+            return instance
+        except MultipleObjectsReturned:
+            existing_posts = self.queryset.filter(**query_values).order_by(
+                "-publication_date"
+            )
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()
+
+        return existing_post
+
+    def update_existing_post(self, instance, existing_post):
         for field in instance._meta.get_fields():
-            getattr(existing_instance, field.name, object())
+            getattr(existing_post, field.name, object())
             new_value = getattr(instance, field.name, object())
 
             if new_value and field.name != "id":
-                setattr(existing_instance, field.name, new_value)
+                setattr(existing_post, field.name, new_value)
 
-        return existing_instance
+        return existing_post
diff --git a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py
index b794f3e..005771a 100644
--- a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py
+++ b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py
@@ -1,8 +1,13 @@
+from datetime import timedelta
+
 from django.test import TestCase
 from django.utils import timezone
 
+from freezegun import freeze_time
+
 from newsreader.news.collection.feed import FeedDuplicateHandler
 from newsreader.news.collection.tests.factories import CollectionRuleFactory
+from newsreader.news.core.models import Post
 from newsreader.news.core.tests.factories import PostFactory
 
 
@@ -25,16 +30,57 @@ class FeedDuplicateHandlerTestCase(TestCase):
             posts_gen = duplicate_handler.check([new_post])
             posts = list(posts_gen)
 
+        self.assertEquals(len(posts), 1)
+        post = posts[0]
+        existing_post.refresh_from_db()
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.read, False)
+
+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_with_different_remote_identifiers(self):
+        rule = CollectionRuleFactory()
+        publication_date = timezone.now()
+
+        existing_post = PostFactory.create(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
+
         self.assertEquals(len(posts), 1)
+
+        existing_post.refresh_from_db()
         post = posts[0]
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
         self.assertEquals(post.publication_date, new_post.publication_date)
-        self.assertTrue(post.publication_date != existing_post.publication_date)
-        self.assertTrue(post.title != existing_post.title)
+        self.assertEquals(post.read, False)
 
     def test_duplicate_entries_in_recent_database(self):
-        PostFactory.create_batch(size=10)
-
         publication_date = timezone.now()
 
         rule = CollectionRuleFactory()
 
@@ -43,7 +89,7 @@
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
-            remote_identifier=None,
+            remote_identifier="jabbadabadoe",
             rule=rule,
         )
         new_post = PostFactory.build(
@@ -59,10 +105,19 @@
            posts_gen = duplicate_handler.check([new_post])
            posts = list(posts_gen)
 
-        self.assertEquals(len(posts), 0)
+        self.assertEquals(len(posts), 1)
+
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)
 
     def test_multiple_existing_entries_with_identifier(self):
-        timezone.now()
 
         rule = CollectionRuleFactory()
         PostFactory.create_batch(
@@ -80,4 +135,56 @@
         posts = list(posts_gen)
 
         self.assertEquals(len(posts), 1)
-        self.assertEquals(posts[0].title, new_post.title)
+
+        self.assertEquals(
+            Post.objects.filter(
+                remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+            ).count(),
+            1,
+        )
+
+        post = posts[0]
+
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.read, False)
+
+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_outside_time_slot(self):
+        publication_date = timezone.now()
+
+        rule = CollectionRuleFactory()
+        existing_post = PostFactory.create(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date,
+            remote_identifier="jabbadabadoe",
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date + timedelta(minutes=12),
+            remote_identifier=None,
+            rule=rule,
+        )
+
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
+
+        self.assertEquals(len(posts), 1)
+
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(post.pk, None)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)
diff --git a/src/newsreader/news/collection/utils.py b/src/newsreader/news/collection/utils.py
index 0aa096f..fd6ab0a 100644
--- a/src/newsreader/news/collection/utils.py
+++ b/src/newsreader/news/collection/utils.py
@@ -2,6 +2,7 @@ from datetime import datetime
 
 from django.utils import timezone
 
+import pytz
 import requests
 from requests.exceptions import RequestException
 
@@ -15,7 +16,8 @@ def build_publication_date(dt, tz):
         published_parsed = timezone.make_aware(naive_datetime, timezone=tz)
     except (TypeError, ValueError):
         return None, False
-    return published_parsed, True
+
+    return published_parsed.astimezone(pytz.utc), True
 
 
 def fetch(url):