diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py
index 888eb15..07090ce 100644
--- a/src/newsreader/news/collection/feed.py
+++ b/src/newsreader/news/collection/feed.py
@@ -195,7 +195,9 @@ class FeedDuplicateHandler:
         pass
 
     def check(self, instances):
-        for instance in instances:
+        deduplicated_instances = self.deduplicate_instances(instances)
+
+        for instance in deduplicated_instances:
             if instance.remote_identifier in self.existing_identifiers:
                 existing_post = self.handle_duplicate_identifier(instance)
 
@@ -226,6 +228,42 @@ class FeedDuplicateHandler:
         if time_difference <= time_delta_slot:
             return True
 
+    def deduplicate_instances(self, instances):
+        """
+        Drop entries that duplicate an earlier entry within the same batch.
+
+        An entry is skipped when an entry kept before it either shares its
+        (non-empty) remote identifier or is reported as a match by
+        ``is_duplicate`` for the ``self.duplicate_fields`` values. The first
+        occurrence wins and the original order is preserved.
+        """
+        deduplicated_instances = []
+        seen_identifiers = set()
+
+        for instance in instances:
+            identifier = instance.remote_identifier
+
+            # Entries without an identifier can only match on their fields.
+            if identifier and identifier in seen_identifiers:
+                continue
+
+            values = {
+                field: getattr(instance, field, None) for field in self.duplicate_fields
+            }
+
+            if any(
+                self.is_duplicate(kept_instance, values)
+                for kept_instance in deduplicated_instances
+            ):
+                continue
+
+            if identifier:
+                seen_identifiers.add(identifier)
+
+            deduplicated_instances.append(instance)
+
+        return deduplicated_instances
+
     def is_duplicate(self, existing_post, values):
         return all(
             getattr(existing_post, field, None) == value
diff --git a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py
index 005771a..6ed8a59 100644
--- a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py
+++ b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py
@@ -89,7 +89,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
-            remote_identifier="jabbadabadoe",
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
             rule=rule,
         )
         new_post = PostFactory.build(
@@ -161,7 +161,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
-            remote_identifier="jabbadabadoe",
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
             rule=rule,
         )
         new_post = PostFactory.build(
@@ -188,3 +188,43 @@ class FeedDuplicateHandlerTestCase(TestCase):
         self.assertEquals(post.rule, new_post.rule)
         self.assertEquals(post.publication_date, new_post.publication_date)
         self.assertEquals(post.read, False)
+
+    def test_duplicate_entries_in_collected_entries(self):
+        rule = CollectionRuleFactory()
+        post_1 = PostFactory.build(
+            title="title got updated", body="body", url="https://bbc.com", rule=rule
+        )
+        duplicate_post_1 = PostFactory.build(
+            title="title got updated", body="body", url="https://bbc.com", rule=rule
+        )
+
+        post_2 = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+        )
+        duplicate_post_2 = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+        )
+
+        collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2)
+
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check(collected_posts)
+            posts = list(posts_gen)
+
+        self.assertEqual(len(posts), 2)
+
+        post = posts[0]
+
+        self.assertEqual(post_1.publication_date, post.publication_date)
+        self.assertEqual(post_1.title, post.title)
+        self.assertEqual(post_1.body, post.body)
+        self.assertEqual(post_1.rule, post.rule)
+        self.assertEqual(post.read, False)
+
+        post = posts[1]
+
+        self.assertEqual(post_2.publication_date, post.publication_date)
+        self.assertEqual(post_2.title, post.title)
+        self.assertEqual(post_2.body, post.body)
+        self.assertEqual(post_2.rule, post.rule)
+        self.assertEqual(post.read, False)