Check the collected entries for duplicates
This commit is contained in:
parent
c3f831e14b
commit
24b145a342
2 changed files with 73 additions and 3 deletions
|
|
@ -195,7 +195,9 @@ class FeedDuplicateHandler:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def check(self, instances):
|
def check(self, instances):
|
||||||
for instance in instances:
|
deduplicated_instances = self.deduplicate_instances(instances)
|
||||||
|
|
||||||
|
for instance in deduplicated_instances:
|
||||||
if instance.remote_identifier in self.existing_identifiers:
|
if instance.remote_identifier in self.existing_identifiers:
|
||||||
existing_post = self.handle_duplicate_identifier(instance)
|
existing_post = self.handle_duplicate_identifier(instance)
|
||||||
|
|
||||||
|
|
@ -226,6 +228,34 @@ class FeedDuplicateHandler:
|
||||||
if time_difference <= time_delta_slot:
|
if time_difference <= time_delta_slot:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def deduplicate_instances(self, instances):
    """Return ``instances`` with duplicates inside the batch removed.

    An instance is dropped when an instance kept earlier either:

    * matches it on every field in ``self.duplicate_fields`` (as decided
      by ``self.is_duplicate``), or
    * carries the same non-empty ``remote_identifier``.

    The first occurrence wins and the input order is preserved.

    :param instances: iterable of collected instances to filter.
    :return: list with the unique instances, in their original order.
    """
    deduplicated_instances = []

    for instance in instances:
        values = {
            field: getattr(instance, field, None) for field in self.duplicate_fields
        }
        # Invariant for all comparisons below, so read it once per instance
        # instead of once per already-kept instance.
        instance_identifier = getattr(instance, "remote_identifier", None)

        duplicate = any(
            self.is_duplicate(kept, values)
            or (
                # An empty/missing identifier never counts as a match;
                # otherwise every identifier-less entry would collapse
                # into the first one.
                instance_identifier
                and getattr(kept, "remote_identifier", None) == instance_identifier
            )
            for kept in deduplicated_instances
        )

        if not duplicate:
            deduplicated_instances.append(instance)

    return deduplicated_instances
|
||||||
|
|
||||||
def is_duplicate(self, existing_post, values):
|
def is_duplicate(self, existing_post, values):
|
||||||
return all(
|
return all(
|
||||||
getattr(existing_post, field, None) == value
|
getattr(existing_post, field, None) == value
|
||||||
|
|
|
||||||
|
|
@ -89,7 +89,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
||||||
title="Birmingham head teacher threatened over LGBT lessons",
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
body="Google's move to end business ties with Huawei will affect current devices",
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
publication_date=publication_date,
|
publication_date=publication_date,
|
||||||
remote_identifier="jabbadabadoe",
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
new_post = PostFactory.build(
|
new_post = PostFactory.build(
|
||||||
|
|
@ -161,7 +161,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
||||||
title="Birmingham head teacher threatened over LGBT lessons",
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
body="Google's move to end business ties with Huawei will affect current devices",
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
publication_date=publication_date,
|
publication_date=publication_date,
|
||||||
remote_identifier="jabbadabadoe",
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
new_post = PostFactory.build(
|
new_post = PostFactory.build(
|
||||||
|
|
@ -188,3 +188,43 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
||||||
self.assertEquals(post.rule, new_post.rule)
|
self.assertEquals(post.rule, new_post.rule)
|
||||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
self.assertEquals(post.publication_date, new_post.publication_date)
|
||||||
self.assertEquals(post.read, False)
|
self.assertEquals(post.read, False)
|
||||||
|
|
||||||
|
def test_duplicate_entries_in_collected_entries(self):
    """Duplicates inside one collected batch are yielded only once.

    ``post_1`` and ``duplicate_post_1`` are built with the same title,
    body, url and rule; ``post_2`` and ``duplicate_post_2`` are built with
    the same remote identifier. Four posts are fed to the handler and two
    are expected back, in the order in which they were first seen.
    """
    rule = CollectionRuleFactory()
    post_1 = PostFactory.build(
        title="title got updated", body="body", url="https://bbc.com", rule=rule
    )
    duplicate_post_1 = PostFactory.build(
        title="title got updated", body="body", url="https://bbc.com", rule=rule
    )

    post_2 = PostFactory.build(
        remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
    )
    duplicate_post_2 = PostFactory.build(
        remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
    )

    collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2)

    with FeedDuplicateHandler(rule) as duplicate_handler:
        posts_gen = duplicate_handler.check(collected_posts)
        posts = list(posts_gen)

    # ``assertEqual`` is used throughout: the ``assertEquals`` alias was
    # deprecated and no longer exists on Python 3.12+.
    self.assertEqual(len(posts), 2)

    # The surviving posts must be the first occurrences, in input order.
    for expected, post in zip((post_1, post_2), posts):
        self.assertEqual(expected.publication_date, post.publication_date)
        self.assertEqual(expected.title, post.title)
        self.assertEqual(expected.body, post.body)
        self.assertEqual(expected.rule, post.rule)
        self.assertEqual(post.read, False)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue