0.2.3 #99
3 changed files with 177 additions and 33 deletions
@@ -1,6 +1,7 @@
+import logging
+
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import timedelta

 from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
 from django.db.models.fields import CharField, TextField
@@ -184,6 +185,9 @@ class FeedCollector(Collector):


 class FeedDuplicateHandler:
+    duplicate_fields = ("url", "title", "body", "rule")
+    time_slot_minutes = 10
+
     def __init__(self, rule):
         self.queryset = rule.posts.all()
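Reviewer note: these two new class attributes drive the fuzzy matching below. Two posts that agree on every field in duplicate_fields count as the same post when their publication dates fall within time_slot_minutes of each other. A minimal illustration of that window (not part of the diff; the datetimes are made up):

# Hypothetical datetimes illustrating the ten-minute slot.
from datetime import datetime, timedelta

slot = timedelta(minutes=10)
first = datetime(2019, 10, 30, 12, 30)

assert datetime(2019, 10, 30, 12, 39) - first <= slot  # within the slot: merged
assert datetime(2019, 10, 30, 12, 43) - first > slot   # outside it: kept as a new post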
@@ -199,39 +203,44 @@ class FeedDuplicateHandler:
     def check(self, instances):
         for instance in instances:
             if instance.remote_identifier in self.existing_identifiers:
-                existing_post = self.handle_duplicate(instance)
+                existing_post = self.handle_duplicate_identifier(instance)

                 yield existing_post

                 continue
-            elif not instance.remote_identifier and self.in_database(instance):
-                continue
+            elif self.in_database(instance):
+                existing_post = self.get_duplicate_in_database(instance)
+
+                if self.in_time_slot(instance, existing_post):
+                    yield self.update_existing_post(instance, existing_post)
+                    continue

             yield instance

     def in_database(self, post):
-        values = {
-            "url": post.url,
-            "title": post.title,
-            "body": post.body,
-            "publication_date": post.publication_date,
-        }
+        values = {field: getattr(post, field, None) for field in self.duplicate_fields}

-        for existing_post in self.queryset.order_by("-publication_date")[:500]:
+        for existing_post in self.queryset.filter(**values):
             if self.is_duplicate(existing_post, values):
                 return True

-    def is_duplicate(self, existing_post, values):
-        for key, value in values.items():
-            existing_value = getattr(existing_post, key, None)
-            if existing_value != value:
-                return False
+    def in_time_slot(self, instance, existing_post):
+        time_delta_slot = timedelta(minutes=self.time_slot_minutes)
+
+        time_difference = instance.publication_date - existing_post.publication_date
+
+        if time_difference <= time_delta_slot:
+            return True
+
+    def is_duplicate(self, existing_post, values):
+        return all(
+            getattr(existing_post, field, None) == value
+            for field, value in values.items()
+        )

-    def handle_duplicate(self, instance):
+    def handle_duplicate_identifier(self, instance):
         try:
-            existing_instance = self.queryset.get(
+            existing_post = self.queryset.get(
                 remote_identifier=instance.remote_identifier
             )
         except ObjectDoesNotExist:
@@ -240,17 +249,43 @@ class FeedDuplicateHandler:
             )
             return instance
         except MultipleObjectsReturned:
-            existing_instances = self.queryset.filter(
+            existing_posts = self.queryset.filter(
                 remote_identifier=instance.remote_identifier
             ).order_by("-publication_date")
-            existing_instance = existing_instances.last()
-            existing_instances.exclude(pk=existing_instance.pk).delete()
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()

+        updated_post = self.update_existing_post(instance, existing_post)
+
+        return updated_post
+
+    def get_duplicate_in_database(self, instance):
+        query_values = {
+            field: getattr(instance, field, None) for field in self.duplicate_fields
+        }
+
+        try:
+            existing_post = self.queryset.get(**query_values)
+        except ObjectDoesNotExist:
+            logger.error(
+                f"Duplicate handler tried retrieving post {instance.remote_identifier} but failed doing so."
+            )
+            return instance
+        except MultipleObjectsReturned:
+            existing_posts = self.queryset.filter(**query_values).order_by(
+                "-publication_date"
+            )
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()
+
+        return existing_post

     def update_existing_post(self, instance, existing_post):
         for field in instance._meta.get_fields():
-            getattr(existing_instance, field.name, object())
+            getattr(existing_post, field.name, object())
             new_value = getattr(instance, field.name, object())

             if new_value and field.name != "id":
-                setattr(existing_instance, field.name, new_value)
+                setattr(existing_post, field.name, new_value)

-        return existing_instance
+        return existing_post
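Reviewer note: the tests below enter the handler as a context manager (with FeedDuplicateHandler(rule) as duplicate_handler:), and check() reads self.existing_identifiers, so the class presumably implements __enter__/__exit__ and loads the known identifiers there; that code sits outside these hunks. A minimal usage sketch under that assumption, with hypothetical names:

# Sketch only: parsed_posts stands in for unsaved Post instances built from a feed.
with FeedDuplicateHandler(rule) as handler:
    for post in handler.check(parsed_posts):
        # check() yields either an updated existing post or the new instance.
        post.save()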
@@ -1,8 +1,13 @@
+from datetime import timedelta
+
 from django.test import TestCase
 from django.utils import timezone

+from freezegun import freeze_time
+
 from newsreader.news.collection.feed import FeedDuplicateHandler
 from newsreader.news.collection.tests.factories import CollectionRuleFactory
+from newsreader.news.core.models import Post
 from newsreader.news.core.tests.factories import PostFactory
@@ -25,16 +30,57 @@ class FeedDuplicateHandlerTestCase(TestCase):
             posts_gen = duplicate_handler.check([new_post])
             posts = list(posts_gen)

         self.assertEquals(len(posts), 1)

         post = posts[0]
         existing_post.refresh_from_db()

         self.assertEquals(existing_post.pk, post.pk)
         self.assertEquals(post.publication_date, new_post.publication_date)
         self.assertEquals(post.title, new_post.title)
         self.assertEquals(post.body, new_post.body)
         self.assertEquals(post.rule, new_post.rule)
         self.assertEquals(post.read, False)

+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_with_different_remote_identifiers(self):
+        rule = CollectionRuleFactory()
+        publication_date = timezone.now()
+
+        existing_post = PostFactory.create(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
+
+        self.assertEquals(len(posts), 1)
+
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertTrue(post.publication_date != existing_post.publication_date)
+        self.assertTrue(post.title != existing_post.title)
+        self.assertEquals(post.read, False)

     def test_duplicate_entries_in_recent_database(self):
         PostFactory.create_batch(size=10)

         publication_date = timezone.now()

         rule = CollectionRuleFactory()
@@ -43,7 +89,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
-            remote_identifier=None,
+            remote_identifier="jabbadabadoe",
             rule=rule,
         )
         new_post = PostFactory.build(
@@ -59,10 +105,19 @@ class FeedDuplicateHandlerTestCase(TestCase):
             posts_gen = duplicate_handler.check([new_post])
             posts = list(posts_gen)

-        self.assertEquals(len(posts), 0)
+        self.assertEquals(len(posts), 1)
+
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)

     def test_multiple_existing_entries_with_identifier(self):
         timezone.now()
         rule = CollectionRuleFactory()

         PostFactory.create_batch(
@@ -80,4 +135,56 @@ class FeedDuplicateHandlerTestCase(TestCase):
         posts = list(posts_gen)

         self.assertEquals(len(posts), 1)
-        self.assertEquals(posts[0].title, new_post.title)
+
+        self.assertEquals(
+            Post.objects.filter(
+                remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+            ).count(),
+            1,
+        )
+
+        post = posts[0]
+
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.read, False)
+
+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_outside_time_slot(self):
+        publication_date = timezone.now()
+
+        rule = CollectionRuleFactory()
+        existing_post = PostFactory.create(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date,
+            remote_identifier="jabbadabadoe",
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date + timedelta(minutes=12),
+            remote_identifier=None,
+            rule=rule,
+        )
+
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
+
+        self.assertEquals(len(posts), 1)
+
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(post.pk, None)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)
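Read together, test_duplicate_entries_in_recent_database and test_duplicate_entries_outside_time_slot pin down the new contract: inside the ten-minute window the handler merges into the existing row (post.pk == existing_post.pk), outside it the new instance passes through unsaved (post.pk is None). A sketch restating that, with hypothetical inputs:

# Hypothetical posts matching on url/title/body/rule and differing only in publication_date.
with FeedDuplicateHandler(rule) as handler:
    (merged,) = handler.check([post_nine_minutes_later])    # merged.pk == existing_post.pk
    (passed,) = handler.check([post_twelve_minutes_later])  # passed.pk is None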
@@ -2,6 +2,7 @@ from datetime import datetime

 from django.utils import timezone

+import pytz
 import requests

 from requests.exceptions import RequestException
@@ -15,7 +16,8 @@ def build_publication_date(dt, tz):
         published_parsed = timezone.make_aware(naive_datetime, timezone=tz)
     except (TypeError, ValueError):
         return None, False
-    return published_parsed, True
+
+    return published_parsed.astimezone(pytz.utc), True


 def fetch(url):
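For context: after this change build_publication_date normalises its result to UTC rather than returning the feed's local time. A hedged usage sketch, assuming dt is something the unseen earlier lines turn into a naive datetime (shown here as if it already were one) and tz is a pytz timezone:

# Illustrative call; the example timezone is an assumption.
from datetime import datetime

import pytz

published, ok = build_publication_date(datetime(2019, 10, 30, 12, 30), pytz.timezone("Europe/Amsterdam"))
if ok:
    assert published.tzinfo == pytz.utc  # always UTC after this change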