0.2.3 #99
3 changed files with 177 additions and 33 deletions
@@ -1,6 +1,7 @@
 import logging
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import timedelta
 
 from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
 from django.db.models.fields import CharField, TextField
@@ -184,6 +185,9 @@ class FeedCollector(Collector):
 
 
 class FeedDuplicateHandler:
+    duplicate_fields = ("url", "title", "body", "rule")
+    time_slot_minutes = 10
+
     def __init__(self, rule):
         self.queryset = rule.posts.all()
 
@@ -199,39 +203,44 @@ class FeedDuplicateHandler:
     def check(self, instances):
         for instance in instances:
             if instance.remote_identifier in self.existing_identifiers:
-                existing_post = self.handle_duplicate(instance)
+                existing_post = self.handle_duplicate_identifier(instance)
 
                 yield existing_post
 
                 continue
-            elif not instance.remote_identifier and self.in_database(instance):
-                continue
+            elif self.in_database(instance):
+                existing_post = self.get_duplicate_in_database(instance)
+
+                if self.in_time_slot(instance, existing_post):
+                    yield self.update_existing_post(instance, existing_post)
+                    continue
 
             yield instance
 
     def in_database(self, post):
-        values = {
-            "url": post.url,
-            "title": post.title,
-            "body": post.body,
-            "publication_date": post.publication_date,
-        }
-
-        for existing_post in self.queryset.order_by("-publication_date")[:500]:
+        values = {field: getattr(post, field, None) for field in self.duplicate_fields}
+
+        for existing_post in self.queryset.filter(**values):
             if self.is_duplicate(existing_post, values):
                 return True
 
+    def in_time_slot(self, instance, existing_post):
+        time_delta_slot = timedelta(minutes=self.time_slot_minutes)
+
+        time_difference = instance.publication_date - existing_post.publication_date
+
+        if time_difference <= time_delta_slot:
+            return True
+
     def is_duplicate(self, existing_post, values):
-        for key, value in values.items():
-            existing_value = getattr(existing_post, key, None)
-            if existing_value != value:
-                return False
-
-        return True
-
-    def handle_duplicate(self, instance):
+        return all(
+            getattr(existing_post, field, None) == value
+            for field, value in values.items()
+        )
+
+    def handle_duplicate_identifier(self, instance):
         try:
-            existing_instance = self.queryset.get(
+            existing_post = self.queryset.get(
                 remote_identifier=instance.remote_identifier
             )
         except ObjectDoesNotExist:
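Note on the new time-slot logic above: the comparison in in_time_slot is signed, so a candidate dated before the existing post always falls inside the window; only a candidate more than time_slot_minutes newer than the stored post is treated as fresh. A minimal standalone sketch of that behaviour (plain datetimes, no Django; the names here are illustrative, not part of the diff):

from datetime import datetime, timedelta

TIME_SLOT_MINUTES = 10  # mirrors FeedDuplicateHandler.time_slot_minutes

def in_time_slot(new_date, existing_date, slot_minutes=TIME_SLOT_MINUTES):
    # Signed difference: negative values (candidate older than the
    # existing post) always satisfy the <= comparison.
    return new_date - existing_date <= timedelta(minutes=slot_minutes)

existing = datetime(2019, 10, 30, 12, 30)
print(in_time_slot(existing + timedelta(minutes=9), existing))   # True: merged into the existing post
print(in_time_slot(existing + timedelta(minutes=12), existing))  # False: yielded as a new post
print(in_time_slot(existing - timedelta(minutes=30), existing))  # True: negative difference also passes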
@@ -240,17 +249,43 @@ class FeedDuplicateHandler:
             )
             return instance
         except MultipleObjectsReturned:
-            existing_instances = self.queryset.filter(
+            existing_posts = self.queryset.filter(
                 remote_identifier=instance.remote_identifier
             ).order_by("-publication_date")
-            existing_instance = existing_instances.last()
-            existing_instances.exclude(pk=existing_instance.pk).delete()
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()
 
+        updated_post = self.update_existing_post(instance, existing_post)
+
+        return updated_post
+
+    def get_duplicate_in_database(self, instance):
+        query_values = {
+            field: getattr(instance, field, None) for field in self.duplicate_fields
+        }
+
+        try:
+            existing_post = self.queryset.get(**query_values)
+        except ObjectDoesNotExist:
+            logger.error(
+                f"Duplicate handler tried retrieving post {instance.remote_identifier} but failed doing so."
+            )
+            return instance
+        except MultipleObjectsReturned:
+            existing_posts = self.queryset.filter(**query_values).order_by(
+                "-publication_date"
+            )
+            existing_post = existing_posts.last()
+            existing_posts.exclude(pk=existing_post.pk).delete()
+
+        return existing_post
+
+    def update_existing_post(self, instance, existing_post):
         for field in instance._meta.get_fields():
-            getattr(existing_instance, field.name, object())
+            getattr(existing_post, field.name, object())
             new_value = getattr(instance, field.name, object())
 
             if new_value and field.name != "id":
-                setattr(existing_instance, field.name, new_value)
+                setattr(existing_post, field.name, new_value)
 
-        return existing_instance
+        return existing_post
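The duplicate match itself is driven entirely by the duplicate_fields tuple: in_database builds a field-to-value dict from the candidate, narrows the queryset with filter(**values), and is_duplicate re-checks every field with all(). A plain-Python sketch of that matching, using an illustrative stand-in for the Post model rather than the real one:

DUPLICATE_FIELDS = ("url", "title", "body", "rule")  # mirrors FeedDuplicateHandler.duplicate_fields

class Candidate:
    # Illustrative stand-in for a Post instance.
    def __init__(self, url, title, body, rule):
        self.url, self.title, self.body, self.rule = url, title, body, rule

def is_duplicate(existing, values):
    return all(getattr(existing, field, None) == value for field, value in values.items())

new = Candidate("https://bbc.com", "New post", "Body", "rule-1")
old = Candidate("https://bbc.com", "New post", "Body", "rule-1")

values = {field: getattr(new, field, None) for field in DUPLICATE_FIELDS}
print(is_duplicate(old, values))  # True: all four fields match, so new is a duplicate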
@@ -1,8 +1,13 @@
+from datetime import timedelta
+
 from django.test import TestCase
 from django.utils import timezone
 
+from freezegun import freeze_time
+
 from newsreader.news.collection.feed import FeedDuplicateHandler
 from newsreader.news.collection.tests.factories import CollectionRuleFactory
+from newsreader.news.core.models import Post
 from newsreader.news.core.tests.factories import PostFactory
 
 
@@ -25,16 +30,57 @@ class FeedDuplicateHandlerTestCase(TestCase):
             posts_gen = duplicate_handler.check([new_post])
             posts = list(posts_gen)
 
+        self.assertEquals(len(posts), 1)
+
         post = posts[0]
+        existing_post.refresh_from_db()
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.read, False)
+
+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_with_different_remote_identifiers(self):
+        rule = CollectionRuleFactory()
+        publication_date = timezone.now()
+
+        existing_post = PostFactory.create(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=publication_date,
+            rule=rule,
+        )
+
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
 
         self.assertEquals(len(posts), 1)
 
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
         self.assertEquals(post.publication_date, new_post.publication_date)
-        self.assertTrue(post.publication_date != existing_post.publication_date)
-        self.assertTrue(post.title != existing_post.title)
+        self.assertEquals(post.read, False)
 
     def test_duplicate_entries_in_recent_database(self):
-        PostFactory.create_batch(size=10)
-
         publication_date = timezone.now()
 
         rule = CollectionRuleFactory()
@@ -43,7 +89,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
-            remote_identifier=None,
+            remote_identifier="jabbadabadoe",
             rule=rule,
         )
         new_post = PostFactory.build(
@@ -59,10 +105,19 @@ class FeedDuplicateHandlerTestCase(TestCase):
             posts_gen = duplicate_handler.check([new_post])
             posts = list(posts_gen)
 
-        self.assertEquals(len(posts), 0)
+        self.assertEquals(len(posts), 1)
 
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(existing_post.pk, post.pk)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)
+
     def test_multiple_existing_entries_with_identifier(self):
-        timezone.now()
         rule = CollectionRuleFactory()
 
         PostFactory.create_batch(
@@ -80,4 +135,56 @@ class FeedDuplicateHandlerTestCase(TestCase):
         posts = list(posts_gen)
 
         self.assertEquals(len(posts), 1)
-        self.assertEquals(posts[0].title, new_post.title)
+        self.assertEquals(
+            Post.objects.filter(
+                remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+            ).count(),
+            1,
+        )
+
+        post = posts[0]
+
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.read, False)
+
+    @freeze_time("2019-10-30 12:30:00")
+    def test_duplicate_entries_outside_time_slot(self):
+        publication_date = timezone.now()
+
+        rule = CollectionRuleFactory()
+        existing_post = PostFactory.create(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date,
+            remote_identifier="jabbadabadoe",
+            rule=rule,
+        )
+        new_post = PostFactory.build(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=publication_date + timedelta(minutes=12),
+            remote_identifier=None,
+            rule=rule,
+        )
+
+        with FeedDuplicateHandler(rule) as duplicate_handler:
+            posts_gen = duplicate_handler.check([new_post])
+            posts = list(posts_gen)
+
+        self.assertEquals(len(posts), 1)
+
+        existing_post.refresh_from_db()
+        post = posts[0]
+
+        self.assertEquals(post.pk, None)
+        self.assertEquals(post.title, new_post.title)
+        self.assertEquals(post.body, new_post.body)
+        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(post.read, False)
@@ -2,6 +2,7 @@ from datetime import datetime
 
 from django.utils import timezone
 
+import pytz
 import requests
 
 from requests.exceptions import RequestException
@@ -15,7 +16,8 @@ def build_publication_date(dt, tz):
         published_parsed = timezone.make_aware(naive_datetime, timezone=tz)
     except (TypeError, ValueError):
         return None, False
-    return published_parsed, True
+
+    return published_parsed.astimezone(pytz.utc), True
 
 
 def fetch(url):
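The build_publication_date change makes the helper return UTC regardless of the feed's timezone: make_aware attaches the source timezone and astimezone(pytz.utc) then converts, so all stored publication dates compare in one zone. A small sketch of the conversion (the timezone name is illustrative; localize is roughly what Django's make_aware does for a pytz timezone):

from datetime import datetime

import pytz

naive = datetime(2019, 10, 30, 12, 30)
amsterdam = pytz.timezone("Europe/Amsterdam")

aware = amsterdam.localize(naive)   # roughly timezone.make_aware(naive, timezone=tz)
print(aware.astimezone(pytz.utc))   # 2019-10-30 11:30:00+00:00 (CET is UTC+1 on that date)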