diff --git a/src/newsreader/accounts/migrations/0009_auto_20200524_1218.py b/src/newsreader/accounts/migrations/0009_auto_20200524_1218.py new file mode 100644 index 0000000..3b01b0f --- /dev/null +++ b/src/newsreader/accounts/migrations/0009_auto_20200524_1218.py @@ -0,0 +1,28 @@ +# Generated by Django 3.0.5 on 2020-05-24 10:18 + +import django.db.models.deletion + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("django_celery_beat", "0012_periodictask_expire_seconds"), + ("accounts", "0008_auto_20200422_2243"), + ] + + operations = [ + migrations.AlterField( + model_name="user", + name="task", + field=models.OneToOneField( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.CASCADE, + to="django_celery_beat.PeriodicTask", + verbose_name="collection task", + ), + ) + ] diff --git a/src/newsreader/accounts/models.py b/src/newsreader/accounts/models.py index 0b2799f..18eba07 100644 --- a/src/newsreader/accounts/models.py +++ b/src/newsreader/accounts/models.py @@ -43,7 +43,7 @@ class User(AbstractUser): task = models.OneToOneField( PeriodicTask, - on_delete=models.SET_NULL, + on_delete=models.CASCADE, null=True, blank=True, editable=False, diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py index b14f375..07090ce 100644 --- a/src/newsreader/news/collection/feed.py +++ b/src/newsreader/news/collection/feed.py @@ -47,13 +47,9 @@ class FeedBuilder(Builder): def create_posts(self, stream): data, stream = stream - entries = [] with FeedDuplicateHandler(stream.rule) as duplicate_handler: - try: - entries = data["entries"] - except KeyError: - pass + entries = data.get("entries", []) instances = self.build(entries, stream.rule) posts = duplicate_handler.check(instances) @@ -82,11 +78,9 @@ class FeedBuilder(Builder): value = self.truncate_text(model_field, entry[field]) if field == "published_parsed": - aware_datetime, created = build_publication_date(value, tz) - data[model_field] = aware_datetime if created else None + data[model_field] = build_publication_date(value, tz) elif field == "summary": - summary = self.sanitize_fragment(value) - data[model_field] = summary + data[model_field] = self.sanitize_fragment(value) else: data[model_field] = value @@ -201,7 +195,9 @@ class FeedDuplicateHandler: pass def check(self, instances): - for instance in instances: + deduplicated_instances = self.deduplicate_instances(instances) + + for instance in deduplicated_instances: if instance.remote_identifier in self.existing_identifiers: existing_post = self.handle_duplicate_identifier(instance) @@ -232,6 +228,34 @@ class FeedDuplicateHandler: if time_difference <= time_delta_slot: return True + def deduplicate_instances(self, instances): + deduplicated_instances = [] + + for instance in instances: + values = { + field: getattr(instance, field, None) for field in self.duplicate_fields + } + duplicate = False + + for deduplicated_instance in deduplicated_instances: + deduplicated_identifier = deduplicated_instance.remote_identifier + instance_identifier = instance.remote_identifier + has_identifiers = deduplicated_identifier and instance_identifier + + if self.is_duplicate(deduplicated_instance, values): + duplicate = True + break + elif has_identifiers and deduplicated_identifier == instance_identifier: + duplicate = True + break + + if duplicate: + continue + + deduplicated_instances.append(instance) + + return deduplicated_instances + def is_duplicate(self, existing_post, values): return all( getattr(existing_post, field, None) == value diff --git a/src/newsreader/news/collection/tests/feed/collector/tests.py b/src/newsreader/news/collection/tests/feed/collector/tests.py index 88f2875..0506783 100644 --- a/src/newsreader/news/collection/tests/feed/collector/tests.py +++ b/src/newsreader/news/collection/tests/feed/collector/tests.py @@ -139,7 +139,7 @@ class FeedCollectorTestCase(TestCase): self.mocked_parse.return_value = duplicate_mock rule = CollectionRuleFactory() - aware_datetime, _ = build_publication_date( + aware_datetime = build_publication_date( struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), pytz.utc ) @@ -152,7 +152,7 @@ class FeedCollectorTestCase(TestCase): rule=rule, ) - aware_datetime, _ = build_publication_date( + aware_datetime = build_publication_date( struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0)), pytz.utc ) @@ -165,7 +165,7 @@ class FeedCollectorTestCase(TestCase): rule=rule, ) - aware_datetime, _ = build_publication_date( + aware_datetime = build_publication_date( struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)), pytz.utc ) diff --git a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py index 005771a..6ed8a59 100644 --- a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py +++ b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py @@ -89,7 +89,7 @@ class FeedDuplicateHandlerTestCase(TestCase): title="Birmingham head teacher threatened over LGBT lessons", body="Google's move to end business ties with Huawei will affect current devices", publication_date=publication_date, - remote_identifier="jabbadabadoe", + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, ) new_post = PostFactory.build( @@ -161,7 +161,7 @@ class FeedDuplicateHandlerTestCase(TestCase): title="Birmingham head teacher threatened over LGBT lessons", body="Google's move to end business ties with Huawei will affect current devices", publication_date=publication_date, - remote_identifier="jabbadabadoe", + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, ) new_post = PostFactory.build( @@ -188,3 +188,43 @@ class FeedDuplicateHandlerTestCase(TestCase): self.assertEquals(post.rule, new_post.rule) self.assertEquals(post.publication_date, new_post.publication_date) self.assertEquals(post.read, False) + + def test_duplicate_entries_in_collected_entries(self): + rule = CollectionRuleFactory() + post_1 = PostFactory.build( + title="title got updated", body="body", url="https://bbc.com", rule=rule + ) + duplicate_post_1 = PostFactory.build( + title="title got updated", body="body", url="https://bbc.com", rule=rule + ) + + post_2 = PostFactory.build( + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7" + ) + duplicate_post_2 = PostFactory.build( + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7" + ) + + collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2) + + with FeedDuplicateHandler(rule) as duplicate_handler: + posts_gen = duplicate_handler.check(collected_posts) + posts = list(posts_gen) + + self.assertEquals(len(posts), 2) + + post = posts[0] + + self.assertEquals(post_1.publication_date, post.publication_date) + self.assertEquals(post_1.title, post.title) + self.assertEquals(post_1.body, post.body) + self.assertEquals(post_1.rule, post.rule) + self.assertEquals(post.read, False) + + post = posts[1] + + self.assertEquals(post_2.publication_date, post.publication_date) + self.assertEquals(post_2.title, post.title) + self.assertEquals(post_2.body, post.body) + self.assertEquals(post_2.rule, post.rule) + self.assertEquals(post.read, False) diff --git a/src/newsreader/news/collection/utils.py b/src/newsreader/news/collection/utils.py index fd6ab0a..9a2e456 100644 --- a/src/newsreader/news/collection/utils.py +++ b/src/newsreader/news/collection/utils.py @@ -15,9 +15,9 @@ def build_publication_date(dt, tz): naive_datetime = datetime(*dt[:6]) published_parsed = timezone.make_aware(naive_datetime, timezone=tz) except (TypeError, ValueError): - return None, False + return timezone.now() - return published_parsed.astimezone(pytz.utc), True + return published_parsed.astimezone(pytz.utc) def fetch(url): diff --git a/src/newsreader/news/core/migrations/0006_auto_20200524_1218.py b/src/newsreader/news/core/migrations/0006_auto_20200524_1218.py new file mode 100644 index 0000000..f90b205 --- /dev/null +++ b/src/newsreader/news/core/migrations/0006_auto_20200524_1218.py @@ -0,0 +1,18 @@ +# Generated by Django 3.0.5 on 2020-05-24 10:18 + +import django.utils.timezone + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [("core", "0005_auto_20200412_1955")] + + operations = [ + migrations.AlterField( + model_name="post", + name="publication_date", + field=models.DateTimeField(default=django.utils.timezone.now), + ) + ] diff --git a/src/newsreader/news/core/models.py b/src/newsreader/news/core/models.py index 64028d2..28bf3fd 100644 --- a/src/newsreader/news/core/models.py +++ b/src/newsreader/news/core/models.py @@ -1,4 +1,5 @@ from django.db import models +from django.utils import timezone from django.utils.translation import gettext as _ from newsreader.core.models import TimeStampedModel @@ -9,7 +10,7 @@ class Post(TimeStampedModel): title = models.CharField(max_length=200, blank=True, null=True) body = models.TextField(blank=True, null=True) author = models.CharField(max_length=40, blank=True, null=True) - publication_date = models.DateTimeField(blank=True, null=True) + publication_date = models.DateTimeField(default=timezone.now) url = models.URLField(max_length=1024, blank=True, null=True) read = models.BooleanField(default=False) @@ -18,7 +19,7 @@ class Post(TimeStampedModel): CollectionRule, on_delete=models.CASCADE, editable=False, related_name="posts" ) remote_identifier = models.CharField( - max_length=500, blank=True, null=True, editable=False + max_length=500, editable=False, blank=True, null=True ) def __str__(self):