From 2be35bce53219fb58177ce11b18149b86db525bb Mon Sep 17 00:00:00 2001 From: sonny Date: Thu, 18 Jun 2020 20:29:48 +0200 Subject: [PATCH] 0.2.3.6 - Update logging - Update FeedDuplicateHandler --- gitlab-ci/lint.yml | 4 +- gitlab-ci/test.yml | 4 +- src/newsreader/conf/base.py | 10 +- src/newsreader/news/collection/feed.py | 35 +-- .../collection/tests/feed/builder/tests.py | 197 ++++++++++------- .../tests/feed/duplicate_handler/tests.py | 199 ++++++++++++------ src/newsreader/news/core/models.py | 2 +- 7 files changed, 280 insertions(+), 171 deletions(-) diff --git a/gitlab-ci/lint.yml b/gitlab-ci/lint.yml index 3f1e259..134716f 100644 --- a/gitlab-ci/lint.yml +++ b/gitlab-ci/lint.yml @@ -3,10 +3,10 @@ python-linting: allow_failure: true image: python:3.7.4-slim-stretch before_script: - - pip install poetry + - pip install poetry --quiet - poetry config cache-dir ~/.cache/poetry - poetry config virtualenvs.in-project true - - poetry install --no-interaction + - poetry install --no-interaction --quiet script: - poetry run isort src/ --check-only --recursive - poetry run black src/ --line-length 88 --check diff --git a/gitlab-ci/test.yml b/gitlab-ci/test.yml index 3e8eccb..723a0e8 100644 --- a/gitlab-ci/test.yml +++ b/gitlab-ci/test.yml @@ -6,10 +6,10 @@ python-tests: - memcached:1.5.22 image: python:3.7.4-slim-stretch before_script: - - pip install poetry + - pip install poetry --quiet - poetry config cache-dir .cache/poetry - poetry config virtualenvs.in-project true - - poetry install --no-interaction + - poetry install --no-interaction --quiet script: - poetry run coverage run src/manage.py test newsreader - poetry run coverage report diff --git a/src/newsreader/conf/base.py b/src/newsreader/conf/base.py index c911462..85a62ba 100644 --- a/src/newsreader/conf/base.py +++ b/src/newsreader/conf/base.py @@ -103,7 +103,7 @@ CACHES = { # https://docs.djangoproject.com/en/2.2/topics/logging/#configuring-logging LOGGING = { "version": 1, - "disable_existing_loggers": False, + "disable_existing_loggers": True, "filters": { "require_debug_false": {"()": "django.utils.log.RequireDebugFalse"}, "require_debug_true": {"()": "django.utils.log.RequireDebugTrue"}, @@ -114,7 +114,11 @@ LOGGING = { "format": "[{server_time}] {message}", "style": "{", }, - "syslog": {"class": "logging.Formatter", "format": "{message}", "style": "{"}, + "syslog": { + "class": "logging.Formatter", + "format": "[newsreader] {message}", + "style": "{", + }, }, "handlers": { "console": { @@ -124,6 +128,7 @@ LOGGING = { }, "django.server": { "level": "INFO", + "filters": ["require_debug_true"], "class": "logging.StreamHandler", "formatter": "django.server", }, @@ -157,7 +162,6 @@ LOGGING = { "level": "INFO", "propagate": False, }, - "celery": {"handlers": ["syslog", "console"], "level": "INFO"}, "celery.task": {"handlers": ["syslog", "console"], "level": "INFO"}, }, } diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py index 07090ce..35b0b1e 100644 --- a/src/newsreader/news/collection/feed.py +++ b/src/newsreader/news/collection/feed.py @@ -52,9 +52,7 @@ class FeedBuilder(Builder): entries = data.get("entries", []) instances = self.build(entries, stream.rule) - posts = duplicate_handler.check(instances) - - self.instances = [post for post in posts] + self.instances = duplicate_handler.check(instances) def build(self, entries, rule): field_mapping = { @@ -196,22 +194,27 @@ class FeedDuplicateHandler: def check(self, instances): deduplicated_instances = self.deduplicate_instances(instances) + checked_instances = [] for instance in deduplicated_instances: if instance.remote_identifier in self.existing_identifiers: existing_post = self.handle_duplicate_identifier(instance) - yield existing_post + checked_instances.append(existing_post) continue elif self.in_database(instance): existing_post = self.get_duplicate_in_database(instance) if self.in_time_slot(instance, existing_post): - yield self.update_existing_post(instance, existing_post) + checked_instances.append( + self.update_existing_post(instance, existing_post) + ) continue - yield instance + checked_instances.append(instance) + + return checked_instances def in_database(self, post): values = {field: getattr(post, field, None) for field in self.duplicate_fields} @@ -229,23 +232,29 @@ class FeedDuplicateHandler: return True def deduplicate_instances(self, instances): + sorted_instances = sorted( + instances, key=lambda instance: instance.publication_date, reverse=True + ) deduplicated_instances = [] - for instance in instances: + for instance in sorted_instances: + instance_identifier = instance.remote_identifier + duplicate = False + values = { field: getattr(instance, field, None) for field in self.duplicate_fields } - duplicate = False for deduplicated_instance in deduplicated_instances: deduplicated_identifier = deduplicated_instance.remote_identifier - instance_identifier = instance.remote_identifier has_identifiers = deduplicated_identifier and instance_identifier - if self.is_duplicate(deduplicated_instance, values): - duplicate = True - break - elif has_identifiers and deduplicated_identifier == instance_identifier: + is_same_identifier = ( + has_identifiers and deduplicated_identifier == instance_identifier + ) + is_duplicate = self.is_duplicate(deduplicated_instance, values) + + if is_duplicate or is_same_identifier: duplicate = True break diff --git a/src/newsreader/news/collection/tests/feed/builder/tests.py b/src/newsreader/news/collection/tests/feed/builder/tests.py index be13908..cfafa4f 100644 --- a/src/newsreader/news/collection/tests/feed/builder/tests.py +++ b/src/newsreader/news/collection/tests/feed/builder/tests.py @@ -16,6 +16,7 @@ from newsreader.news.core.tests.factories import PostFactory from .mocks import * +@freeze_time("2019-10-30 12:30:00") class FeedBuilderTestCase(TestCase): def setUp(self): self.maxDiff = None @@ -30,8 +31,10 @@ class FeedBuilderTestCase(TestCase): post = Post.objects.get() - d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37)) - aware_date = pytz.utc.localize(d) + publication_date = datetime.combine( + date(2019, 5, 20), time(hour=16, minute=7, second=37) + ) + aware_date = pytz.utc.localize(publication_date) self.assertEquals(post.publication_date, aware_date) self.assertEquals(Post.objects.count(), 1) @@ -57,49 +60,60 @@ class FeedBuilderTestCase(TestCase): with builder((multiple_mock, mock_stream)) as builder: builder.save() - posts = Post.objects.order_by("id") + posts = Post.objects.order_by("-publication_date") self.assertEquals(Post.objects.count(), 3) - first_post = posts[0] - second_post = posts[1] + post = posts[0] - d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37)) - aware_date = pytz.utc.localize(d) - - self.assertEquals(first_post.publication_date, aware_date) + publication_date = datetime.combine( + date(2019, 5, 20), time(hour=16, minute=32, second=38) + ) + aware_date = pytz.utc.localize(publication_date) self.assertEquals( - first_post.remote_identifier, + post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), + aware_date.strftime("%Y-%m-%d %H:%M:%S"), + ) + + self.assertEquals( + post.remote_identifier, + "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", + ) + + self.assertEquals( + post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080" + ) + + self.assertEquals( + post.title, "Birmingham head teacher threatened over LGBT lessons" + ) + + post = posts[1] + + publication_date = datetime.combine( + date(2019, 5, 20), time(hour=16, minute=7, second=37) + ) + aware_date = pytz.utc.localize(publication_date) + + self.assertEquals( + post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), + aware_date.strftime("%Y-%m-%d %H:%M:%S"), + ) + + self.assertEquals( + post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) self.assertEquals( - first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" + post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) self.assertEquals( - first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" + post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" ) - d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19)) - aware_date = pytz.utc.localize(d) - - self.assertEquals(second_post.publication_date, aware_date) - - self.assertEquals( - second_post.remote_identifier, - "https://www.bbc.co.uk/news/technology-48334739", - ) - - self.assertEquals( - second_post.url, "https://www.bbc.co.uk/news/technology-48334739" - ) - - self.assertEquals( - second_post.title, "Huawei's Android loss: How it affects you" - ) - - def test_entry_without_remote_identifier(self): + def test_entries_without_remote_identifier(self): builder = FeedBuilder rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) @@ -107,27 +121,37 @@ class FeedBuilderTestCase(TestCase): with builder((mock_without_identifier, mock_stream)) as builder: builder.save() - posts = Post.objects.order_by("id") + posts = Post.objects.order_by("-publication_date") self.assertEquals(Post.objects.count(), 2) - first_post = posts[0] + post = posts[0] - d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37)) - aware_date = pytz.utc.localize(d) - - self.assertEquals(first_post.publication_date, aware_date) - - self.assertEquals(first_post.remote_identifier, None) + publication_date = datetime.combine( + date(2019, 5, 20), time(hour=16, minute=7, second=37) + ) + aware_date = pytz.utc.localize(publication_date) + self.assertEquals(post.publication_date, aware_date) + self.assertEquals(post.remote_identifier, None) self.assertEquals( - first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" + post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" + ) + self.assertEquals( + post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" ) - self.assertEquals( - first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" - ) + post = posts[1] + + publication_date = datetime.combine( + date(2019, 5, 20), time(hour=12, minute=19, second=19) + ) + aware_date = pytz.utc.localize(publication_date) + + self.assertEquals(post.publication_date, aware_date) + self.assertEquals(post.remote_identifier, None) + self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739") + self.assertEquals(post.title, "Huawei's Android loss: How it affects you") - @freeze_time("2019-10-30 12:30:00") def test_entry_without_publication_date(self): builder = FeedBuilder rule = CollectionRuleFactory() @@ -136,25 +160,30 @@ class FeedBuilderTestCase(TestCase): with builder((mock_without_publish_date, mock_stream)) as builder: builder.save() - posts = Post.objects.order_by("id") + posts = Post.objects.order_by("-publication_date") self.assertEquals(Post.objects.count(), 2) - first_post = posts[0] - second_post = posts[1] + post = posts[0] - self.assertEquals(first_post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, + post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" + ) + self.assertEquals(post.created, timezone.now()) + self.assertEquals( + post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals(second_post.created, timezone.now()) + post = posts[1] + self.assertEquals( - second_post.remote_identifier, - "https://www.bbc.co.uk/news/technology-48334739", + post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" + ) + self.assertEquals(post.created, timezone.now()) + self.assertEquals( + post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) - @freeze_time("2019-10-30 12:30:00") def test_entry_without_url(self): builder = FeedBuilder rule = CollectionRuleFactory() @@ -163,25 +192,24 @@ class FeedBuilderTestCase(TestCase): with builder((mock_without_url, mock_stream)) as builder: builder.save() - posts = Post.objects.order_by("id") + posts = Post.objects.order_by("-publication_date") self.assertEquals(Post.objects.count(), 2) - first_post = posts[0] - second_post = posts[1] + post = posts[0] - self.assertEquals(first_post.created, timezone.now()) + self.assertEquals(post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, + post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals(second_post.created, timezone.now()) + post = posts[1] + + self.assertEquals(post.created, timezone.now()) self.assertEquals( - second_post.remote_identifier, - "https://www.bbc.co.uk/news/technology-48334739", + post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) - @freeze_time("2019-10-30 12:30:00") def test_entry_without_body(self): builder = FeedBuilder rule = CollectionRuleFactory() @@ -190,25 +218,32 @@ class FeedBuilderTestCase(TestCase): with builder((mock_without_body, mock_stream)) as builder: builder.save() - posts = Post.objects.order_by("id") + posts = Post.objects.order_by("-publication_date") + self.assertEquals(Post.objects.count(), 2) - first_post = posts[0] - second_post = posts[1] + post = posts[0] - self.assertEquals(first_post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, - "https://www.bbc.co.uk/news/world-us-canada-48338168", + post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" ) - - self.assertEquals(second_post.created, timezone.now()) self.assertEquals( - second_post.remote_identifier, + post.remote_identifier, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", ) + self.assertEquals(post.body, "") + + post = posts[1] + + self.assertEquals( + post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" + ) + self.assertEquals( + post.remote_identifier, + "https://www.bbc.co.uk/news/world-us-canada-48338168", + ) + self.assertEquals(post.body, "") - @freeze_time("2019-10-30 12:30:00") def test_entry_without_author(self): builder = FeedBuilder rule = CollectionRuleFactory() @@ -217,23 +252,25 @@ class FeedBuilderTestCase(TestCase): with builder((mock_without_author, mock_stream)) as builder: builder.save() - posts = Post.objects.order_by("id") + posts = Post.objects.order_by("-publication_date") self.assertEquals(Post.objects.count(), 2) - first_post = posts[0] - second_post = posts[1] + post = posts[0] - self.assertEquals(first_post.created, timezone.now()) + self.assertEquals(post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, + post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) + self.assertEquals(post.author, None) - self.assertEquals(second_post.created, timezone.now()) + post = posts[1] + + self.assertEquals(post.created, timezone.now()) self.assertEquals( - second_post.remote_identifier, - "https://www.bbc.co.uk/news/technology-48334739", + post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) + self.assertEquals(post.author, None) def test_empty_entries(self): builder = FeedBuilder diff --git a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py index 6ed8a59..109491b 100644 --- a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py +++ b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py @@ -11,110 +11,137 @@ from newsreader.news.core.models import Post from newsreader.news.core.tests.factories import PostFactory +@freeze_time("2019-10-30 12:30:00") class FeedDuplicateHandlerTestCase(TestCase): def setUp(self): self.maxDiff = None def test_duplicate_entries_with_remote_identifiers(self): rule = CollectionRuleFactory() + existing_post = PostFactory.create( remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule ) - new_post = PostFactory.build( + + new_posts = PostFactory.build_batch( remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", - title="title got updated", + publication_date=timezone.now() - timedelta(days=7), + rule=rule, + size=5, + ) + last_post = PostFactory.build( + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", + publication_date=timezone.now(), rule=rule, ) with FeedDuplicateHandler(rule) as duplicate_handler: - posts_gen = duplicate_handler.check([new_post]) - posts = list(posts_gen) + posts = duplicate_handler.check((*new_posts, last_post)) self.assertEquals(len(posts), 1) post = posts[0] - existing_post.refresh_from_db() - self.assertEquals(existing_post.pk, post.pk) - self.assertEquals(post.publication_date, new_post.publication_date) - self.assertEquals(post.title, new_post.title) - self.assertEquals(post.body, new_post.body) - self.assertEquals(post.rule, new_post.rule) + self.assertEquals( + post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + ) + self.assertEquals(post.title, last_post.title) + self.assertEquals(post.body, last_post.body) + self.assertEquals(post.rule, last_post.rule) self.assertEquals(post.read, False) - @freeze_time("2019-10-30 12:30:00") def test_duplicate_entries_with_different_remote_identifiers(self): rule = CollectionRuleFactory() - publication_date = timezone.now() - existing_post = PostFactory.create( + existing_post = PostFactory( remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", url="https://bbc.com", title="New post", body="Body", - publication_date=publication_date, + publication_date=timezone.now() - timedelta(minutes=10), rule=rule, ) - new_post = PostFactory.build( + + new_posts = PostFactory.build_batch( remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q", url="https://bbc.com", title="New post", body="Body", - publication_date=publication_date, + publication_date=timezone.now() - timedelta(minutes=5), + rule=rule, + size=5, + ) + last_post = PostFactory.build( + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q", + url="https://bbc.com", + title="New post", + body="Body", + publication_date=timezone.now(), rule=rule, ) with FeedDuplicateHandler(rule) as duplicate_handler: - posts_gen = duplicate_handler.check([new_post]) - posts = list(posts_gen) + posts = duplicate_handler.check((*new_posts, last_post)) self.assertEquals(len(posts), 1) - existing_post.refresh_from_db() post = posts[0] - self.assertEquals(existing_post.pk, post.pk) - self.assertEquals(post.title, new_post.title) - self.assertEquals(post.body, new_post.body) - self.assertEquals(post.rule, new_post.rule) - self.assertEquals(post.publication_date, new_post.publication_date) + self.assertEquals( + post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + ) + self.assertEquals(post.title, last_post.title) + self.assertEquals(post.body, last_post.body) + self.assertEquals(post.rule, last_post.rule) self.assertEquals(post.read, False) def test_duplicate_entries_in_recent_database(self): - publication_date = timezone.now() - rule = CollectionRuleFactory() - existing_post = PostFactory.create( + + existing_post = PostFactory( url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", title="Birmingham head teacher threatened over LGBT lessons", body="Google's move to end business ties with Huawei will affect current devices", - publication_date=publication_date, - remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", + publication_date=timezone.now() - timedelta(minutes=10), + remote_identifier=None, rule=rule, ) - new_post = PostFactory.build( + + new_posts = PostFactory.build_batch( url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", title="Birmingham head teacher threatened over LGBT lessons", body="Google's move to end business ties with Huawei will affect current devices", - publication_date=publication_date, + publication_date=timezone.now() - timedelta(minutes=5), + remote_identifier=None, + rule=rule, + size=5, + ) + + last_post = PostFactory.build( + url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", + title="Birmingham head teacher threatened over LGBT lessons", + body="Google's move to end business ties with Huawei will affect current devices", + publication_date=timezone.now(), remote_identifier=None, rule=rule, ) with FeedDuplicateHandler(rule) as duplicate_handler: - posts_gen = duplicate_handler.check([new_post]) - posts = list(posts_gen) + posts = duplicate_handler.check((*new_posts, last_post)) self.assertEquals(len(posts), 1) - existing_post.refresh_from_db() post = posts[0] - self.assertEquals(existing_post.pk, post.pk) - self.assertEquals(post.title, new_post.title) - self.assertEquals(post.body, new_post.body) - self.assertEquals(post.rule, new_post.rule) - self.assertEquals(post.publication_date, new_post.publication_date) + self.assertEquals( + post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + ) + self.assertEquals(post.title, last_post.title) + self.assertEquals(post.body, last_post.body) + self.assertEquals(post.rule, last_post.rule) self.assertEquals(post.read, False) def test_multiple_existing_entries_with_identifier(self): @@ -124,15 +151,20 @@ class FeedDuplicateHandlerTestCase(TestCase): remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, size=5 ) - new_post = PostFactory.build( + new_posts = PostFactory.build_batch( remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", - title="This is a new one", + publication_date=timezone.now() - timedelta(hours=5), + rule=rule, + size=5, + ) + last_post = PostFactory.build( + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", + publication_date=timezone.now() - timedelta(minutes=5), rule=rule, ) with FeedDuplicateHandler(rule) as duplicate_handler: - posts_gen = duplicate_handler.check([new_post]) - posts = list(posts_gen) + posts = duplicate_handler.check((*new_posts, last_post)) self.assertEquals(len(posts), 1) @@ -145,77 +177,101 @@ class FeedDuplicateHandlerTestCase(TestCase): post = posts[0] - self.assertEquals(post.title, new_post.title) - self.assertEquals(post.body, new_post.body) - self.assertEquals(post.publication_date, new_post.publication_date) - self.assertEquals(post.rule, new_post.rule) + self.assertEquals( + post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + ) + self.assertEquals(post.title, last_post.title) + self.assertEquals(post.body, last_post.body) + self.assertEquals(post.rule, last_post.rule) self.assertEquals(post.read, False) - @freeze_time("2019-10-30 12:30:00") def test_duplicate_entries_outside_time_slot(self): - publication_date = timezone.now() - rule = CollectionRuleFactory() - existing_post = PostFactory.create( + + existing_post = PostFactory( url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", title="Birmingham head teacher threatened over LGBT lessons", body="Google's move to end business ties with Huawei will affect current devices", - publication_date=publication_date, - remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", + publication_date=timezone.now(), + remote_identifier=None, rule=rule, ) - new_post = PostFactory.build( + + new_posts = PostFactory.build_batch( url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", title="Birmingham head teacher threatened over LGBT lessons", body="Google's move to end business ties with Huawei will affect current devices", - publication_date=publication_date + timedelta(minutes=12), + publication_date=timezone.now() + timedelta(minutes=12), + remote_identifier=None, + rule=rule, + size=5, + ) + last_post = PostFactory.build( + url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", + title="Birmingham head teacher threatened over LGBT lessons", + body="Google's move to end business ties with Huawei will affect current devices", + publication_date=timezone.now() + timedelta(minutes=13), remote_identifier=None, rule=rule, ) with FeedDuplicateHandler(rule) as duplicate_handler: - posts_gen = duplicate_handler.check([new_post]) - posts = list(posts_gen) + posts = duplicate_handler.check((*new_posts, last_post)) self.assertEquals(len(posts), 1) - existing_post.refresh_from_db() post = posts[0] self.assertEquals(post.pk, None) - self.assertEquals(post.title, new_post.title) - self.assertEquals(post.body, new_post.body) - self.assertEquals(post.rule, new_post.rule) - self.assertEquals(post.publication_date, new_post.publication_date) + self.assertEquals( + post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + ) + self.assertEquals(post.title, last_post.title) + self.assertEquals(post.body, last_post.body) + self.assertEquals(post.rule, last_post.rule) self.assertEquals(post.read, False) def test_duplicate_entries_in_collected_entries(self): rule = CollectionRuleFactory() post_1 = PostFactory.build( - title="title got updated", body="body", url="https://bbc.com", rule=rule + title="title got updated", + body="body", + url="https://bbc.com", + publication_date=timezone.now(), + rule=rule, ) duplicate_post_1 = PostFactory.build( - title="title got updated", body="body", url="https://bbc.com", rule=rule + title="title got updated", + body="body", + url="https://bbc.com", + publication_date=timezone.now() - timedelta(minutes=5), + rule=rule, ) post_2 = PostFactory.build( - remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7" + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", + publication_date=timezone.now(), ) duplicate_post_2 = PostFactory.build( - remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7" + remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", + publication_date=timezone.now() - timedelta(minutes=5), ) collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2) with FeedDuplicateHandler(rule) as duplicate_handler: - posts_gen = duplicate_handler.check(collected_posts) - posts = list(posts_gen) + posts = duplicate_handler.check(collected_posts) self.assertEquals(len(posts), 2) post = posts[0] - self.assertEquals(post_1.publication_date, post.publication_date) + self.assertEquals( + post_1.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + ) self.assertEquals(post_1.title, post.title) self.assertEquals(post_1.body, post.body) self.assertEquals(post_1.rule, post.rule) @@ -223,7 +279,10 @@ class FeedDuplicateHandlerTestCase(TestCase): post = posts[1] - self.assertEquals(post_2.publication_date, post.publication_date) + self.assertEquals( + post_2.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + post.publication_date.strftime("%Y-%m-%d %H-%M-%S"), + ) self.assertEquals(post_2.title, post.title) self.assertEquals(post_2.body, post.body) self.assertEquals(post_2.rule, post.rule) diff --git a/src/newsreader/news/core/models.py b/src/newsreader/news/core/models.py index 28bf3fd..ff44c81 100644 --- a/src/newsreader/news/core/models.py +++ b/src/newsreader/news/core/models.py @@ -8,7 +8,7 @@ from newsreader.news.collection.models import CollectionRule class Post(TimeStampedModel): title = models.CharField(max_length=200, blank=True, null=True) - body = models.TextField(blank=True, null=True) + body = models.TextField(blank=True) author = models.CharField(max_length=40, blank=True, null=True) publication_date = models.DateTimeField(default=timezone.now) url = models.URLField(max_length=1024, blank=True, null=True)