From 90553168df0514da83eb44498da7f10216f6024f Mon Sep 17 00:00:00 2001 From: Sonny Bakker Date: Thu, 8 Oct 2020 22:51:36 +0200 Subject: [PATCH] Refactor FeedBuilder --- src/newsreader/news/collection/feed.py | 65 +++-- .../collection/tests/feed/builder/tests.py | 242 ++++++++---------- 2 files changed, 144 insertions(+), 163 deletions(-) diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py index ae6cd42..f9bf294 100644 --- a/src/newsreader/news/collection/feed.py +++ b/src/newsreader/news/collection/feed.py @@ -39,6 +39,23 @@ class FeedBuilder(PostBuilder): rule__type = RuleTypeChoices.feed def build(self): + instances = [] + + with FeedDuplicateHandler(self.stream.rule) as duplicate_handler: + entries = self.payload.get("entries", []) + + for entry in entries: + try: + post = self.build_post(entry) + except KeyError: + logger.exception(f"Failed building post") + continue + + instances.append(post) + + self.instances = duplicate_handler.check(instances) + + def build_post(self, entry): field_mapping = { "id": "remote_identifier", "title": "title", @@ -48,41 +65,37 @@ class FeedBuilder(PostBuilder): "author": "author", } tz = pytz.timezone(self.stream.rule.timezone) - instances = [] + data = {"rule_id": self.stream.rule.pk} - with FeedDuplicateHandler(self.stream.rule) as duplicate_handler: - entries = self.payload.get("entries", []) + for field, model_field in field_mapping.items(): + if not field in entry: + continue - for entry in entries: - data = {"rule_id": self.stream.rule.pk} + value = truncate_text(Post, model_field, entry[field]) - for field, model_field in field_mapping.items(): - if not field in entry: - continue + if field == "published_parsed": + data[model_field] = build_publication_date(value, tz) + elif field == "summary": + data[model_field] = self.sanitize_fragment(value) + else: + data[model_field] = value - value = truncate_text(Post, model_field, entry[field]) + content_details = self.get_content_details(entry) - if field == "published_parsed": - data[model_field] = build_publication_date(value, tz) - elif field == "summary": - data[model_field] = self.sanitize_fragment(value) - else: - data[model_field] = value + # use content details key if it contains more information + if not "body" in data or len(data["body"]) < len(content_details): + data["body"] = content_details - if "content" in entry: - content = self.get_content(entry["content"]) - body = data.get("body", "") + return Post(**data) - if not body or len(body) < len(content): - data["body"] = content + def get_content_details(self, entry): + content_items = entry.get("content") - instances.append(Post(**data)) + if not content_items: + return "" - self.instances = duplicate_handler.check(instances) - - def get_content(self, items): - content = "\n ".join([item.get("value") for item in items]) - return self.sanitize_fragment(content) + content_details = "\n ".join([item.get("value") for item in content_items]) + return self.sanitize_fragment(content_details) class FeedStream(PostStream): diff --git a/src/newsreader/news/collection/tests/feed/builder/tests.py b/src/newsreader/news/collection/tests/feed/builder/tests.py index 571a7cd..9672186 100644 --- a/src/newsreader/news/collection/tests/feed/builder/tests.py +++ b/src/newsreader/news/collection/tests/feed/builder/tests.py @@ -1,5 +1,6 @@ -from datetime import date, datetime, time -from unittest.mock import Mock +from datetime import datetime +from unittest.mock import Mock, patch +from uuid import uuid4 from django.test import TestCase from django.utils import timezone @@ -21,277 +22,233 @@ class FeedBuilderTestCase(TestCase): def setUp(self): self.maxDiff = None - def test_basic_entry(self): - builder = FeedBuilder - rule = FeedFactory() - mock_stream = Mock(rule=rule) - - with builder(simple_mock, mock_stream) as builder: - builder.build() - builder.save() - - post = Post.objects.get() - - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=7, second=37) - ) - aware_date = pytz.utc.localize(publication_date) - - self.assertEquals(post.publication_date, aware_date) - self.assertEquals(Post.objects.count(), 1) - - self.assertEquals( - post.remote_identifier, - "https://www.bbc.co.uk/news/world-us-canada-48338168", - ) - - self.assertEquals( - post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" - ) - - self.assertEquals( - post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" - ) - def test_multiple_entries(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(multiple_mock, mock_stream) as builder: + with FeedBuilder(multiple_mock, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 3) + self.assertEqual(Post.objects.count(), 3) post = posts[0] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=32, second=38) + publication_date = datetime( + 2019, 5, 20, hour=16, minute=32, second=38, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), - aware_date.strftime("%Y-%m-%d %H:%M:%S"), + publication_date.strftime("%Y-%m-%d %H:%M:%S"), ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", ) - self.assertEquals( + self.assertEqual( post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080" ) - self.assertEquals( + self.assertEqual( post.title, "Birmingham head teacher threatened over LGBT lessons" ) post = posts[1] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=7, second=37) + publication_date = datetime( + 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), - aware_date.strftime("%Y-%m-%d %H:%M:%S"), + publication_date.strftime("%Y-%m-%d %H:%M:%S"), ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals( + self.assertEqual( post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) - self.assertEquals( + self.assertEqual( post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" ) def test_entries_without_remote_identifier(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_identifier, mock_stream) as builder: + with FeedBuilder(mock_without_identifier, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=7, second=37) + publication_date = datetime( + 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals(post.publication_date, aware_date) - self.assertEquals(post.remote_identifier, None) - self.assertEquals( + self.assertEqual(post.publication_date, publication_date) + self.assertEqual(post.remote_identifier, None) + self.assertEqual( post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) - self.assertEquals( + self.assertEqual( post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" ) post = posts[1] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=12, minute=19, second=19) + publication_date = datetime( + 2019, 5, 20, hour=12, minute=19, second=19, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals(post.publication_date, aware_date) - self.assertEquals(post.remote_identifier, None) - self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739") - self.assertEquals(post.title, "Huawei's Android loss: How it affects you") + self.assertEqual(post.publication_date, publication_date) + self.assertEqual(post.remote_identifier, None) + self.assertEqual(post.url, "https://www.bbc.co.uk/news/technology-48334739") + self.assertEqual(post.title, "Huawei's Android loss: How it affects you") def test_entry_without_publication_date(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_publish_date, mock_stream) as builder: + with FeedBuilder(mock_without_publish_date, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" ) - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) post = posts[1] - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" ) - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) def test_entry_without_url(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_url, mock_stream) as builder: + with FeedBuilder(mock_without_url, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) post = posts[1] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) def test_entry_without_body(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_body, mock_stream) as builder: + with FeedBuilder(mock_without_body, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals( + self.assertEqual( post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", ) - self.assertEquals(post.body, "") + self.assertEqual(post.body, "") post = posts[1] - self.assertEquals( + self.assertEqual( post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals(post.body, "") + self.assertEqual(post.body, "") def test_entry_without_author(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_author, mock_stream) as builder: + with FeedBuilder(mock_without_author, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals(post.author, None) + self.assertEqual(post.author, None) post = posts[1] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) - self.assertEquals(post.author, None) + self.assertEqual(post.author, None) def test_empty_entries(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_entries, mock_stream) as builder: + with FeedBuilder(mock_without_entries, mock_stream) as builder: builder.build() builder.save() - self.assertEquals(Post.objects.count(), 0) + self.assertEqual(Post.objects.count(), 0) def test_update_entries(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) @@ -303,36 +260,35 @@ class FeedBuilderTestCase(TestCase): remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule ) - with builder(mock_with_update_entries, mock_stream) as builder: + with FeedBuilder(mock_with_update_entries, mock_stream) as builder: builder.build() builder.save() - self.assertEquals(Post.objects.count(), 3) + self.assertEqual(Post.objects.count(), 3) existing_first_post.refresh_from_db() existing_second_post.refresh_from_db() - self.assertEquals( + self.assertEqual( existing_first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif", ) - self.assertEquals( + self.assertEqual( existing_second_post.title, "Huawei's Android loss: How it affects you" ) def test_html_sanitizing(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_with_html, mock_stream) as builder: + with FeedBuilder(mock_with_html, mock_stream) as builder: builder.build() builder.save() post = Post.objects.get() - self.assertEquals(Post.objects.count(), 1) + self.assertEqual(Post.objects.count(), 1) self.assertTrue("
" in post.body) self.assertTrue("

" in post.body) @@ -345,64 +301,60 @@ class FeedBuilderTestCase(TestCase): self.assertTrue("