From 9e5e05c056dff8ac39bea39d2e38ea795a44ce32 Mon Sep 17 00:00:00 2001 From: Sonny Bakker Date: Sat, 17 Oct 2020 13:19:49 +0200 Subject: [PATCH] 0.3.3 - Update static configuration - Builder refactor - Fix for images stretching to far --- src/newsreader/fixtures/default-fixture.json | 1 - .../news/collection/exceptions/__init__.py | 16 ++ .../news/collection/exceptions/builder.py | 21 ++ .../{exceptions.py => exceptions/stream.py} | 0 src/newsreader/news/collection/feed.py | 60 ++--- src/newsreader/news/collection/reddit.py | 213 ++++++++++------- .../news/collection/views/rules.html | 16 +- .../collection/tests/feed/builder/tests.py | 221 +++++++----------- .../collection/tests/reddit/builder/tests.py | 57 ----- .../collection/tests/twitter/builder/mocks.py | 199 ++++++++++++++++ .../collection/tests/twitter/builder/tests.py | 19 ++ src/newsreader/news/collection/twitter.py | 79 ++++--- .../scss/components/post/_post.scss | 2 + src/newsreader/templates/base.html | 2 +- webpack.common.babel.js | 3 +- 15 files changed, 568 insertions(+), 341 deletions(-) create mode 100644 src/newsreader/news/collection/exceptions/__init__.py create mode 100644 src/newsreader/news/collection/exceptions/builder.py rename src/newsreader/news/collection/{exceptions.py => exceptions/stream.py} (100%) diff --git a/src/newsreader/fixtures/default-fixture.json b/src/newsreader/fixtures/default-fixture.json index 1794742..880db4c 100644 --- a/src/newsreader/fixtures/default-fixture.json +++ b/src/newsreader/fixtures/default-fixture.json @@ -3427,7 +3427,6 @@ "is_active": true, "date_joined": "2019-07-18T18:52:36.080Z", "email": "sonny@bakker.nl", - "task": 10, "reddit_refresh_token": null, "reddit_access_token": null, "groups": [], diff --git a/src/newsreader/news/collection/exceptions/__init__.py b/src/newsreader/news/collection/exceptions/__init__.py new file mode 100644 index 0000000..35ce72d --- /dev/null +++ b/src/newsreader/news/collection/exceptions/__init__.py @@ -0,0 +1,16 @@ +from newsreader.news.collection.exceptions.builder import ( + BuilderDuplicateException, + BuilderException, + BuilderMissingDataException, + BuilderParseException, +) +from newsreader.news.collection.exceptions.stream import ( + StreamConnectionException, + StreamDeniedException, + StreamException, + StreamForbiddenException, + StreamNotFoundException, + StreamParseException, + StreamTimeOutException, + StreamTooManyException, +) diff --git a/src/newsreader/news/collection/exceptions/builder.py b/src/newsreader/news/collection/exceptions/builder.py new file mode 100644 index 0000000..6fb2d60 --- /dev/null +++ b/src/newsreader/news/collection/exceptions/builder.py @@ -0,0 +1,21 @@ +class BuilderException(Exception): + message = "Builder exception" + + def __init__(self, payload=None, message=None): + self.payload = payload + self.message = message if message else self.message + + def __str__(self): + return self.message + + +class BuilderMissingDataException(BuilderException): + message = "Payload contains missing data" + + +class BuilderDuplicateException(BuilderException): + message = "Payload contains duplicate entry" + + +class BuilderParseException(BuilderException): + message = "Failed to parse payload" diff --git a/src/newsreader/news/collection/exceptions.py b/src/newsreader/news/collection/exceptions/stream.py similarity index 100% rename from src/newsreader/news/collection/exceptions.py rename to src/newsreader/news/collection/exceptions/stream.py diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py index ae6cd42..379f18e 100644 --- a/src/newsreader/news/collection/feed.py +++ b/src/newsreader/news/collection/feed.py @@ -39,6 +39,18 @@ class FeedBuilder(PostBuilder): rule__type = RuleTypeChoices.feed def build(self): + instances = [] + + with FeedDuplicateHandler(self.stream.rule) as duplicate_handler: + entries = self.payload.get("entries", []) + + for entry in entries: + post = self.build_post(entry) + instances.append(post) + + self.instances = duplicate_handler.check(instances) + + def build_post(self, entry): field_mapping = { "id": "remote_identifier", "title": "title", @@ -48,41 +60,37 @@ class FeedBuilder(PostBuilder): "author": "author", } tz = pytz.timezone(self.stream.rule.timezone) - instances = [] + data = {"rule_id": self.stream.rule.pk} - with FeedDuplicateHandler(self.stream.rule) as duplicate_handler: - entries = self.payload.get("entries", []) + for field, model_field in field_mapping.items(): + if not field in entry: + continue - for entry in entries: - data = {"rule_id": self.stream.rule.pk} + value = truncate_text(Post, model_field, entry[field]) - for field, model_field in field_mapping.items(): - if not field in entry: - continue + if field == "published_parsed": + data[model_field] = build_publication_date(value, tz) + elif field == "summary": + data[model_field] = self.sanitize_fragment(value) + else: + data[model_field] = value - value = truncate_text(Post, model_field, entry[field]) + content_details = self.get_content_details(entry) - if field == "published_parsed": - data[model_field] = build_publication_date(value, tz) - elif field == "summary": - data[model_field] = self.sanitize_fragment(value) - else: - data[model_field] = value + # use content details key if it contains more information + if not "body" in data or len(data["body"]) < len(content_details): + data["body"] = content_details - if "content" in entry: - content = self.get_content(entry["content"]) - body = data.get("body", "") + return Post(**data) - if not body or len(body) < len(content): - data["body"] = content + def get_content_details(self, entry): + content_items = entry.get("content") - instances.append(Post(**data)) + if not content_items: + return "" - self.instances = duplicate_handler.check(instances) - - def get_content(self, items): - content = "\n ".join([item.get("value") for item in items]) - return self.sanitize_fragment(content) + content_details = "\n ".join([item.get("value") for item in content_items]) + return self.sanitize_fragment(content_details) class FeedStream(PostStream): diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py index daeb85f..1fbffe2 100644 --- a/src/newsreader/news/collection/reddit.py +++ b/src/newsreader/news/collection/reddit.py @@ -28,6 +28,10 @@ from newsreader.news.collection.constants import ( WHITELISTED_TAGS, ) from newsreader.news.collection.exceptions import ( + BuilderDuplicateException, + BuilderException, + BuilderMissingDataException, + BuilderParseException, StreamDeniedException, StreamException, StreamParseException, @@ -122,99 +126,136 @@ class RedditBuilder(PostBuilder): if not "data" in self.payload or not "children" in self.payload["data"]: return - posts = self.payload["data"]["children"] - rule = self.stream.rule - - for post in posts: - if not "data" in post or post["kind"] != REDDIT_POST: - continue - - data = post["data"] - - remote_identifier = data["id"] - title = truncate_text(Post, "title", data["title"]) - author = truncate_text(Post, "author", data["author"]) - post_url_fragment = data["permalink"] - direct_url = data["url"] - is_text_post = data["is_self"] - - if remote_identifier in results: - continue - - if is_text_post: - uncleaned_body = data["selftext_html"] - unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" - body = self.sanitize_fragment(unescaped_body) if unescaped_body else "" - elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): - body = format_html( - "
{title}
", - url=direct_url, - title=title, - ) - elif data["is_video"]: - video_info = data["secure_media"]["reddit_video"] - - body = format_html( - "
", - url=video_info["fallback_url"], - ) - elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS): - extension = next( - extension.replace(".", "") - for extension in REDDIT_VIDEO_EXTENSIONS - if direct_url.endswith(extension) - ) - - if extension == "gifv": - body = format_html( - "
", - url=direct_url.replace(extension, "mp4"), - ) - else: - body = format_html( - "
", - url=direct_url, - extension=extension, - ) - else: - body = format_html( - "
Direct url
", - url=direct_url, - title=title, - ) + entries = self.payload["data"]["children"] + for entry in entries: try: - parsed_date = datetime.fromtimestamp(post["data"]["created_utc"]) - created_date = pytz.utc.localize(parsed_date) - except (OverflowError, OSError): - logging.warning( - f"Failed parsing timestamp from {REDDIT_URL}{post_url_fragment}" - ) - created_date = timezone.now() - - post_data = { - "remote_identifier": remote_identifier, - "title": title, - "body": body, - "author": author, - "url": f"{REDDIT_URL}{post_url_fragment}", - "publication_date": created_date, - "rule": rule, - } - - if remote_identifier in self.existing_posts: - existing_post = self.existing_posts[remote_identifier] - - for key, value in post_data.items(): - setattr(existing_post, key, value) - - results[existing_post.remote_identifier] = existing_post + post = self.build_post(entry) + except BuilderException: + logger.exception("Failed building post") continue - results[remote_identifier] = Post(**post_data) + identifier = post.remote_identifier + results[identifier] = post self.instances = results.values() + def build_post(self, entry): + rule = self.stream.rule + entry_data = entry.get("data", {}) + remote_identifier = entry_data.get("id", "") + kind = entry.get("kind") + + if remote_identifier in self.existing_posts: + raise BuilderDuplicateException(payload=entry) + elif kind != REDDIT_POST: + raise BuilderParseException( + message=f"Payload is not an reddit post, its of kind {kind}", + payload=entry, + ) + elif not entry_data: + raise BuilderMissingDataException( + message=f"Post {remote_identifier} did not contain any data", + payload=entry, + ) + + try: + title = entry_data["title"] + author = entry_data["author"] + post_url_fragment = entry_data["permalink"] + direct_url = entry_data["url"] + is_text = entry_data["is_self"] + is_video = entry_data["is_video"] + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e + + title = truncate_text(Post, "title", title) + author = truncate_text(Post, "author", author) + + if is_text: + body = self.get_text_post(entry_data) + elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): + body = self.get_image_post(title, direct_url) + elif is_video: + body = self.get_native_video_post(entry_data) + elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS): + body = self.get_video_post(direct_url) + else: + body = self.get_url_post(title, direct_url) + + try: + parsed_date = datetime.fromtimestamp(entry_data["created_utc"]) + created_date = pytz.utc.localize(parsed_date) + except (OverflowError, OSError) as e: + raise BuilderParseException(payload=entry) from e + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e + + post_entry = { + "remote_identifier": remote_identifier, + "title": title, + "body": body, + "author": author, + "url": f"{REDDIT_URL}{post_url_fragment}", + "publication_date": created_date, + "rule": rule, + } + + return Post(**post_entry) + + def get_text_post(self, entry): + try: + uncleaned_body = entry["selftext_html"] + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e + + unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" + return self.sanitize_fragment(unescaped_body) if unescaped_body else "" + + def get_image_post(self, title, url): + return format_html( + "
{title}
", + url=url, + title=title, + ) + + def get_native_video_post(self, entry): + try: + video_info = entry["secure_media"]["reddit_video"] + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e + + return format_html( + "
", + url=video_info["fallback_url"], + ) + + def get_video_post(self, url): + extension = next( + extension.replace(".", "") + for extension in REDDIT_VIDEO_EXTENSIONS + if url.endswith(extension) + ) + + if extension == "gifv": + return format_html( + "
", + url=url.replace(extension, "mp4"), + ) + + return format_html( + "
", + url=url, + extension=extension, + ) + + def get_url_post(self, title, url): + return format_html( + "
Direct url
", + url=url, + title=title, + ) + class RedditStream(PostStream): rule_type = RuleTypeChoices.subreddit diff --git a/src/newsreader/news/collection/templates/news/collection/views/rules.html b/src/newsreader/news/collection/templates/news/collection/views/rules.html index 678716e..ef05352 100644 --- a/src/newsreader/news/collection/templates/news/collection/views/rules.html +++ b/src/newsreader/news/collection/templates/news/collection/views/rules.html @@ -6,19 +6,21 @@
{% csrf_token %} +
+ +
+
- -
diff --git a/src/newsreader/news/collection/tests/feed/builder/tests.py b/src/newsreader/news/collection/tests/feed/builder/tests.py index 571a7cd..7f4edf0 100644 --- a/src/newsreader/news/collection/tests/feed/builder/tests.py +++ b/src/newsreader/news/collection/tests/feed/builder/tests.py @@ -1,4 +1,4 @@ -from datetime import date, datetime, time +from datetime import datetime from unittest.mock import Mock from django.test import TestCase @@ -21,277 +21,233 @@ class FeedBuilderTestCase(TestCase): def setUp(self): self.maxDiff = None - def test_basic_entry(self): - builder = FeedBuilder - rule = FeedFactory() - mock_stream = Mock(rule=rule) - - with builder(simple_mock, mock_stream) as builder: - builder.build() - builder.save() - - post = Post.objects.get() - - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=7, second=37) - ) - aware_date = pytz.utc.localize(publication_date) - - self.assertEquals(post.publication_date, aware_date) - self.assertEquals(Post.objects.count(), 1) - - self.assertEquals( - post.remote_identifier, - "https://www.bbc.co.uk/news/world-us-canada-48338168", - ) - - self.assertEquals( - post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" - ) - - self.assertEquals( - post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" - ) - def test_multiple_entries(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(multiple_mock, mock_stream) as builder: + with FeedBuilder(multiple_mock, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 3) + self.assertEqual(Post.objects.count(), 3) post = posts[0] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=32, second=38) + publication_date = datetime( + 2019, 5, 20, hour=16, minute=32, second=38, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), - aware_date.strftime("%Y-%m-%d %H:%M:%S"), + publication_date.strftime("%Y-%m-%d %H:%M:%S"), ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", ) - self.assertEquals( + self.assertEqual( post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080" ) - self.assertEquals( + self.assertEqual( post.title, "Birmingham head teacher threatened over LGBT lessons" ) post = posts[1] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=7, second=37) + publication_date = datetime( + 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), - aware_date.strftime("%Y-%m-%d %H:%M:%S"), + publication_date.strftime("%Y-%m-%d %H:%M:%S"), ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals( + self.assertEqual( post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) - self.assertEquals( + self.assertEqual( post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" ) def test_entries_without_remote_identifier(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_identifier, mock_stream) as builder: + with FeedBuilder(mock_without_identifier, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=16, minute=7, second=37) + publication_date = datetime( + 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals(post.publication_date, aware_date) - self.assertEquals(post.remote_identifier, None) - self.assertEquals( + self.assertEqual(post.publication_date, publication_date) + self.assertEqual(post.remote_identifier, None) + self.assertEqual( post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) - self.assertEquals( + self.assertEqual( post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" ) post = posts[1] - publication_date = datetime.combine( - date(2019, 5, 20), time(hour=12, minute=19, second=19) + publication_date = datetime( + 2019, 5, 20, hour=12, minute=19, second=19, tzinfo=pytz.utc ) - aware_date = pytz.utc.localize(publication_date) - self.assertEquals(post.publication_date, aware_date) - self.assertEquals(post.remote_identifier, None) - self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739") - self.assertEquals(post.title, "Huawei's Android loss: How it affects you") + self.assertEqual(post.publication_date, publication_date) + self.assertEqual(post.remote_identifier, None) + self.assertEqual(post.url, "https://www.bbc.co.uk/news/technology-48334739") + self.assertEqual(post.title, "Huawei's Android loss: How it affects you") def test_entry_without_publication_date(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_publish_date, mock_stream) as builder: + with FeedBuilder(mock_without_publish_date, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" ) - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) post = posts[1] - self.assertEquals( + self.assertEqual( post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" ) - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) def test_entry_without_url(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_url, mock_stream) as builder: + with FeedBuilder(mock_without_url, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) post = posts[1] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) def test_entry_without_body(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_body, mock_stream) as builder: + with FeedBuilder(mock_without_body, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals( + self.assertEqual( post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", ) - self.assertEquals(post.body, "") + self.assertEqual(post.body, "") post = posts[1] - self.assertEquals( + self.assertEqual( post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" ) - self.assertEquals( + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals(post.body, "") + self.assertEqual(post.body, "") def test_entry_without_author(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_author, mock_stream) as builder: + with FeedBuilder(mock_without_author, mock_stream) as builder: builder.build() builder.save() posts = Post.objects.order_by("-publication_date") - self.assertEquals(Post.objects.count(), 2) + self.assertEqual(Post.objects.count(), 2) post = posts[0] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168", ) - self.assertEquals(post.author, None) + self.assertEqual(post.author, None) post = posts[1] - self.assertEquals(post.created, timezone.now()) - self.assertEquals( + self.assertEqual(post.created, timezone.now()) + self.assertEqual( post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) - self.assertEquals(post.author, None) + self.assertEqual(post.author, None) def test_empty_entries(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_without_entries, mock_stream) as builder: + with FeedBuilder(mock_without_entries, mock_stream) as builder: builder.build() builder.save() - self.assertEquals(Post.objects.count(), 0) + self.assertEqual(Post.objects.count(), 0) def test_update_entries(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) @@ -303,36 +259,35 @@ class FeedBuilderTestCase(TestCase): remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule ) - with builder(mock_with_update_entries, mock_stream) as builder: + with FeedBuilder(mock_with_update_entries, mock_stream) as builder: builder.build() builder.save() - self.assertEquals(Post.objects.count(), 3) + self.assertEqual(Post.objects.count(), 3) existing_first_post.refresh_from_db() existing_second_post.refresh_from_db() - self.assertEquals( + self.assertEqual( existing_first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif", ) - self.assertEquals( + self.assertEqual( existing_second_post.title, "Huawei's Android loss: How it affects you" ) def test_html_sanitizing(self): - builder = FeedBuilder rule = FeedFactory() mock_stream = Mock(rule=rule) - with builder(mock_with_html, mock_stream) as builder: + with FeedBuilder(mock_with_html, mock_stream) as builder: builder.build() builder.save() post = Post.objects.get() - self.assertEquals(Post.objects.count(), 1) + self.assertEqual(Post.objects.count(), 1) self.assertTrue("
" in post.body) self.assertTrue("

" in post.body) @@ -345,64 +300,60 @@ class FeedBuilderTestCase(TestCase): self.assertTrue("