From c68a58136cec339b8fbca621332c154e7aa9919c Mon Sep 17 00:00:00 2001
From: Sonny Bakker
Date: Sun, 11 Oct 2020 21:50:23 +0200
Subject: [PATCH] Refactor RedditBuilder

---
 src/newsreader/news/collection/reddit.py     | 182 ++++++++++--------
 .../collection/tests/reddit/builder/tests.py |  57 ------
 2 files changed, 101 insertions(+), 138 deletions(-)

diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py
index daeb85f..c0c2dc9 100644
--- a/src/newsreader/news/collection/reddit.py
+++ b/src/newsreader/news/collection/reddit.py
@@ -122,99 +122,119 @@ class RedditBuilder(PostBuilder):
         if not "data" in self.payload or not "children" in self.payload["data"]:
             return
 
-        posts = self.payload["data"]["children"]
-        rule = self.stream.rule
+        entries = self.payload["data"]["children"]
 
-        for post in posts:
-            if not "data" in post or post["kind"] != REDDIT_POST:
+        for entry in entries:
+            if not "data" in entry:
+                continue
+            elif entry.get("kind") != REDDIT_POST:
+                continue
+            elif not "id" in entry["data"]:
                 continue
 
-            data = post["data"]
+            remote_identifier = entry["data"]["id"]
 
-            remote_identifier = data["id"]
-            title = truncate_text(Post, "title", data["title"])
-            author = truncate_text(Post, "author", data["author"])
-            post_url_fragment = data["permalink"]
-            direct_url = data["url"]
-            is_text_post = data["is_self"]
-
-            if remote_identifier in results:
+            if remote_identifier in results or remote_identifier in self.existing_posts:
                 continue
 
-            if is_text_post:
-                uncleaned_body = data["selftext_html"]
-                unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
-                body = self.sanitize_fragment(unescaped_body) if unescaped_body else ""
-            elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
-                body = format_html(
-                    "<figure><img src='{url}'><figcaption>{title}</figcaption></figure>",
-                    url=direct_url,
-                    title=title,
-                )
-            elif data["is_video"]:
-                video_info = data["secure_media"]["reddit_video"]
-
-                body = format_html(
-                    "<video controls><source src='{url}'></video>",
-                    url=video_info["fallback_url"],
-                )
-            elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
-                extension = next(
-                    extension.replace(".", "")
-                    for extension in REDDIT_VIDEO_EXTENSIONS
-                    if direct_url.endswith(extension)
-                )
-
-                if extension == "gifv":
-                    body = format_html(
-                        "<video controls><source src='{url}'></video>",
-                        url=direct_url.replace(extension, "mp4"),
-                    )
-                else:
-                    body = format_html(
-                        "<video controls><source src='{url}' type='video/{extension}'></video>",
-                        url=direct_url,
-                        extension=extension,
-                    )
-            else:
-                body = format_html(
-                    "<a href='{url}' title='{title}'>Direct url</a>",
-                    url=direct_url,
-                    title=title,
-                )
-
             try:
-                parsed_date = datetime.fromtimestamp(post["data"]["created_utc"])
-                created_date = pytz.utc.localize(parsed_date)
-            except (OverflowError, OSError):
-                logging.warning(
-                    f"Failed parsing timestamp from {REDDIT_URL}{post_url_fragment}"
-                )
-                created_date = timezone.now()
-
-            post_data = {
-                "remote_identifier": remote_identifier,
-                "title": title,
-                "body": body,
-                "author": author,
-                "url": f"{REDDIT_URL}{post_url_fragment}",
-                "publication_date": created_date,
-                "rule": rule,
-            }
-
-            if remote_identifier in self.existing_posts:
-                existing_post = self.existing_posts[remote_identifier]
-
-                for key, value in post_data.items():
-                    setattr(existing_post, key, value)
-
-                results[existing_post.remote_identifier] = existing_post
+                post = self.build_post(entry["data"])
+            except KeyError:
+                logger.exception(f"Failed building post {remote_identifier}")
                 continue
 
-            results[remote_identifier] = Post(**post_data)
+            results[remote_identifier] = post
 
         self.instances = results.values()
 
+    def build_post(self, entry):
+        rule = self.stream.rule
+
+        remote_identifier = entry["id"]
+        title = truncate_text(Post, "title", entry["title"])
+        author = truncate_text(Post, "author", entry["author"])
+        post_url_fragment = entry["permalink"]
+        direct_url = entry["url"]
+
+        if entry["is_self"]:
+            body = self.get_text_post(entry)
+        elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
+            body = self.get_image_post(entry)
+        elif entry["is_video"]:
+            body = self.get_native_video_post(entry)
+        elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
+            body = self.get_video_post(entry)
+        else:
+            body = self.get_url_post(entry)
+
+        try:
+            parsed_date = datetime.fromtimestamp(entry["created_utc"])
+            created_date = pytz.utc.localize(parsed_date)
+        except (OverflowError, OSError):
+            logging.warning(
+                f"Failed parsing timestamp from {REDDIT_URL}{post_url_fragment}"
+            )
+            created_date = timezone.now()
+
+        post_entry = {
+            "remote_identifier": remote_identifier,
+            "title": title,
+            "body": body,
+            "author": author,
+            "url": f"{REDDIT_URL}{post_url_fragment}",
+            "publication_date": created_date,
+            "rule": rule,
+        }
+
+        return Post(**post_entry)
+
+    def get_text_post(self, entry):
+        uncleaned_body = entry["selftext_html"]
+        unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
+        return self.sanitize_fragment(unescaped_body) if unescaped_body else ""
+
+    def get_image_post(self, entry):
+        return format_html(
+            "<figure><img src='{url}'><figcaption>{title}</figcaption></figure>",
+            url=entry["url"],
+            title=entry["title"],
+        )
+
+    def get_native_video_post(self, entry):
+        video_info = entry["secure_media"]["reddit_video"]
+
+        return format_html(
+            "<video controls><source src='{url}'></video>",
+            url=video_info["fallback_url"],
+        )
+
+    def get_video_post(self, entry):
+        url = entry["url"]
+        extension = next(
+            extension.replace(".", "")
+            for extension in REDDIT_VIDEO_EXTENSIONS
+            if url.endswith(extension)
+        )
+
+        if extension == "gifv":
+            return format_html(
+                "<video controls><source src='{url}'></video>",
+                url=url.replace(extension, "mp4"),
+            )
+
+        return format_html(
+            "<video controls><source src='{url}' type='video/{extension}'></video>",
+            url=url,
+            extension=extension,
+        )
+
+    def get_url_post(self, entry):
+        return format_html(
+            "<a href='{url}' title='{title}'>Direct url</a>",
+            url=entry["url"],
+            title=entry["title"],
+        )
+
 
 class RedditStream(PostStream):
     rule_type = RuleTypeChoices.subreddit
diff --git a/src/newsreader/news/collection/tests/reddit/builder/tests.py b/src/newsreader/news/collection/tests/reddit/builder/tests.py
index 11cf549..e326ed6 100644
--- a/src/newsreader/news/collection/tests/reddit/builder/tests.py
+++ b/src/newsreader/news/collection/tests/reddit/builder/tests.py
@@ -86,52 +86,6 @@ class RedditBuilderTestCase(TestCase):
 
         self.assertEquals(Post.objects.count(), 0)
 
-    def test_update_posts(self):
-        subreddit = SubredditFactory()
-        existing_post = RedditPostFactory(
-            remote_identifier="hm0qct",
-            author="Old author",
-            title="Old title",
-            body="Old body",
-            url="https://bbc.com/",
-            rule=subreddit,
-        )
-
-        builder = RedditBuilder
-        mock_stream = Mock(rule=subreddit)
-
-        with builder(simple_mock, mock_stream) as builder:
-            builder.build()
-            builder.save()
-
-        posts = {post.remote_identifier: post for post in Post.objects.all()}
-
-        self.assertCountEqual(
-            ("hm0qct", "hna75r", "hngs71", "hngsj8", "hnd7cy"), posts.keys()
-        )
-
-        existing_post.refresh_from_db()
-
-        self.assertEquals(existing_post.remote_identifier, "hm0qct")
-        self.assertEquals(existing_post.author, "AutoModerator")
-        self.assertEquals(
-            existing_post.title,
-            "Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
-        )
-        self.assertIn(
-            "This megathread is also to hear opinions from anyone just starting out "
-            "with Linux or those that have used Linux (GNU or otherwise) for a long time.",
-            existing_post.body,
-        )
-        self.assertEquals(
-            existing_post.publication_date,
-            pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
-        )
-        self.assertEquals(
-            existing_post.url,
-            "https://www.reddit.com/r/linux/comments/hm0qct/linux_experiencesrants_or_educationcertifications/",
-        )
-
     def test_html_sanitizing(self):
         builder = RedditBuilder
 
@@ -225,17 +179,6 @@ class RedditBuilderTestCase(TestCase):
             ("hm0qct", "hna75r", "hngs71", "hngsj8", "hnd7cy"), posts.keys()
         )
 
-        duplicate_post.refresh_from_db()
-
-        self.assertEquals(
-            duplicate_post.publication_date,
-            pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
-        )
-        self.assertEquals(
-            duplicate_post.title,
-            "Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
-        )
-
     def test_image_post(self):
         builder = RedditBuilder
 