From e89b4c04a10420f7c891f889077001d0947ffa71 Mon Sep 17 00:00:00 2001 From: Sonny Bakker Date: Wed, 14 Oct 2020 21:39:36 +0200 Subject: [PATCH] Use new builder exceptions in RedditBuilder --- src/newsreader/news/collection/reddit.py | 117 +++++++++++++---------- 1 file changed, 69 insertions(+), 48 deletions(-) diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py index c0c2dc9..1fbffe2 100644 --- a/src/newsreader/news/collection/reddit.py +++ b/src/newsreader/news/collection/reddit.py @@ -28,6 +28,10 @@ from newsreader.news.collection.constants import ( WHITELISTED_TAGS, ) from newsreader.news.collection.exceptions import ( + BuilderDuplicateException, + BuilderException, + BuilderMissingDataException, + BuilderParseException, StreamDeniedException, StreamException, StreamParseException, @@ -125,56 +129,67 @@ class RedditBuilder(PostBuilder): entries = self.payload["data"]["children"] for entry in entries: - if not "data" in entry: - continue - elif entry.get("kind") != REDDIT_POST: - continue - elif not "id" in entry["data"]: - continue - - remote_identifier = entry["data"]["id"] - - if remote_identifier in results or remote_identifier in self.existing_posts: - continue - try: - post = self.build_post(entry["data"]) - except KeyError: - logger.exception(f"Failed building post {remote_identifier}") + post = self.build_post(entry) + except BuilderException: + logger.exception("Failed building post") continue - results[remote_identifier] = post + identifier = post.remote_identifier + results[identifier] = post self.instances = results.values() def build_post(self, entry): rule = self.stream.rule + entry_data = entry.get("data", {}) + remote_identifier = entry_data.get("id", "") + kind = entry.get("kind") - remote_identifier = entry["id"] - title = truncate_text(Post, "title", entry["title"]) - author = truncate_text(Post, "author", entry["author"]) - post_url_fragment = entry["permalink"] - direct_url = entry["url"] - - if entry["is_self"]: - body = self.get_text_post(entry) - elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): - body = self.get_image_post(entry) - elif entry["is_video"]: - body = self.get_native_video_post(entry) - elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS): - body = self.get_video_post(entry) - else: - body = self.get_url_post(entry) + if remote_identifier in self.existing_posts: + raise BuilderDuplicateException(payload=entry) + elif kind != REDDIT_POST: + raise BuilderParseException( + message=f"Payload is not an reddit post, its of kind {kind}", + payload=entry, + ) + elif not entry_data: + raise BuilderMissingDataException( + message=f"Post {remote_identifier} did not contain any data", + payload=entry, + ) try: - parsed_date = datetime.fromtimestamp(entry["created_utc"]) + title = entry_data["title"] + author = entry_data["author"] + post_url_fragment = entry_data["permalink"] + direct_url = entry_data["url"] + is_text = entry_data["is_self"] + is_video = entry_data["is_video"] + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e + + title = truncate_text(Post, "title", title) + author = truncate_text(Post, "author", author) + + if is_text: + body = self.get_text_post(entry_data) + elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): + body = self.get_image_post(title, direct_url) + elif is_video: + body = self.get_native_video_post(entry_data) + elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS): + body = self.get_video_post(direct_url) + else: + body = self.get_url_post(title, direct_url) + + try: + parsed_date = datetime.fromtimestamp(entry_data["created_utc"]) created_date = pytz.utc.localize(parsed_date) - except (OverflowError, OSError): - logging.warning( - f"Failed parsing timestamp from {REDDIT_URL}{post_url_fragment}" - ) - created_date = timezone.now() + except (OverflowError, OSError) as e: + raise BuilderParseException(payload=entry) from e + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e post_entry = { "remote_identifier": remote_identifier, @@ -189,27 +204,33 @@ class RedditBuilder(PostBuilder): return Post(**post_entry) def get_text_post(self, entry): - uncleaned_body = entry["selftext_html"] + try: + uncleaned_body = entry["selftext_html"] + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e + unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" return self.sanitize_fragment(unescaped_body) if unescaped_body else "" - def get_image_post(self, entry): + def get_image_post(self, title, url): return format_html( "
{title}
", - url=entry["url"], - title=entry["title"], + url=url, + title=title, ) def get_native_video_post(self, entry): - video_info = entry["secure_media"]["reddit_video"] + try: + video_info = entry["secure_media"]["reddit_video"] + except KeyError as e: + raise BuilderMissingDataException(payload=entry) from e return format_html( "
", url=video_info["fallback_url"], ) - def get_video_post(self, entry): - url = entry["url"] + def get_video_post(self, url): extension = next( extension.replace(".", "") for extension in REDDIT_VIDEO_EXTENSIONS @@ -228,11 +249,11 @@ class RedditBuilder(PostBuilder): extension=extension, ) - def get_url_post(self, entry): + def get_url_post(self, title, url): return format_html( "
Direct url
", - url=entry["url"], - title=entry["title"], + url=url, + title=title, )