From 379f8516e3a9b744be6a8e446a03de9bb900d84c Mon Sep 17 00:00:00 2001 From: Sonny Bakker Date: Wed, 14 Oct 2020 22:18:58 +0200 Subject: [PATCH] Use new builder exceptions in TwitterBuilder --- src/newsreader/news/collection/twitter.py | 78 +++++++++++++++-------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/src/newsreader/news/collection/twitter.py b/src/newsreader/news/collection/twitter.py index 99d8c1c..36047a5 100644 --- a/src/newsreader/news/collection/twitter.py +++ b/src/newsreader/news/collection/twitter.py @@ -22,6 +22,10 @@ from newsreader.news.collection.base import ( ) from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices from newsreader.news.collection.exceptions import ( + BuilderDuplicateException, + BuilderException, + BuilderMissingDataException, + BuilderParseException, StreamDeniedException, StreamException, StreamNotFoundException, @@ -50,56 +54,74 @@ class TwitterBuilder(PostBuilder): results = {} for post in self.payload: - remote_identifier = post.get("id_str") - - if not remote_identifier or remote_identifier in self.existing_posts: - continue - try: - results[remote_identifier] = self.build_post(post) - except KeyError: - logger.exception(f"Failed building post {remote_identifier}") + post = self.build_post(post) + except BuilderException: + logger.exception("Failed building post") continue + identifier = post.remote_identifier + results[identifier] = post + self.instances = results.values() def build_post(self, data): - remote_identifier = data["id_str"] - body = urlize(data["full_text"], nofollow=True) - title = truncate_text(Post, "title", self.sanitize_fragment(data["full_text"])) - url = f"{TWITTER_URL}/{self.stream.rule.screen_name}/status/{remote_identifier}" + remote_identifier = data.get("id_str", "") + rule = self.stream.rule - publication_date = pytz.utc.localize( - datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y") - ) + if remote_identifier in self.existing_posts: + raise BuilderDuplicateException(payload=data) + + try: + body = urlize(data["full_text"], nofollow=True) + title = truncate_text( + Post, "title", self.sanitize_fragment(data["full_text"]) + ) + + publication_date = pytz.utc.localize( + datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y") + ) + except KeyError as e: + raise BuilderMissingDataException(payload=data) from e + except (OverflowError, OSError) as e: + raise BuilderParseException(payload=data) from e + + url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}" if "extended_entities" in data: try: media_entities = self.get_media_entities(data) body += media_entities - except KeyError: - logger.exception(f"Failed parsing media_entities for {url}") + except KeyError as e: + raise BuilderMissingDataException( + message="Failed parsing data for media entities", payload=data + ) from e - if "retweeted_status" in data: - original_post = data["retweeted_status"] - original_tweet = urlize(original_post["full_text"], nofollow=True) - body = f"{body}
Original tweet: {original_tweet}
" - if "quoted_status" in data: - original_post = data["quoted_status"] - original_tweet = urlize(original_post["full_text"], nofollow=True) - body = f"{body}
Quoted tweet: {original_tweet}
" + try: + if "retweeted_status" in data: + original_post = data["retweeted_status"] + original_tweet = urlize(original_post["full_text"], nofollow=True) + body = f"{body}
Original tweet: {original_tweet}
" + if "quoted_status" in data: + original_post = data["quoted_status"] + original_tweet = urlize(original_post["full_text"], nofollow=True) + body = f"{body}
Quoted tweet: {original_tweet}
" + except KeyError as e: + raise BuilderMissingDataException( + message="Failed parsing data for original tweet", payload=data + ) from e body = self.sanitize_fragment(body) return Post( **{ - "remote_identifier": data["id_str"], + "remote_identifier": remote_identifier, "title": fix_text(title), "body": fix_text(body), - "author": self.stream.rule.screen_name, + "author": rule.screen_name, "publication_date": publication_date, "url": url, - "rule": self.stream.rule, + "rule": rule, } )