diff --git a/src/newsreader/news/collection/twitter.py b/src/newsreader/news/collection/twitter.py
index 99d8c1c..36047a5 100644
--- a/src/newsreader/news/collection/twitter.py
+++ b/src/newsreader/news/collection/twitter.py
@@ -22,6 +22,10 @@ from newsreader.news.collection.base import (
)
from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices
from newsreader.news.collection.exceptions import (
+ BuilderDuplicateException,
+ BuilderException,
+ BuilderMissingDataException,
+ BuilderParseException,
StreamDeniedException,
StreamException,
StreamNotFoundException,
@@ -50,56 +54,74 @@ class TwitterBuilder(PostBuilder):
results = {}
for post in self.payload:
- remote_identifier = post.get("id_str")
-
- if not remote_identifier or remote_identifier in self.existing_posts:
- continue
-
try:
- results[remote_identifier] = self.build_post(post)
- except KeyError:
- logger.exception(f"Failed building post {remote_identifier}")
+ post = self.build_post(post)
+ except BuilderException:
+ logger.exception("Failed building post")
continue
+ identifier = post.remote_identifier
+ results[identifier] = post
+
self.instances = results.values()
def build_post(self, data):
- remote_identifier = data["id_str"]
- body = urlize(data["full_text"], nofollow=True)
- title = truncate_text(Post, "title", self.sanitize_fragment(data["full_text"]))
- url = f"{TWITTER_URL}/{self.stream.rule.screen_name}/status/{remote_identifier}"
+ remote_identifier = data.get("id_str", "")
+ rule = self.stream.rule
- publication_date = pytz.utc.localize(
- datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
- )
+ if remote_identifier in self.existing_posts:
+ raise BuilderDuplicateException(payload=data)
+
+ try:
+ body = urlize(data["full_text"], nofollow=True)
+ title = truncate_text(
+ Post, "title", self.sanitize_fragment(data["full_text"])
+ )
+
+ publication_date = pytz.utc.localize(
+ datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
+ )
+ except KeyError as e:
+ raise BuilderMissingDataException(payload=data) from e
+ except (OverflowError, OSError) as e:
+ raise BuilderParseException(payload=data) from e
+
+ url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
if "extended_entities" in data:
try:
media_entities = self.get_media_entities(data)
body += media_entities
- except KeyError:
- logger.exception(f"Failed parsing media_entities for {url}")
+ except KeyError as e:
+ raise BuilderMissingDataException(
+ message="Failed parsing data for media entities", payload=data
+ ) from e
- if "retweeted_status" in data:
- original_post = data["retweeted_status"]
- original_tweet = urlize(original_post["full_text"], nofollow=True)
- body = f"{body}