diff --git a/src/newsreader/fixtures/default-fixture.json b/src/newsreader/fixtures/default-fixture.json index 1794742..880db4c 100644 --- a/src/newsreader/fixtures/default-fixture.json +++ b/src/newsreader/fixtures/default-fixture.json @@ -3427,7 +3427,6 @@ "is_active": true, "date_joined": "2019-07-18T18:52:36.080Z", "email": "sonny@bakker.nl", - "task": 10, "reddit_refresh_token": null, "reddit_access_token": null, "groups": [], diff --git a/src/newsreader/news/collection/exceptions/__init__.py b/src/newsreader/news/collection/exceptions/__init__.py new file mode 100644 index 0000000..35ce72d --- /dev/null +++ b/src/newsreader/news/collection/exceptions/__init__.py @@ -0,0 +1,16 @@ +from newsreader.news.collection.exceptions.builder import ( + BuilderDuplicateException, + BuilderException, + BuilderMissingDataException, + BuilderParseException, +) +from newsreader.news.collection.exceptions.stream import ( + StreamConnectionException, + StreamDeniedException, + StreamException, + StreamForbiddenException, + StreamNotFoundException, + StreamParseException, + StreamTimeOutException, + StreamTooManyException, +) diff --git a/src/newsreader/news/collection/exceptions/builder.py b/src/newsreader/news/collection/exceptions/builder.py new file mode 100644 index 0000000..6fb2d60 --- /dev/null +++ b/src/newsreader/news/collection/exceptions/builder.py @@ -0,0 +1,21 @@ +class BuilderException(Exception): + message = "Builder exception" + + def __init__(self, payload=None, message=None): + self.payload = payload + self.message = message if message else self.message + + def __str__(self): + return self.message + + +class BuilderMissingDataException(BuilderException): + message = "Payload contains missing data" + + +class BuilderDuplicateException(BuilderException): + message = "Payload contains duplicate entry" + + +class BuilderParseException(BuilderException): + message = "Failed to parse payload" diff --git a/src/newsreader/news/collection/exceptions.py b/src/newsreader/news/collection/exceptions/stream.py similarity index 100% rename from src/newsreader/news/collection/exceptions.py rename to src/newsreader/news/collection/exceptions/stream.py diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py index ae6cd42..379f18e 100644 --- a/src/newsreader/news/collection/feed.py +++ b/src/newsreader/news/collection/feed.py @@ -39,6 +39,18 @@ class FeedBuilder(PostBuilder): rule__type = RuleTypeChoices.feed def build(self): + instances = [] + + with FeedDuplicateHandler(self.stream.rule) as duplicate_handler: + entries = self.payload.get("entries", []) + + for entry in entries: + post = self.build_post(entry) + instances.append(post) + + self.instances = duplicate_handler.check(instances) + + def build_post(self, entry): field_mapping = { "id": "remote_identifier", "title": "title", @@ -48,41 +60,37 @@ class FeedBuilder(PostBuilder): "author": "author", } tz = pytz.timezone(self.stream.rule.timezone) - instances = [] + data = {"rule_id": self.stream.rule.pk} - with FeedDuplicateHandler(self.stream.rule) as duplicate_handler: - entries = self.payload.get("entries", []) + for field, model_field in field_mapping.items(): + if not field in entry: + continue - for entry in entries: - data = {"rule_id": self.stream.rule.pk} + value = truncate_text(Post, model_field, entry[field]) - for field, model_field in field_mapping.items(): - if not field in entry: - continue + if field == "published_parsed": + data[model_field] = build_publication_date(value, tz) + elif field == "summary": + data[model_field] = self.sanitize_fragment(value) + else: + data[model_field] = value - value = truncate_text(Post, model_field, entry[field]) + content_details = self.get_content_details(entry) - if field == "published_parsed": - data[model_field] = build_publication_date(value, tz) - elif field == "summary": - data[model_field] = self.sanitize_fragment(value) - else: - data[model_field] = value + # use content details key if it contains more information + if not "body" in data or len(data["body"]) < len(content_details): + data["body"] = content_details - if "content" in entry: - content = self.get_content(entry["content"]) - body = data.get("body", "") + return Post(**data) - if not body or len(body) < len(content): - data["body"] = content + def get_content_details(self, entry): + content_items = entry.get("content") - instances.append(Post(**data)) + if not content_items: + return "" - self.instances = duplicate_handler.check(instances) - - def get_content(self, items): - content = "\n ".join([item.get("value") for item in items]) - return self.sanitize_fragment(content) + content_details = "\n ".join([item.get("value") for item in content_items]) + return self.sanitize_fragment(content_details) class FeedStream(PostStream): diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py index daeb85f..1fbffe2 100644 --- a/src/newsreader/news/collection/reddit.py +++ b/src/newsreader/news/collection/reddit.py @@ -28,6 +28,10 @@ from newsreader.news.collection.constants import ( WHITELISTED_TAGS, ) from newsreader.news.collection.exceptions import ( + BuilderDuplicateException, + BuilderException, + BuilderMissingDataException, + BuilderParseException, StreamDeniedException, StreamException, StreamParseException, @@ -122,99 +126,136 @@ class RedditBuilder(PostBuilder): if not "data" in self.payload or not "children" in self.payload["data"]: return - posts = self.payload["data"]["children"] - rule = self.stream.rule - - for post in posts: - if not "data" in post or post["kind"] != REDDIT_POST: - continue - - data = post["data"] - - remote_identifier = data["id"] - title = truncate_text(Post, "title", data["title"]) - author = truncate_text(Post, "author", data["author"]) - post_url_fragment = data["permalink"] - direct_url = data["url"] - is_text_post = data["is_self"] - - if remote_identifier in results: - continue - - if is_text_post: - uncleaned_body = data["selftext_html"] - unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" - body = self.sanitize_fragment(unescaped_body) if unescaped_body else "" - elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): - body = format_html( - "