diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py index 7af7eb1..c713e17 100644 --- a/src/newsreader/news/collection/reddit.py +++ b/src/newsreader/news/collection/reddit.py @@ -45,6 +45,9 @@ RATE_LIMIT_DURATION = timedelta(seconds=60) REDDIT_IMAGE_EXTENSIONS = (".jpg", ".png", ".gif") REDDIT_VIDEO_EXTENSIONS = (".mp4", ".gifv", ".webm") +# see type prefixes on https://www.reddit.com/dev/api/ +REDDIT_POST = "t3" + def get_reddit_authorization_url(user): state = str(uuid4()) @@ -117,40 +120,42 @@ class RedditBuilder(Builder): results = {} for post in posts: - if not "data" in post: + if not "data" in post or post["kind"] != REDDIT_POST: continue - remote_identifier = post["data"]["id"] - title = truncate_text(Post, "title", post["data"]["title"]) - author = truncate_text(Post, "author", post["data"]["author"]) - post_url_fragment = post["data"]["permalink"] - direct_url = post["data"]["url"] - is_text_post = post["data"]["is_self"] + data = post["data"] + + remote_identifier = data["id"] + title = truncate_text(Post, "title", data["title"]) + author = truncate_text(Post, "author", data["author"]) + post_url_fragment = data["permalink"] + direct_url = data["url"] + is_text_post = data["is_self"] if remote_identifier in results: continue - uncleaned_body = post["data"]["selftext_html"] - unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" - body = ( - bleach.clean( - unescaped_body, - tags=WHITELISTED_TAGS, - attributes=WHITELISTED_ATTRIBUTES, - strip=True, - strip_comments=True, + if is_text_post: + uncleaned_body = data["selftext_html"] + unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" + body = ( + bleach.clean( + unescaped_body, + tags=WHITELISTED_TAGS, + attributes=WHITELISTED_ATTRIBUTES, + strip=True, + strip_comments=True, + ) + if unescaped_body + else "" ) - if unescaped_body - else "" - ) - - if not is_text_post and direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): + elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): body = f"