diff --git a/src/newsreader/news/collection/base.py b/src/newsreader/news/collection/base.py index 5de3454..df122a6 100644 --- a/src/newsreader/news/collection/base.py +++ b/src/newsreader/news/collection/base.py @@ -1,5 +1,11 @@ +import bleach + from bs4 import BeautifulSoup +from newsreader.news.collection.constants import ( + WHITELISTED_ATTRIBUTES, + WHITELISTED_TAGS, +) from newsreader.news.collection.exceptions import StreamParseException from newsreader.news.collection.utils import fetch from newsreader.news.core.models import Post @@ -79,6 +85,18 @@ class Builder: def create_posts(self, stream): raise NotImplementedError + def sanitize_fragment(self, fragment): + if not fragment: + return "" + + return bleach.clean( + fragment, + tags=WHITELISTED_TAGS, + attributes=WHITELISTED_ATTRIBUTES, + strip=True, + strip_comments=True, + ) + def save(self): for post in self.instances: post.save() diff --git a/src/newsreader/news/collection/constants.py b/src/newsreader/news/collection/constants.py index eade898..0c73642 100644 --- a/src/newsreader/news/collection/constants.py +++ b/src/newsreader/news/collection/constants.py @@ -23,6 +23,7 @@ WHITELISTED_TAGS = ( WHITELISTED_ATTRIBUTES = { **BLEACH_ATTRIBUTES, "a": ["href", "rel"], - "img": ["alt", "src"], - "source": ["srcset", "media", "src", "type"], + "img": ["alt", "src", "loading"], + "video": ["controls", "muted"], + "source": ["srcset", "src", "media", "type"], } diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py index cb5618f..ff28666 100644 --- a/src/newsreader/news/collection/feed.py +++ b/src/newsreader/news/collection/feed.py @@ -6,17 +6,12 @@ from datetime import timedelta from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist from django.utils import timezone -import bleach import pytz from feedparser import parse from newsreader.news.collection.base import Builder, Client, Collector, Stream from newsreader.news.collection.choices import RuleTypeChoices -from newsreader.news.collection.constants import ( - WHITELISTED_ATTRIBUTES, - WHITELISTED_TAGS, -) from newsreader.news.collection.exceptions import ( StreamDeniedException, StreamException, @@ -85,18 +80,6 @@ class FeedBuilder(Builder): yield Post(**data) - def sanitize_fragment(self, fragment): - if not fragment: - return "" - - return bleach.clean( - fragment, - tags=WHITELISTED_TAGS, - attributes=WHITELISTED_ATTRIBUTES, - strip=True, - strip_comments=True, - ) - def get_content(self, items): content = "\n ".join([item.get("value") for item in items]) return self.sanitize_fragment(content) diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py index 7ef4784..65ce384 100644 --- a/src/newsreader/news/collection/reddit.py +++ b/src/newsreader/news/collection/reddit.py @@ -12,7 +12,6 @@ from django.core.cache import cache from django.utils import timezone from django.utils.html import format_html -import bleach import pytz import requests @@ -128,17 +127,7 @@ class RedditBuilder(Builder): if is_text_post: uncleaned_body = data["selftext_html"] unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" - body = ( - bleach.clean( - unescaped_body, - tags=WHITELISTED_TAGS, - attributes=WHITELISTED_ATTRIBUTES, - strip=True, - strip_comments=True, - ) - if unescaped_body - else "" - ) + body = self.sanitize_fragment(unescaped_body) if unescaped_body else "" elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): body = format_html( "
{title}
", diff --git a/src/newsreader/news/collection/tests/twitter/builder/mocks.py b/src/newsreader/news/collection/tests/twitter/builder/mocks.py index 11047b1..b330f2f 100644 --- a/src/newsreader/news/collection/tests/twitter/builder/mocks.py +++ b/src/newsreader/news/collection/tests/twitter/builder/mocks.py @@ -2080,3 +2080,108 @@ gif_mock = [ }, }, ] + +unsanitized_mock = [ + { + "contributors": None, + "coordinates": None, + "created_at": "Fri Aug 07 00:17:05 +0000 2020", + "display_text_range": [11, 59], + "entities": { + "hashtags": [], + "symbols": [], + "urls": [ + { + "display_url": "youtu.be/rDy7tPf6CT8", + "expanded_url": "https://youtu.be/rDy7tPf6CT8", + "indices": [36, 59], + "url": "https://t.co/trAcIxBMlX", + } + ], + "user_mentions": [ + { + "id": 975844884606275587, + "id_str": "975844884606275587", + "indices": [0, 10], + "name": "ArieNeo", + "screen_name": "ArieNeoSC", + } + ], + }, + "favorite_count": 19, + "favorited": False, + "full_text": "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX
", + "geo": None, + "id": 1291528756373286914, + "id_str": "1291528756373286914", + "in_reply_to_screen_name": "ArieNeoSC", + "in_reply_to_status_id": 1291507356313038850, + "in_reply_to_status_id_str": "1291507356313038850", + "in_reply_to_user_id": 975844884606275587, + "in_reply_to_user_id_str": "975844884606275587", + "is_quote_status": False, + "lang": "en", + "place": None, + "possibly_sensitive": False, + "retweet_count": 5, + "retweeted": False, + "source": 'Twitter Web App', + "truncated": False, + "user": { + "contributors_enabled": False, + "created_at": "Wed Sep 05 00:58:11 +0000 2012", + "default_profile": False, + "default_profile_image": False, + "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.", + "entities": { + "description": {"urls": []}, + "url": { + "urls": [ + { + "display_url": "robertsspaceindustries.com", + "expanded_url": "http://www.robertsspaceindustries.com", + "indices": [0, 23], + "url": "https://t.co/iqO6apof3y", + } + ] + }, + }, + "favourites_count": 4588, + "follow_request_sent": None, + "followers_count": 106169, + "following": None, + "friends_count": 201, + "geo_enabled": False, + "has_extended_profile": False, + "id": 803542770, + "id_str": "803542770", + "is_translation_enabled": False, + "is_translator": False, + "lang": None, + "listed_count": 890, + "location": "Roberts Space Industries", + "name": "Star Citizen", + "notifications": None, + "profile_background_color": "131516", + "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif", + "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif", + "profile_background_tile": False, + "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186", + "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg", + "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg", + "profile_link_color": "0A5485", + "profile_sidebar_border_color": "FFFFFF", + "profile_sidebar_fill_color": "EFEFEF", + "profile_text_color": "333333", + "profile_use_background_image": True, + "protected": False, + "screen_name": "RobertsSpaceInd", + "statuses_count": 6210, + "time_zone": None, + "translator_type": "none", + "url": "https://t.co/iqO6apof3y", + "utc_offset": None, + "verified": True, + }, + } +] diff --git a/src/newsreader/news/collection/tests/twitter/builder/tests.py b/src/newsreader/news/collection/tests/twitter/builder/tests.py index 9ba81d4..19fdce3 100644 --- a/src/newsreader/news/collection/tests/twitter/builder/tests.py +++ b/src/newsreader/news/collection/tests/twitter/builder/tests.py @@ -1,8 +1,6 @@ from datetime import datetime -from unittest import skip from unittest.mock import MagicMock -from django.template.defaultfilters import truncatechars from django.test import TestCase from django.utils.html import format_html @@ -17,10 +15,12 @@ from newsreader.news.collection.tests.twitter.builder.mocks import ( quoted_mock, retweet_mock, simple_mock, + unsanitized_mock, video_mock, video_without_bitrate_mock, ) from newsreader.news.collection.twitter import TWITTER_URL, TwitterBuilder +from newsreader.news.collection.utils import truncate_text from newsreader.news.core.models import Post @@ -48,7 +48,7 @@ class TwitterBuilderTestCase(TestCase): full_text = "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX" self.assertEquals(post.rule, profile) - self.assertEquals(post.title, truncatechars(full_text, 40)) + self.assertEquals(post.title, truncate_text(Post, "title", full_text)) self.assertEquals(post.body, format_html(full_text)) self.assertEquals(post.author, "RobertsSpaceInd") @@ -64,7 +64,7 @@ class TwitterBuilderTestCase(TestCase): full_text = "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing." self.assertEquals(post.rule, profile) - self.assertEquals(post.title, truncatechars(full_text, 40)) + self.assertEquals(post.title, truncate_text(Post, "title", full_text)) self.assertEquals(post.body, format_html(full_text)) self.assertEquals(post.author, "RobertsSpaceInd") @@ -107,12 +107,12 @@ class TwitterBuilderTestCase(TestCase): self.assertIn(full_text, post.body) self.assertInHTML( - """
1269039233072689152
""", + """
1269039233072689152
""", post.body, count=1, ) self.assertInHTML( - """
1269039233068527618
""", + """
1269039233068527618
""", post.body, count=1, ) @@ -142,7 +142,7 @@ class TwitterBuilderTestCase(TestCase): ) self.assertEquals(post.rule, profile) - self.assertEquals(post.title, truncatechars(full_text, 40)) + self.assertEquals(post.title, truncate_text(Post, "title", full_text)) self.assertEquals(post.author, "RobertsSpaceInd") self.assertEquals( @@ -154,7 +154,7 @@ class TwitterBuilderTestCase(TestCase): self.assertIn(full_text, post.body) self.assertInHTML( - """
""", + """
""", post.body, count=1, ) @@ -175,7 +175,7 @@ class TwitterBuilderTestCase(TestCase): post = posts["1291080532361527296"] self.assertInHTML( - """
""", + """
""", post.body, count=1, ) @@ -198,7 +198,7 @@ class TwitterBuilderTestCase(TestCase): post = posts["1289337776140296193"] self.assertInHTML( - """
""", + """
""", post.body, count=1, ) @@ -270,18 +270,43 @@ class TwitterBuilderTestCase(TestCase): post.body, ) - @skip("Not implemented") def test_empty_data(self): - pass + builder = TwitterBuilder - @skip("Not implemented") - def test_update_posts(self): - pass + profile = TwitterProfileFactory(screen_name="RobertsSpaceInd") + mock_stream = MagicMock(rule=profile) + + with builder(([], mock_stream)) as builder: + builder.save() + + self.assertEquals(Post.objects.count(), 0) - @skip("Not implemented") def test_html_sanitizing(self): - pass + builder = TwitterBuilder - @skip("Not implemented") - def test_duplicate_in_data(self): - pass + profile = TwitterProfileFactory(screen_name="RobertsSpaceInd") + mock_stream = MagicMock(rule=profile) + + with builder((unsanitized_mock, mock_stream)) as builder: + builder.save() + + posts = {post.remote_identifier: post for post in Post.objects.all()} + + self.assertCountEqual(("1291528756373286914",), posts.keys()) + + post = posts["1291528756373286914"] + + full_text = ( + "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX" + "
" + ) + + self.assertEquals(post.rule, profile) + self.assertEquals(post.title, truncate_text(Post, "title", full_text)) + self.assertEquals(post.body, format_html(full_text)) + + self.assertInHTML("", post.body, count=0) + self.assertInHTML("
", post.body, count=1) + + self.assertInHTML("", post.title, count=0) + self.assertInHTML("
", post.title, count=1) diff --git a/src/newsreader/news/collection/twitter.py b/src/newsreader/news/collection/twitter.py index d8d273b..b0f08cc 100644 --- a/src/newsreader/news/collection/twitter.py +++ b/src/newsreader/news/collection/twitter.py @@ -2,7 +2,6 @@ import logging from datetime import datetime -from django.template.defaultfilters import truncatechars from django.utils.html import format_html import pytz @@ -11,6 +10,7 @@ from ftfy import fix_text from newsreader.news.collection.base import Builder, Client, Collector, Stream from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices +from newsreader.news.collection.utils import truncate_text from newsreader.news.core.models import Post @@ -41,10 +41,13 @@ class TwitterBuilder(Builder): for post in posts: remote_identifier = post["id_str"] url = f"{TWITTER_URL}/{rule.screen_name}/{remote_identifier}" + + body = post["full_text"] + title = truncate_text(Post, "title", self.sanitize_fragment(body)) + publication_date = pytz.utc.localize( datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y") ) - body = post["full_text"] if "extended_entities" in post: try: @@ -66,9 +69,11 @@ class TwitterBuilder(Builder): original_post=original_post["full_text"], ) + body = self.sanitize_fragment(body) + data = { "remote_identifier": remote_identifier, - "title": fix_text(truncatechars(post["full_text"], 40)), + "title": fix_text(title), "body": fix_text(body), "author": rule.screen_name, "publication_date": publication_date,