diff --git a/src/newsreader/news/collection/base.py b/src/newsreader/news/collection/base.py
index 5de3454..df122a6 100644
--- a/src/newsreader/news/collection/base.py
+++ b/src/newsreader/news/collection/base.py
@@ -1,5 +1,11 @@
+import bleach
+
from bs4 import BeautifulSoup
+from newsreader.news.collection.constants import (
+ WHITELISTED_ATTRIBUTES,
+ WHITELISTED_TAGS,
+)
from newsreader.news.collection.exceptions import StreamParseException
from newsreader.news.collection.utils import fetch
from newsreader.news.core.models import Post
@@ -79,6 +85,18 @@ class Builder:
def create_posts(self, stream):
raise NotImplementedError
+ def sanitize_fragment(self, fragment):  # Return `fragment` cleaned by bleach against the WHITELISTED_* constants.
+ if not fragment:  # Any falsy input (e.g. None or "") is normalised to an empty string.
+ return ""
+
+ return bleach.clean(
+ fragment,
+ tags=WHITELISTED_TAGS,
+ attributes=WHITELISTED_ATTRIBUTES,
+ strip=True,  # Per the bleach docs: remove disallowed markup instead of escaping it.
+ strip_comments=True,  # Per the bleach docs: drop HTML comments from the output.
+ )
+
def save(self):
for post in self.instances:
post.save()
diff --git a/src/newsreader/news/collection/constants.py b/src/newsreader/news/collection/constants.py
index eade898..0c73642 100644
--- a/src/newsreader/news/collection/constants.py
+++ b/src/newsreader/news/collection/constants.py
@@ -23,6 +23,7 @@ WHITELISTED_TAGS = (
WHITELISTED_ATTRIBUTES = {
**BLEACH_ATTRIBUTES,
"a": ["href", "rel"],
- "img": ["alt", "src"],
- "source": ["srcset", "media", "src", "type"],
+ "img": ["alt", "src", "loading"],  # "loading" is the HTML image loading hint (e.g. lazy loading).
+ "video": ["controls", "muted"],  # NOTE(review): no "src" listed for <video>; media URLs presumably arrive via <source> - confirm.
+ "source": ["srcset", "src", "media", "type"],  # Attributes permitted on <source> elements.
}
diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py
index cb5618f..ff28666 100644
--- a/src/newsreader/news/collection/feed.py
+++ b/src/newsreader/news/collection/feed.py
@@ -6,17 +6,12 @@ from datetime import timedelta
from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
from django.utils import timezone
-import bleach
import pytz
from feedparser import parse
from newsreader.news.collection.base import Builder, Client, Collector, Stream
from newsreader.news.collection.choices import RuleTypeChoices
-from newsreader.news.collection.constants import (
- WHITELISTED_ATTRIBUTES,
- WHITELISTED_TAGS,
-)
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
@@ -85,18 +80,6 @@ class FeedBuilder(Builder):
yield Post(**data)
- def sanitize_fragment(self, fragment):
- if not fragment:
- return ""
-
- return bleach.clean(
- fragment,
- tags=WHITELISTED_TAGS,
- attributes=WHITELISTED_ATTRIBUTES,
- strip=True,
- strip_comments=True,
- )
-
def get_content(self, items):
content = "\n ".join([item.get("value") for item in items])
return self.sanitize_fragment(content)
diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py
index 7ef4784..65ce384 100644
--- a/src/newsreader/news/collection/reddit.py
+++ b/src/newsreader/news/collection/reddit.py
@@ -12,7 +12,6 @@ from django.core.cache import cache
from django.utils import timezone
from django.utils.html import format_html
-import bleach
import pytz
import requests
@@ -128,17 +127,7 @@ class RedditBuilder(Builder):
if is_text_post:
uncleaned_body = data["selftext_html"]
unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
- body = (
- bleach.clean(
- unescaped_body,
- tags=WHITELISTED_TAGS,
- attributes=WHITELISTED_ATTRIBUTES,
- strip=True,
- strip_comments=True,
- )
- if unescaped_body
- else ""
- )
+ body = self.sanitize_fragment(unescaped_body)  # sanitize_fragment itself returns "" for empty input, so no extra guard is needed.
elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
body = format_html(
"

",
diff --git a/src/newsreader/news/collection/tests/twitter/builder/mocks.py b/src/newsreader/news/collection/tests/twitter/builder/mocks.py
index 11047b1..b330f2f 100644
--- a/src/newsreader/news/collection/tests/twitter/builder/mocks.py
+++ b/src/newsreader/news/collection/tests/twitter/builder/mocks.py
@@ -2080,3 +2080,108 @@ gif_mock = [
},
},
]
+
+unsanitized_mock = [
+ {
+ "contributors": None,
+ "coordinates": None,
+ "created_at": "Fri Aug 07 00:17:05 +0000 2020",
+ "display_text_range": [11, 59],
+ "entities": {
+ "hashtags": [],
+ "symbols": [],
+ "urls": [
+ {
+ "display_url": "youtu.be/rDy7tPf6CT8",
+ "expanded_url": "https://youtu.be/rDy7tPf6CT8",
+ "indices": [36, 59],
+ "url": "https://t.co/trAcIxBMlX",
+ }
+ ],
+ "user_mentions": [
+ {
+ "id": 975844884606275587,
+ "id_str": "975844884606275587",
+ "indices": [0, 10],
+ "name": "ArieNeo",
+ "screen_name": "ArieNeoSC",
+ }
+ ],
+ },
+ "favorite_count": 19,
+ "favorited": False,
+ "full_text": "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX ",
+ "geo": None,
+ "id": 1291528756373286914,
+ "id_str": "1291528756373286914",
+ "in_reply_to_screen_name": "ArieNeoSC",
+ "in_reply_to_status_id": 1291507356313038850,
+ "in_reply_to_status_id_str": "1291507356313038850",
+ "in_reply_to_user_id": 975844884606275587,
+ "in_reply_to_user_id_str": "975844884606275587",
+ "is_quote_status": False,
+ "lang": "en",
+ "place": None,
+ "possibly_sensitive": False,
+ "retweet_count": 5,
+ "retweeted": False,
+ "source": 'Twitter Web App',
+ "truncated": False,
+ "user": {
+ "contributors_enabled": False,
+ "created_at": "Wed Sep 05 00:58:11 +0000 2012",
+ "default_profile": False,
+ "default_profile_image": False,
+ "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
+ "entities": {
+ "description": {"urls": []},
+ "url": {
+ "urls": [
+ {
+ "display_url": "robertsspaceindustries.com",
+ "expanded_url": "http://www.robertsspaceindustries.com",
+ "indices": [0, 23],
+ "url": "https://t.co/iqO6apof3y",
+ }
+ ]
+ },
+ },
+ "favourites_count": 4588,
+ "follow_request_sent": None,
+ "followers_count": 106169,
+ "following": None,
+ "friends_count": 201,
+ "geo_enabled": False,
+ "has_extended_profile": False,
+ "id": 803542770,
+ "id_str": "803542770",
+ "is_translation_enabled": False,
+ "is_translator": False,
+ "lang": None,
+ "listed_count": 890,
+ "location": "Roberts Space Industries",
+ "name": "Star Citizen",
+ "notifications": None,
+ "profile_background_color": "131516",
+ "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
+ "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
+ "profile_background_tile": False,
+ "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
+ "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+ "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+ "profile_link_color": "0A5485",
+ "profile_sidebar_border_color": "FFFFFF",
+ "profile_sidebar_fill_color": "EFEFEF",
+ "profile_text_color": "333333",
+ "profile_use_background_image": True,
+ "protected": False,
+ "screen_name": "RobertsSpaceInd",
+ "statuses_count": 6210,
+ "time_zone": None,
+ "translator_type": "none",
+ "url": "https://t.co/iqO6apof3y",
+ "utc_offset": None,
+ "verified": True,
+ },
+ }
+]
diff --git a/src/newsreader/news/collection/tests/twitter/builder/tests.py b/src/newsreader/news/collection/tests/twitter/builder/tests.py
index 9ba81d4..19fdce3 100644
--- a/src/newsreader/news/collection/tests/twitter/builder/tests.py
+++ b/src/newsreader/news/collection/tests/twitter/builder/tests.py
@@ -1,8 +1,6 @@
from datetime import datetime
-from unittest import skip
from unittest.mock import MagicMock
-from django.template.defaultfilters import truncatechars
from django.test import TestCase
from django.utils.html import format_html
@@ -17,10 +15,12 @@ from newsreader.news.collection.tests.twitter.builder.mocks import (
quoted_mock,
retweet_mock,
simple_mock,
+ unsanitized_mock,
video_mock,
video_without_bitrate_mock,
)
from newsreader.news.collection.twitter import TWITTER_URL, TwitterBuilder
+from newsreader.news.collection.utils import truncate_text
from newsreader.news.core.models import Post
@@ -48,7 +48,7 @@ class TwitterBuilderTestCase(TestCase):
full_text = "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX"
self.assertEquals(post.rule, profile)
- self.assertEquals(post.title, truncatechars(full_text, 40))
+ self.assertEquals(post.title, truncate_text(Post, "title", full_text))
self.assertEquals(post.body, format_html(full_text))
self.assertEquals(post.author, "RobertsSpaceInd")
@@ -64,7 +64,7 @@ class TwitterBuilderTestCase(TestCase):
full_text = "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing."
self.assertEquals(post.rule, profile)
- self.assertEquals(post.title, truncatechars(full_text, 40))
+ self.assertEquals(post.title, truncate_text(Post, "title", full_text))
self.assertEquals(post.body, format_html(full_text))
self.assertEquals(post.author, "RobertsSpaceInd")
@@ -107,12 +107,12 @@ class TwitterBuilderTestCase(TestCase):
self.assertIn(full_text, post.body)
self.assertInHTML(
- """""",
+ """""",
post.body,
count=1,
)
self.assertInHTML(
- """""",
+ """""",
post.body,
count=1,
)
@@ -142,7 +142,7 @@ class TwitterBuilderTestCase(TestCase):
)
self.assertEquals(post.rule, profile)
- self.assertEquals(post.title, truncatechars(full_text, 40))
+ self.assertEquals(post.title, truncate_text(Post, "title", full_text))
self.assertEquals(post.author, "RobertsSpaceInd")
self.assertEquals(
@@ -154,7 +154,7 @@ class TwitterBuilderTestCase(TestCase):
self.assertIn(full_text, post.body)
self.assertInHTML(
- """""",
+ """""",
post.body,
count=1,
)
@@ -175,7 +175,7 @@ class TwitterBuilderTestCase(TestCase):
post = posts["1291080532361527296"]
self.assertInHTML(
- """""",
+ """""",
post.body,
count=1,
)
@@ -198,7 +198,7 @@ class TwitterBuilderTestCase(TestCase):
post = posts["1289337776140296193"]
self.assertInHTML(
- """""",
+ """""",
post.body,
count=1,
)
@@ -270,18 +270,43 @@ class TwitterBuilderTestCase(TestCase):
post.body,
)
- @skip("Not implemented")
def test_empty_data(self):
- pass
+ builder = TwitterBuilder
- @skip("Not implemented")
- def test_update_posts(self):
- pass
+ profile = TwitterProfileFactory(screen_name="RobertsSpaceInd")
+ mock_stream = MagicMock(rule=profile)
+
+ with builder(([], mock_stream)) as builder:  # Feed the builder an empty list of tweets.
+ builder.save()
+
+ self.assertEquals(Post.objects.count(), 0)  # No input tweets, so no Post rows may exist afterwards.
- @skip("Not implemented")
def test_html_sanitizing(self):
- pass
+ builder = TwitterBuilder
- @skip("Not implemented")
- def test_duplicate_in_data(self):
- pass
+ profile = TwitterProfileFactory(screen_name="RobertsSpaceInd")
+ mock_stream = MagicMock(rule=profile)
+
+ with builder((unsanitized_mock, mock_stream)) as builder:  # Build posts from the unsanitized tweet fixture.
+ builder.save()
+
+ posts = {post.remote_identifier: post for post in Post.objects.all()}  # Index saved posts by remote id.
+
+ self.assertCountEqual(("1291528756373286914",), posts.keys())  # Exactly one post, keyed by the fixture's id_str.
+
+ post = posts["1291528756373286914"]
+
+ full_text = (
+ "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX"
+ " "
+ )
+
+ self.assertEquals(post.rule, profile)
+ self.assertEquals(post.title, truncate_text(Post, "title", full_text))  # Title is the truncated form of the expected text.
+ self.assertEquals(post.body, format_html(full_text))
+
+ self.assertInHTML("", post.body, count=0)  # NOTE(review): needle is an empty string and is asserted with both count=0 and count=1 - confirm the intended markup.
+ self.assertInHTML("", post.body, count=1)
+
+ self.assertInHTML("", post.title, count=0)  # NOTE(review): same empty needle with contradictory counts for the title - confirm.
+ self.assertInHTML("", post.title, count=1)
diff --git a/src/newsreader/news/collection/twitter.py b/src/newsreader/news/collection/twitter.py
index d8d273b..b0f08cc 100644
--- a/src/newsreader/news/collection/twitter.py
+++ b/src/newsreader/news/collection/twitter.py
@@ -2,7 +2,6 @@ import logging
from datetime import datetime
-from django.template.defaultfilters import truncatechars
from django.utils.html import format_html
import pytz
@@ -11,6 +10,7 @@ from ftfy import fix_text
from newsreader.news.collection.base import Builder, Client, Collector, Stream
from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices
+from newsreader.news.collection.utils import truncate_text
from newsreader.news.core.models import Post
@@ -41,10 +41,13 @@ class TwitterBuilder(Builder):
for post in posts:
remote_identifier = post["id_str"]
url = f"{TWITTER_URL}/{rule.screen_name}/{remote_identifier}"
+
+ body = post["full_text"]
+ title = truncate_text(Post, "title", self.sanitize_fragment(body))
+
publication_date = pytz.utc.localize(
datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
)
- body = post["full_text"]
if "extended_entities" in post:
try:
@@ -66,9 +69,11 @@ class TwitterBuilder(Builder):
original_post=original_post["full_text"],
)
+ body = self.sanitize_fragment(body)
+
data = {
"remote_identifier": remote_identifier,
- "title": fix_text(truncatechars(post["full_text"], 40)),
+ "title": fix_text(title),
"body": fix_text(body),
"author": rule.screen_name,
"publication_date": publication_date,