Add last builder test

This commit is contained in:
Sonny Bakker 2020-09-12 19:23:59 +02:00
parent b48efbbffb
commit 6be8862a7d
7 changed files with 180 additions and 54 deletions

View file

@ -1,5 +1,11 @@
import bleach
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from newsreader.news.collection.constants import (
WHITELISTED_ATTRIBUTES,
WHITELISTED_TAGS,
)
from newsreader.news.collection.exceptions import StreamParseException from newsreader.news.collection.exceptions import StreamParseException
from newsreader.news.collection.utils import fetch from newsreader.news.collection.utils import fetch
from newsreader.news.core.models import Post from newsreader.news.core.models import Post
@ -79,6 +85,18 @@ class Builder:
def create_posts(self, stream): def create_posts(self, stream):
raise NotImplementedError raise NotImplementedError
def sanitize_fragment(self, fragment):
    """Return ``fragment`` cleaned by bleach, or an empty string.

    A falsy ``fragment`` (``None``, ``""``) short-circuits to ``""`` without
    calling bleach. Otherwise the fragment is passed to ``bleach.clean``
    with the project's tag and attribute whitelists, with ``strip`` and
    ``strip_comments`` both enabled.
    """
    if fragment:
        clean_options = {
            "tags": WHITELISTED_TAGS,
            "attributes": WHITELISTED_ATTRIBUTES,
            "strip": True,
            "strip_comments": True,
        }
        return bleach.clean(fragment, **clean_options)

    return ""
def save(self): def save(self):
for post in self.instances: for post in self.instances:
post.save() post.save()

View file

@ -23,6 +23,7 @@ WHITELISTED_TAGS = (
WHITELISTED_ATTRIBUTES = { WHITELISTED_ATTRIBUTES = {
**BLEACH_ATTRIBUTES, **BLEACH_ATTRIBUTES,
"a": ["href", "rel"], "a": ["href", "rel"],
"img": ["alt", "src"], "img": ["alt", "src", "loading"],
"source": ["srcset", "media", "src", "type"], "video": ["controls", "muted"],
"source": ["srcset", "src", "media", "type"],
} }

View file

@ -6,17 +6,12 @@ from datetime import timedelta
from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
from django.utils import timezone from django.utils import timezone
import bleach
import pytz import pytz
from feedparser import parse from feedparser import parse
from newsreader.news.collection.base import Builder, Client, Collector, Stream from newsreader.news.collection.base import Builder, Client, Collector, Stream
from newsreader.news.collection.choices import RuleTypeChoices from newsreader.news.collection.choices import RuleTypeChoices
from newsreader.news.collection.constants import (
WHITELISTED_ATTRIBUTES,
WHITELISTED_TAGS,
)
from newsreader.news.collection.exceptions import ( from newsreader.news.collection.exceptions import (
StreamDeniedException, StreamDeniedException,
StreamException, StreamException,
@ -85,18 +80,6 @@ class FeedBuilder(Builder):
yield Post(**data) yield Post(**data)
def sanitize_fragment(self, fragment):
    """Clean an HTML fragment with bleach using the whitelist constants.

    Returns ``""`` for a falsy ``fragment``; otherwise returns the result
    of ``bleach.clean`` called with ``WHITELISTED_TAGS``,
    ``WHITELISTED_ATTRIBUTES``, ``strip=True`` and ``strip_comments=True``.
    """
    if fragment:
        bleach_kwargs = dict(
            tags=WHITELISTED_TAGS,
            attributes=WHITELISTED_ATTRIBUTES,
            strip=True,
            strip_comments=True,
        )
        return bleach.clean(fragment, **bleach_kwargs)

    return ""
def get_content(self, items): def get_content(self, items):
content = "\n ".join([item.get("value") for item in items]) content = "\n ".join([item.get("value") for item in items])
return self.sanitize_fragment(content) return self.sanitize_fragment(content)

View file

@ -12,7 +12,6 @@ from django.core.cache import cache
from django.utils import timezone from django.utils import timezone
from django.utils.html import format_html from django.utils.html import format_html
import bleach
import pytz import pytz
import requests import requests
@ -128,17 +127,7 @@ class RedditBuilder(Builder):
if is_text_post: if is_text_post:
uncleaned_body = data["selftext_html"] uncleaned_body = data["selftext_html"]
unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
body = ( body = self.sanitize_fragment(unescaped_body) if unescaped_body else ""
bleach.clean(
unescaped_body,
tags=WHITELISTED_TAGS,
attributes=WHITELISTED_ATTRIBUTES,
strip=True,
strip_comments=True,
)
if unescaped_body
else ""
)
elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS): elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
body = format_html( body = format_html(
"<div><img alt='{title}' src='{url}' loading='lazy' /></div>", "<div><img alt='{title}' src='{url}' loading='lazy' /></div>",

View file

@ -2080,3 +2080,108 @@ gif_mock = [
}, },
}, },
] ]
# Fixture: a single tweet-status dict whose "full_text" has raw
# <script> and <article> tags appended to otherwise ordinary text.
# Imported by the Twitter builder tests for the HTML sanitizing case.
unsanitized_mock = [
{
"contributors": None,
"coordinates": None,
"created_at": "Fri Aug 07 00:17:05 +0000 2020",
"display_text_range": [11, 59],
"entities": {
"hashtags": [],
"symbols": [],
"urls": [
{
"display_url": "youtu.be/rDy7tPf6CT8",
"expanded_url": "https://youtu.be/rDy7tPf6CT8",
"indices": [36, 59],
"url": "https://t.co/trAcIxBMlX",
}
],
"user_mentions": [
{
"id": 975844884606275587,
"id_str": "975844884606275587",
"indices": [0, 10],
"name": "ArieNeo",
"screen_name": "ArieNeoSC",
}
],
},
"favorite_count": 19,
"favorited": False,
# The trailing <script></script><article></article> is the markup under test.
"full_text": "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX <script></script><article></article>",
"geo": None,
# "id_str" below is the value the test looks the saved post up by.
"id": 1291528756373286914,
"id_str": "1291528756373286914",
"in_reply_to_screen_name": "ArieNeoSC",
"in_reply_to_status_id": 1291507356313038850,
"in_reply_to_status_id_str": "1291507356313038850",
"in_reply_to_user_id": 975844884606275587,
"in_reply_to_user_id_str": "975844884606275587",
"is_quote_status": False,
"lang": "en",
"place": None,
"possibly_sensitive": False,
"retweet_count": 5,
"retweeted": False,
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
"truncated": False,
"user": {
"contributors_enabled": False,
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
"default_profile": False,
"default_profile_image": False,
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
"entities": {
"description": {"urls": []},
"url": {
"urls": [
{
"display_url": "robertsspaceindustries.com",
"expanded_url": "http://www.robertsspaceindustries.com",
"indices": [0, 23],
"url": "https://t.co/iqO6apof3y",
}
]
},
},
"favourites_count": 4588,
"follow_request_sent": None,
"followers_count": 106169,
"following": None,
"friends_count": 201,
"geo_enabled": False,
"has_extended_profile": False,
"id": 803542770,
"id_str": "803542770",
"is_translation_enabled": False,
"is_translator": False,
"lang": None,
"listed_count": 890,
"location": "Roberts Space Industries",
"name": "Star Citizen",
"notifications": None,
"profile_background_color": "131516",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
"profile_background_tile": False,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
"profile_link_color": "0A5485",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "EFEFEF",
"profile_text_color": "333333",
"profile_use_background_image": True,
"protected": False,
"screen_name": "RobertsSpaceInd",
"statuses_count": 6210,
"time_zone": None,
"translator_type": "none",
"url": "https://t.co/iqO6apof3y",
"utc_offset": None,
"verified": True,
},
}
]

View file

@ -1,8 +1,6 @@
from datetime import datetime from datetime import datetime
from unittest import skip
from unittest.mock import MagicMock from unittest.mock import MagicMock
from django.template.defaultfilters import truncatechars
from django.test import TestCase from django.test import TestCase
from django.utils.html import format_html from django.utils.html import format_html
@ -17,10 +15,12 @@ from newsreader.news.collection.tests.twitter.builder.mocks import (
quoted_mock, quoted_mock,
retweet_mock, retweet_mock,
simple_mock, simple_mock,
unsanitized_mock,
video_mock, video_mock,
video_without_bitrate_mock, video_without_bitrate_mock,
) )
from newsreader.news.collection.twitter import TWITTER_URL, TwitterBuilder from newsreader.news.collection.twitter import TWITTER_URL, TwitterBuilder
from newsreader.news.collection.utils import truncate_text
from newsreader.news.core.models import Post from newsreader.news.core.models import Post
@ -48,7 +48,7 @@ class TwitterBuilderTestCase(TestCase):
full_text = "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX" full_text = "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX"
self.assertEquals(post.rule, profile) self.assertEquals(post.rule, profile)
self.assertEquals(post.title, truncatechars(full_text, 40)) self.assertEquals(post.title, truncate_text(Post, "title", full_text))
self.assertEquals(post.body, format_html(full_text)) self.assertEquals(post.body, format_html(full_text))
self.assertEquals(post.author, "RobertsSpaceInd") self.assertEquals(post.author, "RobertsSpaceInd")
@ -64,7 +64,7 @@ class TwitterBuilderTestCase(TestCase):
full_text = "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing." full_text = "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing."
self.assertEquals(post.rule, profile) self.assertEquals(post.rule, profile)
self.assertEquals(post.title, truncatechars(full_text, 40)) self.assertEquals(post.title, truncate_text(Post, "title", full_text))
self.assertEquals(post.body, format_html(full_text)) self.assertEquals(post.body, format_html(full_text))
self.assertEquals(post.author, "RobertsSpaceInd") self.assertEquals(post.author, "RobertsSpaceInd")
@ -107,12 +107,12 @@ class TwitterBuilderTestCase(TestCase):
self.assertIn(full_text, post.body) self.assertIn(full_text, post.body)
self.assertInHTML( self.assertInHTML(
"""<div><img alt="1269039233072689152" src="https://pbs.twimg.com/media/EZyIdXVU8AACPCz.jpg" loading="lazy" /></div>""", """<div><img alt="1269039233072689152" src="https://pbs.twimg.com/media/EZyIdXVU8AACPCz.jpg" loading="lazy"></div>""",
post.body, post.body,
count=1, count=1,
) )
self.assertInHTML( self.assertInHTML(
"""<div><img alt="1269039233068527618" src="https://pbs.twimg.com/media/EZyIdXUVcAI3Cju.jpg" loading="lazy" /></div>""", """<div><img alt="1269039233068527618" src="https://pbs.twimg.com/media/EZyIdXUVcAI3Cju.jpg" loading="lazy"></div>""",
post.body, post.body,
count=1, count=1,
) )
@ -142,7 +142,7 @@ class TwitterBuilderTestCase(TestCase):
) )
self.assertEquals(post.rule, profile) self.assertEquals(post.rule, profile)
self.assertEquals(post.title, truncatechars(full_text, 40)) self.assertEquals(post.title, truncate_text(Post, "title", full_text))
self.assertEquals(post.author, "RobertsSpaceInd") self.assertEquals(post.author, "RobertsSpaceInd")
self.assertEquals( self.assertEquals(
@ -154,7 +154,7 @@ class TwitterBuilderTestCase(TestCase):
self.assertIn(full_text, post.body) self.assertIn(full_text, post.body)
self.assertInHTML( self.assertInHTML(
"""<div><video controls muted><source src="https://video.twimg.com/amplify_video/1291074294747770880/vid/1280x720/J05_p6q74ZUN4csg.mp4?tag=13" type="video/mp4" /></video></div>""", """<div><video controls muted=""><source src="https://video.twimg.com/amplify_video/1291074294747770880/vid/1280x720/J05_p6q74ZUN4csg.mp4?tag=13" type="video/mp4" /></video></div>""",
post.body, post.body,
count=1, count=1,
) )
@ -175,7 +175,7 @@ class TwitterBuilderTestCase(TestCase):
post = posts["1291080532361527296"] post = posts["1291080532361527296"]
self.assertInHTML( self.assertInHTML(
"""<div><video controls muted><source src="https://video.twimg.com/amplify_video/1291074294747770880/pl/kMYgFEoRyoW99o-i.m3u8?tag=13" type="application/x-mpegURL" /></video></div>""", """<div><video controls muted=""><source src="https://video.twimg.com/amplify_video/1291074294747770880/pl/kMYgFEoRyoW99o-i.m3u8?tag=13" type="application/x-mpegURL"></video></div>""",
post.body, post.body,
count=1, count=1,
) )
@ -198,7 +198,7 @@ class TwitterBuilderTestCase(TestCase):
post = posts["1289337776140296193"] post = posts["1289337776140296193"]
self.assertInHTML( self.assertInHTML(
"""<div><video controls muted><source src="https://video.twimg.com/tweet_video/EeSl3sPUcAAyE4J.mp4" type="video/mp4" /></video></div>""", """<div><video controls muted=""><source src="https://video.twimg.com/tweet_video/EeSl3sPUcAAyE4J.mp4" type="video/mp4"></video></div>""",
post.body, post.body,
count=1, count=1,
) )
@ -270,18 +270,43 @@ class TwitterBuilderTestCase(TestCase):
post.body, post.body,
) )
@skip("Not implemented")
def test_empty_data(self): def test_empty_data(self):
pass builder = TwitterBuilder
@skip("Not implemented") profile = TwitterProfileFactory(screen_name="RobertsSpaceInd")
def test_update_posts(self): mock_stream = MagicMock(rule=profile)
pass
with builder(([], mock_stream)) as builder:
builder.save()
self.assertEquals(Post.objects.count(), 0)
@skip("Not implemented")
def test_html_sanitizing(self): def test_html_sanitizing(self):
pass builder = TwitterBuilder
@skip("Not implemented") profile = TwitterProfileFactory(screen_name="RobertsSpaceInd")
def test_duplicate_in_data(self): mock_stream = MagicMock(rule=profile)
pass
with builder((unsanitized_mock, mock_stream)) as builder:
builder.save()
posts = {post.remote_identifier: post for post in Post.objects.all()}
self.assertCountEqual(("1291528756373286914",), posts.keys())
post = posts["1291528756373286914"]
full_text = (
"@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX"
" <article></article>"
)
self.assertEquals(post.rule, profile)
self.assertEquals(post.title, truncate_text(Post, "title", full_text))
self.assertEquals(post.body, format_html(full_text))
self.assertInHTML("<script></script>", post.body, count=0)
self.assertInHTML("<article></article>", post.body, count=1)
self.assertInHTML("<script></script>", post.title, count=0)
self.assertInHTML("<article></article>", post.title, count=1)

View file

@ -2,7 +2,6 @@ import logging
from datetime import datetime from datetime import datetime
from django.template.defaultfilters import truncatechars
from django.utils.html import format_html from django.utils.html import format_html
import pytz import pytz
@ -11,6 +10,7 @@ from ftfy import fix_text
from newsreader.news.collection.base import Builder, Client, Collector, Stream from newsreader.news.collection.base import Builder, Client, Collector, Stream
from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices
from newsreader.news.collection.utils import truncate_text
from newsreader.news.core.models import Post from newsreader.news.core.models import Post
@ -41,10 +41,13 @@ class TwitterBuilder(Builder):
for post in posts: for post in posts:
remote_identifier = post["id_str"] remote_identifier = post["id_str"]
url = f"{TWITTER_URL}/{rule.screen_name}/{remote_identifier}" url = f"{TWITTER_URL}/{rule.screen_name}/{remote_identifier}"
body = post["full_text"]
title = truncate_text(Post, "title", self.sanitize_fragment(body))
publication_date = pytz.utc.localize( publication_date = pytz.utc.localize(
datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y") datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
) )
body = post["full_text"]
if "extended_entities" in post: if "extended_entities" in post:
try: try:
@ -66,9 +69,11 @@ class TwitterBuilder(Builder):
original_post=original_post["full_text"], original_post=original_post["full_text"],
) )
body = self.sanitize_fragment(body)
data = { data = {
"remote_identifier": remote_identifier, "remote_identifier": remote_identifier,
"title": fix_text(truncatechars(post["full_text"], 40)), "title": fix_text(title),
"body": fix_text(body), "body": fix_text(body),
"author": rule.screen_name, "author": rule.screen_name,
"publication_date": publication_date, "publication_date": publication_date,