Add last builder test
This commit is contained in:
parent
b48efbbffb
commit
6be8862a7d
7 changed files with 180 additions and 54 deletions
|
|
@ -1,5 +1,11 @@
|
|||
import bleach
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from newsreader.news.collection.constants import (
|
||||
WHITELISTED_ATTRIBUTES,
|
||||
WHITELISTED_TAGS,
|
||||
)
|
||||
from newsreader.news.collection.exceptions import StreamParseException
|
||||
from newsreader.news.collection.utils import fetch
|
||||
from newsreader.news.core.models import Post
|
||||
|
|
@ -79,6 +85,18 @@ class Builder:
|
|||
def create_posts(self, stream):
|
||||
raise NotImplementedError
|
||||
|
||||
def sanitize_fragment(self, fragment):
|
||||
if not fragment:
|
||||
return ""
|
||||
|
||||
return bleach.clean(
|
||||
fragment,
|
||||
tags=WHITELISTED_TAGS,
|
||||
attributes=WHITELISTED_ATTRIBUTES,
|
||||
strip=True,
|
||||
strip_comments=True,
|
||||
)
|
||||
|
||||
def save(self):
|
||||
for post in self.instances:
|
||||
post.save()
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ WHITELISTED_TAGS = (
|
|||
WHITELISTED_ATTRIBUTES = {
|
||||
**BLEACH_ATTRIBUTES,
|
||||
"a": ["href", "rel"],
|
||||
"img": ["alt", "src"],
|
||||
"source": ["srcset", "media", "src", "type"],
|
||||
"img": ["alt", "src", "loading"],
|
||||
"video": ["controls", "muted"],
|
||||
"source": ["srcset", "src", "media", "type"],
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,17 +6,12 @@ from datetime import timedelta
|
|||
from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
|
||||
from django.utils import timezone
|
||||
|
||||
import bleach
|
||||
import pytz
|
||||
|
||||
from feedparser import parse
|
||||
|
||||
from newsreader.news.collection.base import Builder, Client, Collector, Stream
|
||||
from newsreader.news.collection.choices import RuleTypeChoices
|
||||
from newsreader.news.collection.constants import (
|
||||
WHITELISTED_ATTRIBUTES,
|
||||
WHITELISTED_TAGS,
|
||||
)
|
||||
from newsreader.news.collection.exceptions import (
|
||||
StreamDeniedException,
|
||||
StreamException,
|
||||
|
|
@ -85,18 +80,6 @@ class FeedBuilder(Builder):
|
|||
|
||||
yield Post(**data)
|
||||
|
||||
def sanitize_fragment(self, fragment):
|
||||
if not fragment:
|
||||
return ""
|
||||
|
||||
return bleach.clean(
|
||||
fragment,
|
||||
tags=WHITELISTED_TAGS,
|
||||
attributes=WHITELISTED_ATTRIBUTES,
|
||||
strip=True,
|
||||
strip_comments=True,
|
||||
)
|
||||
|
||||
def get_content(self, items):
|
||||
content = "\n ".join([item.get("value") for item in items])
|
||||
return self.sanitize_fragment(content)
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ from django.core.cache import cache
|
|||
from django.utils import timezone
|
||||
from django.utils.html import format_html
|
||||
|
||||
import bleach
|
||||
import pytz
|
||||
import requests
|
||||
|
||||
|
|
@ -128,17 +127,7 @@ class RedditBuilder(Builder):
|
|||
if is_text_post:
|
||||
uncleaned_body = data["selftext_html"]
|
||||
unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
|
||||
body = (
|
||||
bleach.clean(
|
||||
unescaped_body,
|
||||
tags=WHITELISTED_TAGS,
|
||||
attributes=WHITELISTED_ATTRIBUTES,
|
||||
strip=True,
|
||||
strip_comments=True,
|
||||
)
|
||||
if unescaped_body
|
||||
else ""
|
||||
)
|
||||
body = self.sanitize_fragment(unescaped_body) if unescaped_body else ""
|
||||
elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
|
||||
body = format_html(
|
||||
"<div><img alt='{title}' src='{url}' loading='lazy' /></div>",
|
||||
|
|
|
|||
|
|
@ -2080,3 +2080,108 @@ gif_mock = [
|
|||
},
|
||||
},
|
||||
]
|
||||
|
||||
unsanitized_mock = [
|
||||
{
|
||||
"contributors": None,
|
||||
"coordinates": None,
|
||||
"created_at": "Fri Aug 07 00:17:05 +0000 2020",
|
||||
"display_text_range": [11, 59],
|
||||
"entities": {
|
||||
"hashtags": [],
|
||||
"symbols": [],
|
||||
"urls": [
|
||||
{
|
||||
"display_url": "youtu.be/rDy7tPf6CT8",
|
||||
"expanded_url": "https://youtu.be/rDy7tPf6CT8",
|
||||
"indices": [36, 59],
|
||||
"url": "https://t.co/trAcIxBMlX",
|
||||
}
|
||||
],
|
||||
"user_mentions": [
|
||||
{
|
||||
"id": 975844884606275587,
|
||||
"id_str": "975844884606275587",
|
||||
"indices": [0, 10],
|
||||
"name": "ArieNeo",
|
||||
"screen_name": "ArieNeoSC",
|
||||
}
|
||||
],
|
||||
},
|
||||
"favorite_count": 19,
|
||||
"favorited": False,
|
||||
"full_text": "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX <script></script><article></article>",
|
||||
"geo": None,
|
||||
"id": 1291528756373286914,
|
||||
"id_str": "1291528756373286914",
|
||||
"in_reply_to_screen_name": "ArieNeoSC",
|
||||
"in_reply_to_status_id": 1291507356313038850,
|
||||
"in_reply_to_status_id_str": "1291507356313038850",
|
||||
"in_reply_to_user_id": 975844884606275587,
|
||||
"in_reply_to_user_id_str": "975844884606275587",
|
||||
"is_quote_status": False,
|
||||
"lang": "en",
|
||||
"place": None,
|
||||
"possibly_sensitive": False,
|
||||
"retweet_count": 5,
|
||||
"retweeted": False,
|
||||
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
|
||||
"truncated": False,
|
||||
"user": {
|
||||
"contributors_enabled": False,
|
||||
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
|
||||
"default_profile": False,
|
||||
"default_profile_image": False,
|
||||
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
|
||||
"entities": {
|
||||
"description": {"urls": []},
|
||||
"url": {
|
||||
"urls": [
|
||||
{
|
||||
"display_url": "robertsspaceindustries.com",
|
||||
"expanded_url": "http://www.robertsspaceindustries.com",
|
||||
"indices": [0, 23],
|
||||
"url": "https://t.co/iqO6apof3y",
|
||||
}
|
||||
]
|
||||
},
|
||||
},
|
||||
"favourites_count": 4588,
|
||||
"follow_request_sent": None,
|
||||
"followers_count": 106169,
|
||||
"following": None,
|
||||
"friends_count": 201,
|
||||
"geo_enabled": False,
|
||||
"has_extended_profile": False,
|
||||
"id": 803542770,
|
||||
"id_str": "803542770",
|
||||
"is_translation_enabled": False,
|
||||
"is_translator": False,
|
||||
"lang": None,
|
||||
"listed_count": 890,
|
||||
"location": "Roberts Space Industries",
|
||||
"name": "Star Citizen",
|
||||
"notifications": None,
|
||||
"profile_background_color": "131516",
|
||||
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||
"profile_background_tile": False,
|
||||
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
|
||||
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||
"profile_link_color": "0A5485",
|
||||
"profile_sidebar_border_color": "FFFFFF",
|
||||
"profile_sidebar_fill_color": "EFEFEF",
|
||||
"profile_text_color": "333333",
|
||||
"profile_use_background_image": True,
|
||||
"protected": False,
|
||||
"screen_name": "RobertsSpaceInd",
|
||||
"statuses_count": 6210,
|
||||
"time_zone": None,
|
||||
"translator_type": "none",
|
||||
"url": "https://t.co/iqO6apof3y",
|
||||
"utc_offset": None,
|
||||
"verified": True,
|
||||
},
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
from datetime import datetime
|
||||
from unittest import skip
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from django.template.defaultfilters import truncatechars
|
||||
from django.test import TestCase
|
||||
from django.utils.html import format_html
|
||||
|
||||
|
|
@ -17,10 +15,12 @@ from newsreader.news.collection.tests.twitter.builder.mocks import (
|
|||
quoted_mock,
|
||||
retweet_mock,
|
||||
simple_mock,
|
||||
unsanitized_mock,
|
||||
video_mock,
|
||||
video_without_bitrate_mock,
|
||||
)
|
||||
from newsreader.news.collection.twitter import TWITTER_URL, TwitterBuilder
|
||||
from newsreader.news.collection.utils import truncate_text
|
||||
from newsreader.news.core.models import Post
|
||||
|
||||
|
||||
|
|
@ -48,7 +48,7 @@ class TwitterBuilderTestCase(TestCase):
|
|||
full_text = "@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX"
|
||||
|
||||
self.assertEquals(post.rule, profile)
|
||||
self.assertEquals(post.title, truncatechars(full_text, 40))
|
||||
self.assertEquals(post.title, truncate_text(Post, "title", full_text))
|
||||
self.assertEquals(post.body, format_html(full_text))
|
||||
|
||||
self.assertEquals(post.author, "RobertsSpaceInd")
|
||||
|
|
@ -64,7 +64,7 @@ class TwitterBuilderTestCase(TestCase):
|
|||
full_text = "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing."
|
||||
|
||||
self.assertEquals(post.rule, profile)
|
||||
self.assertEquals(post.title, truncatechars(full_text, 40))
|
||||
self.assertEquals(post.title, truncate_text(Post, "title", full_text))
|
||||
self.assertEquals(post.body, format_html(full_text))
|
||||
|
||||
self.assertEquals(post.author, "RobertsSpaceInd")
|
||||
|
|
@ -107,12 +107,12 @@ class TwitterBuilderTestCase(TestCase):
|
|||
|
||||
self.assertIn(full_text, post.body)
|
||||
self.assertInHTML(
|
||||
"""<div><img alt="1269039233072689152" src="https://pbs.twimg.com/media/EZyIdXVU8AACPCz.jpg" loading="lazy" /></div>""",
|
||||
"""<div><img alt="1269039233072689152" src="https://pbs.twimg.com/media/EZyIdXVU8AACPCz.jpg" loading="lazy"></div>""",
|
||||
post.body,
|
||||
count=1,
|
||||
)
|
||||
self.assertInHTML(
|
||||
"""<div><img alt="1269039233068527618" src="https://pbs.twimg.com/media/EZyIdXUVcAI3Cju.jpg" loading="lazy" /></div>""",
|
||||
"""<div><img alt="1269039233068527618" src="https://pbs.twimg.com/media/EZyIdXUVcAI3Cju.jpg" loading="lazy"></div>""",
|
||||
post.body,
|
||||
count=1,
|
||||
)
|
||||
|
|
@ -142,7 +142,7 @@ class TwitterBuilderTestCase(TestCase):
|
|||
)
|
||||
|
||||
self.assertEquals(post.rule, profile)
|
||||
self.assertEquals(post.title, truncatechars(full_text, 40))
|
||||
self.assertEquals(post.title, truncate_text(Post, "title", full_text))
|
||||
|
||||
self.assertEquals(post.author, "RobertsSpaceInd")
|
||||
self.assertEquals(
|
||||
|
|
@ -154,7 +154,7 @@ class TwitterBuilderTestCase(TestCase):
|
|||
|
||||
self.assertIn(full_text, post.body)
|
||||
self.assertInHTML(
|
||||
"""<div><video controls muted><source src="https://video.twimg.com/amplify_video/1291074294747770880/vid/1280x720/J05_p6q74ZUN4csg.mp4?tag=13" type="video/mp4" /></video></div>""",
|
||||
"""<div><video controls muted=""><source src="https://video.twimg.com/amplify_video/1291074294747770880/vid/1280x720/J05_p6q74ZUN4csg.mp4?tag=13" type="video/mp4" /></video></div>""",
|
||||
post.body,
|
||||
count=1,
|
||||
)
|
||||
|
|
@ -175,7 +175,7 @@ class TwitterBuilderTestCase(TestCase):
|
|||
post = posts["1291080532361527296"]
|
||||
|
||||
self.assertInHTML(
|
||||
"""<div><video controls muted><source src="https://video.twimg.com/amplify_video/1291074294747770880/pl/kMYgFEoRyoW99o-i.m3u8?tag=13" type="application/x-mpegURL" /></video></div>""",
|
||||
"""<div><video controls muted=""><source src="https://video.twimg.com/amplify_video/1291074294747770880/pl/kMYgFEoRyoW99o-i.m3u8?tag=13" type="application/x-mpegURL"></video></div>""",
|
||||
post.body,
|
||||
count=1,
|
||||
)
|
||||
|
|
@ -198,7 +198,7 @@ class TwitterBuilderTestCase(TestCase):
|
|||
post = posts["1289337776140296193"]
|
||||
|
||||
self.assertInHTML(
|
||||
"""<div><video controls muted><source src="https://video.twimg.com/tweet_video/EeSl3sPUcAAyE4J.mp4" type="video/mp4" /></video></div>""",
|
||||
"""<div><video controls muted=""><source src="https://video.twimg.com/tweet_video/EeSl3sPUcAAyE4J.mp4" type="video/mp4"></video></div>""",
|
||||
post.body,
|
||||
count=1,
|
||||
)
|
||||
|
|
@ -270,18 +270,43 @@ class TwitterBuilderTestCase(TestCase):
|
|||
post.body,
|
||||
)
|
||||
|
||||
@skip("Not implemented")
|
||||
def test_empty_data(self):
|
||||
pass
|
||||
builder = TwitterBuilder
|
||||
|
||||
@skip("Not implemented")
|
||||
def test_update_posts(self):
|
||||
pass
|
||||
profile = TwitterProfileFactory(screen_name="RobertsSpaceInd")
|
||||
mock_stream = MagicMock(rule=profile)
|
||||
|
||||
with builder(([], mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
self.assertEquals(Post.objects.count(), 0)
|
||||
|
||||
@skip("Not implemented")
|
||||
def test_html_sanitizing(self):
|
||||
pass
|
||||
builder = TwitterBuilder
|
||||
|
||||
@skip("Not implemented")
|
||||
def test_duplicate_in_data(self):
|
||||
pass
|
||||
profile = TwitterProfileFactory(screen_name="RobertsSpaceInd")
|
||||
mock_stream = MagicMock(rule=profile)
|
||||
|
||||
with builder((unsanitized_mock, mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
posts = {post.remote_identifier: post for post in Post.objects.all()}
|
||||
|
||||
self.assertCountEqual(("1291528756373286914",), posts.keys())
|
||||
|
||||
post = posts["1291528756373286914"]
|
||||
|
||||
full_text = (
|
||||
"@ArieNeoSC Here you go, goodnight!\n\nhttps://t.co/trAcIxBMlX"
|
||||
" <article></article>"
|
||||
)
|
||||
|
||||
self.assertEquals(post.rule, profile)
|
||||
self.assertEquals(post.title, truncate_text(Post, "title", full_text))
|
||||
self.assertEquals(post.body, format_html(full_text))
|
||||
|
||||
self.assertInHTML("<script></script>", post.body, count=0)
|
||||
self.assertInHTML("<article></article>", post.body, count=1)
|
||||
|
||||
self.assertInHTML("<script></script>", post.title, count=0)
|
||||
self.assertInHTML("<article></article>", post.title, count=1)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ import logging
|
|||
|
||||
from datetime import datetime
|
||||
|
||||
from django.template.defaultfilters import truncatechars
|
||||
from django.utils.html import format_html
|
||||
|
||||
import pytz
|
||||
|
|
@ -11,6 +10,7 @@ from ftfy import fix_text
|
|||
|
||||
from newsreader.news.collection.base import Builder, Client, Collector, Stream
|
||||
from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices
|
||||
from newsreader.news.collection.utils import truncate_text
|
||||
from newsreader.news.core.models import Post
|
||||
|
||||
|
||||
|
|
@ -41,10 +41,13 @@ class TwitterBuilder(Builder):
|
|||
for post in posts:
|
||||
remote_identifier = post["id_str"]
|
||||
url = f"{TWITTER_URL}/{rule.screen_name}/{remote_identifier}"
|
||||
|
||||
body = post["full_text"]
|
||||
title = truncate_text(Post, "title", self.sanitize_fragment(body))
|
||||
|
||||
publication_date = pytz.utc.localize(
|
||||
datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
|
||||
)
|
||||
body = post["full_text"]
|
||||
|
||||
if "extended_entities" in post:
|
||||
try:
|
||||
|
|
@ -66,9 +69,11 @@ class TwitterBuilder(Builder):
|
|||
original_post=original_post["full_text"],
|
||||
)
|
||||
|
||||
body = self.sanitize_fragment(body)
|
||||
|
||||
data = {
|
||||
"remote_identifier": remote_identifier,
|
||||
"title": fix_text(truncatechars(post["full_text"], 40)),
|
||||
"title": fix_text(title),
|
||||
"body": fix_text(body),
|
||||
"author": rule.screen_name,
|
||||
"publication_date": publication_date,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue