0.2.3 #99

Merged
sonny merged 112 commits from development into master 2020-05-23 16:58:42 +02:00
5 changed files with 211 additions and 858 deletions
Showing only changes of commit c13f968234 - Show all commits

View file

@ -0,0 +1,28 @@
from bleach.sanitizer import ALLOWED_ATTRIBUTES as BLEACH_ATTRIBUTES
from bleach.sanitizer import ALLOWED_TAGS as BLEACH_TAGS
# Tag allow-list: every entry of bleach's ALLOWED_TAGS, followed by the
# project's own additions (order preserved).
WHITELISTED_TAGS = tuple(BLEACH_TAGS) + (
    "h1", "h2", "h3",
    "article", "p",
    "img", "figure",
    "small",
    "picture",
    "b",
    "video", "source",
    "div", "body",
)
# Per-tag attribute allow-list: a copy of bleach's ALLOWED_ATTRIBUTES with
# the entries below set on top (an entry for the same tag is replaced).
WHITELISTED_ATTRIBUTES = dict(
    BLEACH_ATTRIBUTES,
    a=["href", "rel"],
    img=["alt", "src"],
    source=["srcset", "media", "src", "type"],
)

View file

@ -11,6 +11,7 @@ import pytz
from feedparser import parse from feedparser import parse
from newsreader.news.collection.base import Builder, Client, Collector, Stream from newsreader.news.collection.base import Builder, Client, Collector, Stream
from newsreader.news.collection.constants import WHITELISTED_ATTRIBUTES, WHITELISTED_TAGS
from newsreader.news.collection.exceptions import ( from newsreader.news.collection.exceptions import (
StreamDeniedException, StreamDeniedException,
StreamException, StreamException,
@ -66,25 +67,40 @@ class FeedBuilder(Builder):
data = {"rule_id": rule.pk} data = {"rule_id": rule.pk}
for field, model_field in field_mapping.items(): for field, model_field in field_mapping.items():
if field in entry: if not field in entry:
continue
value = self.truncate_text(model_field, entry[field]) value = self.truncate_text(model_field, entry[field])
if field == "published_parsed": if field == "published_parsed":
aware_datetime, created = build_publication_date(value, tz) aware_datetime, created = build_publication_date(value, tz)
data[model_field] = aware_datetime if created else None data[model_field] = aware_datetime if created else None
elif field == "summary": elif field == "summary":
summary = self.sanitize_summary(value) summary = self.sanitize_fragment(value)
data[model_field] = summary data[model_field] = summary
else: else:
data[model_field] = value data[model_field] = value
if "content" in entry:
content = self.get_content(entry["content"])
body = data.get("body", "")
if not body or len(body) < len(content):
data["body"] = content
yield Post(**data) yield Post(**data)
def sanitize_summary(self, summary: str) -> Optional[str]: def sanitize_fragment(self, fragment: str) -> Optional[str]:
attrs = {"a": ["href", "rel"], "img": ["alt", "src"]} if not fragment:
tags = ["a", "img", "p"] return ""
return bleach.clean(summary, tags=tags, attributes=attrs) if summary else None return bleach.clean(
fragment,
tags=WHITELISTED_TAGS,
attributes=WHITELISTED_ATTRIBUTES,
strip=True,
strip_comments=True,
)
def truncate_text(self, field_name, value): def truncate_text(self, field_name, value):
field = Post._meta.get_field(field_name) field = Post._meta.get_field(field_name)
@ -101,6 +117,10 @@ class FeedBuilder(Builder):
return value return value
def get_content(self, items: List) -> str:
    """Join the ``value`` of each content item and sanitize the result.

    Each element of ``items`` is read with ``item.get("value")``. The
    values are joined with a newline followed by a single space and the
    joined text is passed through ``self.sanitize_fragment``, whose
    return value is returned unchanged.

    Items that carry no ``value`` (``get`` returns ``None``) are skipped;
    joining ``None`` would otherwise raise ``TypeError``.
    """
    values = (item.get("value") for item in items)
    content = "\n ".join(value for value in values if value is not None)
    return self.sanitize_fragment(content)
def save(self) -> None: def save(self) -> None:
for post in self.instances: for post in self.instances:
post.save() post.save()

View file

@ -4,7 +4,11 @@ html_summary = """
<article> <article>
<h1>This is clickbait</h1> <h1>This is clickbait</h1>
<p>This <strong>is</strong> <a href="https://www.bbc.com" media="durp">clickbait</a></p> <p>This <strong>is</strong> <a href="https://www.bbc.com" media="durp">clickbait</a></p>
<iframe src="https://somesketchysite.com/hacks.js"></iframe>
</article> </article>
</body> </body>
<script href="https://somesketchysite.com/hacks.js"></script>
<script>console.log("durp");</script>
</html> </html>
""" """

File diff suppressed because it is too large Load diff

View file

@ -282,14 +282,16 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(Post.objects.count(), 1) self.assertEquals(Post.objects.count(), 1)
self.assertTrue("<html>" not in post.body) self.assertTrue("<article>" in post.body)
self.assertTrue("<body>" not in post.body) self.assertTrue("<h1>" in post.body)
self.assertTrue("<article>" not in post.body) self.assertTrue("<strong>" in post.body)
self.assertTrue("<h1>" not in post.body)
self.assertTrue("<strong>" not in post.body)
self.assertTrue('<a href="https://www.bbc.com">' in post.body) self.assertTrue('<a href="https://www.bbc.com">' in post.body)
self.assertTrue("<p>" in post.body) self.assertTrue("<p>" in post.body)
self.assertTrue("<html>" not in post.body)
self.assertTrue("<script>" not in post.body)
self.assertTrue("<iframe>" not in post.body)
def test_long_author_text_is_truncated(self): def test_long_author_text_is_truncated(self):
builder = FeedBuilder builder = FeedBuilder
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
@ -317,3 +319,46 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(Post.objects.count(), 1) self.assertEquals(Post.objects.count(), 1)
self.assertEquals(len(post.title), 200) self.assertEquals(len(post.title), 200)
def test_content_detail_is_prioritized_if_longer(self):
    """Build from ``mock_with_longer_content_detail`` and check the body.

    Exactly one post must be saved; its body must contain the
    "Federal Communications Commission" text and must not contain the
    "Foreign Minister Mohammad Javad Zarif says the US" text.
    """
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with FeedBuilder((mock_with_longer_content_detail, mock_stream)) as builder:
        builder.save()

    post = Post.objects.get()

    # assertEqual/assertIn instead of the removed assertEquals alias and
    # assertTrue(x in y): same checks, readable failure output.
    self.assertEqual(Post.objects.count(), 1)
    self.assertNotIn(
        "Foreign Minister Mohammad Javad Zarif says the US", post.body
    )
    self.assertIn("Federal Communications Commission", post.body)
def test_content_detail_is_not_prioritized_if_shorter(self):
    """Build from ``mock_with_shorter_content_detail`` and check the body.

    Exactly one post must be saved and its body must still contain the
    "Foreign Minister Mohammad Javad Zarif says the US" text.
    """
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with FeedBuilder((mock_with_shorter_content_detail, mock_stream)) as builder:
        builder.save()

    post = Post.objects.get()

    # assertEqual/assertIn instead of the removed assertEquals alias and
    # assertTrue(x in y): same checks, readable failure output.
    self.assertEqual(Post.objects.count(), 1)
    self.assertIn(
        "Foreign Minister Mohammad Javad Zarif says the US", post.body
    )
def test_content_detail_is_concatinated(self):
    """Build from ``mock_with_multiple_content_detail`` and check the body.

    Exactly one post must be saved and its body must equal the three
    content values joined by a newline followed by a space.
    """
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with FeedBuilder((mock_with_multiple_content_detail, mock_stream)) as builder:
        builder.save()

    post = Post.objects.get()

    # assertEqual replaces the assertEquals alias, which newer Python
    # versions no longer provide.
    self.assertEqual(Post.objects.count(), 1)
    self.assertEqual(post.body, "Yippie\n Ya\n Yee")