0.2.3 #99
5 changed files with 211 additions and 858 deletions
28
src/newsreader/news/collection/constants.py
Normal file
28
src/newsreader/news/collection/constants.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
from bleach.sanitizer import ALLOWED_ATTRIBUTES as BLEACH_ATTRIBUTES
|
||||
from bleach.sanitizer import ALLOWED_TAGS as BLEACH_TAGS
|
||||
|
||||
|
||||
# Tag whitelist handed to bleach: bleach's own default tags followed by the
# extra tags listed here.
WHITELISTED_TAGS = tuple(BLEACH_TAGS) + (
    "h1",
    "h2",
    "h3",
    "article",
    "p",
    "img",
    "figure",
    "small",
    "picture",
    "b",
    "video",
    "source",
    "div",
    "body",
)

# Per-tag attribute whitelist: start from bleach's defaults, then apply the
# entries below, which replace any default entry for the same tag.
WHITELISTED_ATTRIBUTES = dict(BLEACH_ATTRIBUTES)
WHITELISTED_ATTRIBUTES.update(
    {
        "a": ["href", "rel"],
        "img": ["alt", "src"],
        "source": ["srcset", "media", "src", "type"],
    }
)
|
||||
|
|
@ -11,6 +11,7 @@ import pytz
|
|||
from feedparser import parse
|
||||
|
||||
from newsreader.news.collection.base import Builder, Client, Collector, Stream
|
||||
from newsreader.news.collection.constants import WHITELISTED_ATTRIBUTES, WHITELISTED_TAGS
|
||||
from newsreader.news.collection.exceptions import (
|
||||
StreamDeniedException,
|
||||
StreamException,
|
||||
|
|
@ -66,25 +67,40 @@ class FeedBuilder(Builder):
|
|||
data = {"rule_id": rule.pk}
|
||||
|
||||
for field, model_field in field_mapping.items():
|
||||
if field in entry:
|
||||
value = self.truncate_text(model_field, entry[field])
|
||||
if not field in entry:
|
||||
continue
|
||||
|
||||
if field == "published_parsed":
|
||||
aware_datetime, created = build_publication_date(value, tz)
|
||||
data[model_field] = aware_datetime if created else None
|
||||
elif field == "summary":
|
||||
summary = self.sanitize_summary(value)
|
||||
data[model_field] = summary
|
||||
else:
|
||||
data[model_field] = value
|
||||
value = self.truncate_text(model_field, entry[field])
|
||||
|
||||
if field == "published_parsed":
|
||||
aware_datetime, created = build_publication_date(value, tz)
|
||||
data[model_field] = aware_datetime if created else None
|
||||
elif field == "summary":
|
||||
summary = self.sanitize_fragment(value)
|
||||
data[model_field] = summary
|
||||
else:
|
||||
data[model_field] = value
|
||||
|
||||
if "content" in entry:
|
||||
content = self.get_content(entry["content"])
|
||||
body = data.get("body", "")
|
||||
|
||||
if not body or len(body) < len(content):
|
||||
data["body"] = content
|
||||
|
||||
yield Post(**data)
|
||||
|
||||
def sanitize_summary(self, summary: str) -> Optional[str]:
    """Clean *summary* with a small fixed whitelist.

    Only ``a``, ``img`` and ``p`` tags (with ``href``/``rel`` and
    ``alt``/``src`` attributes respectively) are passed to ``bleach.clean``
    as allowed.

    Returns:
        The cleaned markup, or ``None`` when *summary* is empty or ``None``.
    """
    attrs = {"a": ["href", "rel"], "img": ["alt", "src"]}
    tags = ["a", "img", "p"]

    return bleach.clean(summary, tags=tags, attributes=attrs) if summary else None

def sanitize_fragment(self, fragment: Optional[str]) -> str:
    """Clean an HTML fragment using the shared module-level whitelists.

    Args:
        fragment: Markup to clean; may be empty or ``None``.

    Returns:
        An empty string when *fragment* is falsy, otherwise the output of
        ``bleach.clean`` restricted to ``WHITELISTED_TAGS`` and
        ``WHITELISTED_ATTRIBUTES``. This method never returns ``None``,
        hence the ``str`` return annotation.
    """
    if not fragment:
        return ""

    return bleach.clean(
        fragment,
        tags=WHITELISTED_TAGS,
        attributes=WHITELISTED_ATTRIBUTES,
        # Per bleach's documentation: strip=True removes disallowed markup
        # instead of escaping it, strip_comments=True drops HTML comments.
        strip=True,
        strip_comments=True,
    )
|
||||
|
||||
def truncate_text(self, field_name, value):
|
||||
field = Post._meta.get_field(field_name)
|
||||
|
|
@ -101,6 +117,10 @@ class FeedBuilder(Builder):
|
|||
|
||||
return value
|
||||
|
||||
def get_content(self, items: List) -> str:
    """Join the ``value`` of every content item and sanitize the result.

    Args:
        items: Content mappings; each item's ``"value"`` entry is read
            with ``.get``.

    Returns:
        The values joined by a newline followed by a single space, passed
        through ``self.sanitize_fragment``.
    """
    # An item without a "value" yields None, which would make str.join raise
    # TypeError; skip those items instead of failing the whole entry.
    values = (item.get("value") for item in items)
    content = "\n ".join(value for value in values if value is not None)
    return self.sanitize_fragment(content)
|
||||
|
||||
def save(self) -> None:
|
||||
for post in self.instances:
|
||||
post.save()
|
||||
|
|
|
|||
|
|
@ -2,9 +2,13 @@ html_summary = """
|
|||
<html>
|
||||
<body>
|
||||
<article>
|
||||
<h1>This is clickbait</h1>
|
||||
<p>This <strong>is</strong> <a href="https://www.bbc.com" media="durp">clickbait</a></p>
|
||||
<h1>This is clickbait</h1>
|
||||
<p>This <strong>is</strong> <a href="https://www.bbc.com" media="durp">clickbait</a></p>
|
||||
<iframe src="https://somesketchysite.com/hacks.js"></iframe>
|
||||
</article>
|
||||
</body>
|
||||
|
||||
<script href="https://somesketchysite.com/hacks.js"></script>
|
||||
<script>console.log("durp");</script>
|
||||
</html>
|
||||
"""
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -282,14 +282,16 @@ class FeedBuilderTestCase(TestCase):
|
|||
|
||||
self.assertEquals(Post.objects.count(), 1)
|
||||
|
||||
self.assertTrue("<html>" not in post.body)
|
||||
self.assertTrue("<body>" not in post.body)
|
||||
self.assertTrue("<article>" not in post.body)
|
||||
self.assertTrue("<h1>" not in post.body)
|
||||
self.assertTrue("<strong>" not in post.body)
|
||||
self.assertTrue("<article>" in post.body)
|
||||
self.assertTrue("<h1>" in post.body)
|
||||
self.assertTrue("<strong>" in post.body)
|
||||
self.assertTrue('<a href="https://www.bbc.com">' in post.body)
|
||||
self.assertTrue("<p>" in post.body)
|
||||
|
||||
self.assertTrue("<html>" not in post.body)
|
||||
self.assertTrue("<script>" not in post.body)
|
||||
self.assertTrue("<iframe>" not in post.body)
|
||||
|
||||
def test_long_author_text_is_truncated(self):
|
||||
builder = FeedBuilder
|
||||
rule = CollectionRuleFactory()
|
||||
|
|
@ -317,3 +319,46 @@ class FeedBuilderTestCase(TestCase):
|
|||
self.assertEquals(Post.objects.count(), 1)
|
||||
|
||||
self.assertEquals(len(post.title), 200)
|
||||
|
||||
def test_content_detail_is_prioritized_if_longer(self):
    """Longer content detail should end up in ``post.body``."""
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with FeedBuilder((mock_with_longer_content_detail, mock_stream)) as builder:
        builder.save()

    # Assert the count before fetching: Post.objects.get() itself raises
    # unless exactly one row exists, which would hide this assertion.
    self.assertEqual(Post.objects.count(), 1)

    post = Post.objects.get()

    self.assertNotIn("Foreign Minister Mohammad Javad Zarif says the US", post.body)
    self.assertIn("Federal Communications Commission", post.body)
|
||||
|
||||
def test_content_detail_is_not_prioritized_if_shorter(self):
    """Shorter content detail should not replace the existing body text."""
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with FeedBuilder((mock_with_shorter_content_detail, mock_stream)) as builder:
        builder.save()

    # Assert the count before fetching: Post.objects.get() itself raises
    # unless exactly one row exists, which would hide this assertion.
    self.assertEqual(Post.objects.count(), 1)

    post = Post.objects.get()

    self.assertIn("Foreign Minister Mohammad Javad Zarif says the US", post.body)
|
||||
|
||||
def test_content_detail_is_concatinated(self):
    """Multiple content details should be joined into a single body."""
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with FeedBuilder((mock_with_multiple_content_detail, mock_stream)) as builder:
        builder.save()

    # Assert the count before fetching: Post.objects.get() itself raises
    # unless exactly one row exists, which would hide this assertion.
    self.assertEqual(Post.objects.count(), 1)

    post = Post.objects.get()

    self.assertEqual(post.body, "Yippie\n Ya\n Yee")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue