0.2.3 #99
5 changed files with 211 additions and 858 deletions
28
src/newsreader/news/collection/constants.py
Normal file
28
src/newsreader/news/collection/constants.py
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
from bleach.sanitizer import ALLOWED_ATTRIBUTES as BLEACH_ATTRIBUTES
|
||||||
|
from bleach.sanitizer import ALLOWED_TAGS as BLEACH_TAGS
|
||||||
|
|
||||||
|
|
||||||
|
# Tags we allow on top of bleach's defaults; order matches the original
# definition so the resulting tuple value is unchanged.
_EXTRA_TAGS = (
    "h1",
    "h2",
    "h3",
    "article",
    "p",
    "img",
    "figure",
    "small",
    "picture",
    "b",
    "video",
    "source",
    "div",
    "body",
)

# Complete whitelist of HTML tags that survive sanitization
# (bleach's defaults plus our additions).
WHITELISTED_TAGS = (*BLEACH_TAGS, *_EXTRA_TAGS)
|
||||||
|
|
||||||
|
# Per-tag attribute whitelist: bleach's defaults, with our own entries for
# links, images and <source> elements overriding/extending them.
WHITELISTED_ATTRIBUTES = dict(BLEACH_ATTRIBUTES)
WHITELISTED_ATTRIBUTES.update(
    {
        "a": ["href", "rel"],
        "img": ["alt", "src"],
        "source": ["srcset", "media", "src", "type"],
    }
)
|
||||||
|
|
@ -11,6 +11,7 @@ import pytz
|
||||||
from feedparser import parse
|
from feedparser import parse
|
||||||
|
|
||||||
from newsreader.news.collection.base import Builder, Client, Collector, Stream
|
from newsreader.news.collection.base import Builder, Client, Collector, Stream
|
||||||
|
from newsreader.news.collection.constants import WHITELISTED_ATTRIBUTES, WHITELISTED_TAGS
|
||||||
from newsreader.news.collection.exceptions import (
|
from newsreader.news.collection.exceptions import (
|
||||||
StreamDeniedException,
|
StreamDeniedException,
|
||||||
StreamException,
|
StreamException,
|
||||||
|
|
@ -66,25 +67,40 @@ class FeedBuilder(Builder):
|
||||||
data = {"rule_id": rule.pk}
|
data = {"rule_id": rule.pk}
|
||||||
|
|
||||||
for field, model_field in field_mapping.items():
|
for field, model_field in field_mapping.items():
|
||||||
if field in entry:
|
if not field in entry:
|
||||||
|
continue
|
||||||
|
|
||||||
value = self.truncate_text(model_field, entry[field])
|
value = self.truncate_text(model_field, entry[field])
|
||||||
|
|
||||||
if field == "published_parsed":
|
if field == "published_parsed":
|
||||||
aware_datetime, created = build_publication_date(value, tz)
|
aware_datetime, created = build_publication_date(value, tz)
|
||||||
data[model_field] = aware_datetime if created else None
|
data[model_field] = aware_datetime if created else None
|
||||||
elif field == "summary":
|
elif field == "summary":
|
||||||
summary = self.sanitize_summary(value)
|
summary = self.sanitize_fragment(value)
|
||||||
data[model_field] = summary
|
data[model_field] = summary
|
||||||
else:
|
else:
|
||||||
data[model_field] = value
|
data[model_field] = value
|
||||||
|
|
||||||
|
if "content" in entry:
|
||||||
|
content = self.get_content(entry["content"])
|
||||||
|
body = data.get("body", "")
|
||||||
|
|
||||||
|
if not body or len(body) < len(content):
|
||||||
|
data["body"] = content
|
||||||
|
|
||||||
yield Post(**data)
|
yield Post(**data)
|
||||||
|
|
||||||
def sanitize_fragment(self, fragment: str) -> str:
    """Strip disallowed HTML from *fragment*.

    Only tags in WHITELISTED_TAGS and attributes in WHITELISTED_ATTRIBUTES
    survive; everything else (including comments) is stripped rather than
    escaped.

    Returns an empty string for falsy input (``None`` or ``""``), so the
    return type is always ``str`` — the previous ``Optional[str]``
    annotation was wrong, since no code path returns ``None``.
    """
    if not fragment:
        return ""

    # strip=True removes disallowed tags entirely instead of HTML-escaping
    # them; strip_comments=True drops <!-- ... --> blocks.
    return bleach.clean(
        fragment,
        tags=WHITELISTED_TAGS,
        attributes=WHITELISTED_ATTRIBUTES,
        strip=True,
        strip_comments=True,
    )
|
||||||
|
|
||||||
def truncate_text(self, field_name, value):
|
def truncate_text(self, field_name, value):
|
||||||
field = Post._meta.get_field(field_name)
|
field = Post._meta.get_field(field_name)
|
||||||
|
|
@ -101,6 +117,10 @@ class FeedBuilder(Builder):
|
||||||
|
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
def get_content(self, items: List) -> str:
    """Join the ``value`` of each feedparser content item and sanitize it.

    Items are joined with ``"\n "`` (newline + space). An item whose
    ``value`` key is missing or ``None`` is treated as an empty string —
    previously ``item.get("value")`` could return ``None`` and make
    ``str.join`` raise a TypeError.
    """
    content = "\n ".join([item.get("value") or "" for item in items])
    return self.sanitize_fragment(content)
|
||||||
|
|
||||||
def save(self) -> None:
    """Persist every built Post instance held in ``self.instances``."""
    for instance in self.instances:
        instance.save()
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,11 @@ html_summary = """
|
||||||
<article>
|
<article>
|
||||||
<h1>This is clickbait</h1>
|
<h1>This is clickbait</h1>
|
||||||
<p>This <strong>is</strong> <a href="https://www.bbc.com" media="durp">clickbait</a></p>
|
<p>This <strong>is</strong> <a href="https://www.bbc.com" media="durp">clickbait</a></p>
|
||||||
|
<iframe src="https://somesketchysite.com/hacks.js"></iframe>
|
||||||
</article>
|
</article>
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
|
<script href="https://somesketchysite.com/hacks.js"></script>
|
||||||
|
<script>console.log("durp");</script>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -282,14 +282,16 @@ class FeedBuilderTestCase(TestCase):
|
||||||
|
|
||||||
self.assertEquals(Post.objects.count(), 1)
|
self.assertEquals(Post.objects.count(), 1)
|
||||||
|
|
||||||
self.assertTrue("<html>" not in post.body)
|
self.assertTrue("<article>" in post.body)
|
||||||
self.assertTrue("<body>" not in post.body)
|
self.assertTrue("<h1>" in post.body)
|
||||||
self.assertTrue("<article>" not in post.body)
|
self.assertTrue("<strong>" in post.body)
|
||||||
self.assertTrue("<h1>" not in post.body)
|
|
||||||
self.assertTrue("<strong>" not in post.body)
|
|
||||||
self.assertTrue('<a href="https://www.bbc.com">' in post.body)
|
self.assertTrue('<a href="https://www.bbc.com">' in post.body)
|
||||||
self.assertTrue("<p>" in post.body)
|
self.assertTrue("<p>" in post.body)
|
||||||
|
|
||||||
|
self.assertTrue("<html>" not in post.body)
|
||||||
|
self.assertTrue("<script>" not in post.body)
|
||||||
|
self.assertTrue("<iframe>" not in post.body)
|
||||||
|
|
||||||
def test_long_author_text_is_truncated(self):
|
def test_long_author_text_is_truncated(self):
|
||||||
builder = FeedBuilder
|
builder = FeedBuilder
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
|
|
@ -317,3 +319,46 @@ class FeedBuilderTestCase(TestCase):
|
||||||
self.assertEquals(Post.objects.count(), 1)
|
self.assertEquals(Post.objects.count(), 1)
|
||||||
|
|
||||||
self.assertEquals(len(post.title), 200)
|
self.assertEquals(len(post.title), 200)
|
||||||
|
|
||||||
|
def test_content_detail_is_prioritized_if_longer(self):
    """The longer content:detail body wins over the shorter summary body."""
    builder = FeedBuilder
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with builder((mock_with_longer_content_detail, mock_stream)) as builder:
        builder.save()

    post = Post.objects.get()

    # assertEqual/assertIn/assertNotIn instead of the deprecated
    # assertEquals alias and assertTrue(x in y): same checks, but with
    # useful failure messages.
    self.assertEqual(Post.objects.count(), 1)

    self.assertNotIn("Foreign Minister Mohammad Javad Zarif says the US", post.body)
    self.assertIn("Federal Communications Commission", post.body)
|
def test_content_detail_is_not_prioritized_if_shorter(self):
    """The summary body is kept when the content:detail entry is shorter."""
    builder = FeedBuilder
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with builder((mock_with_shorter_content_detail, mock_stream)) as builder:
        builder.save()

    post = Post.objects.get()

    # assertEqual/assertIn instead of the deprecated assertEquals alias and
    # assertTrue(x in y): same checks, clearer failure output.
    self.assertEqual(Post.objects.count(), 1)

    self.assertIn("Foreign Minister Mohammad Javad Zarif says the US", post.body)
||||||
|
def test_content_detail_is_concatinated(self):
    """Multiple content:detail entries are joined with newline + space."""
    # NOTE(review): method name has a typo ("concatinated") but is kept to
    # avoid churning test IDs in CI history.
    builder = FeedBuilder
    rule = CollectionRuleFactory()
    mock_stream = MagicMock(rule=rule)

    with builder((mock_with_multiple_content_detail, mock_stream)) as builder:
        builder.save()

    post = Post.objects.get()

    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(Post.objects.count(), 1)

    self.assertEqual(post.body, "Yippie\n Ya\n Yee")
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue