- Update static configuration
- Builder refactor
- Fix for images stretching too far
This commit is contained in:
Sonny Bakker 2020-10-17 13:19:49 +02:00
parent b6921a20e7
commit 9e5e05c056
15 changed files with 568 additions and 341 deletions

View file

@ -3427,7 +3427,6 @@
"is_active": true,
"date_joined": "2019-07-18T18:52:36.080Z",
"email": "sonny@bakker.nl",
"task": 10,
"reddit_refresh_token": null,
"reddit_access_token": null,
"groups": [],

View file

@ -0,0 +1,16 @@
from newsreader.news.collection.exceptions.builder import (
BuilderDuplicateException,
BuilderException,
BuilderMissingDataException,
BuilderParseException,
)
from newsreader.news.collection.exceptions.stream import (
StreamConnectionException,
StreamDeniedException,
StreamException,
StreamForbiddenException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
StreamTooManyException,
)

View file

@ -0,0 +1,21 @@
class BuilderException(Exception):
    """
    Base class for errors raised while building posts from a payload.

    Args:
        payload: The raw entry that could not be built, kept for debugging.
        message: Optional text overriding the class-level default ``message``.
    """

    message = "Builder exception"

    def __init__(self, payload=None, message=None):
        self.payload = payload
        self.message = message if message else self.message

        # Hand the resolved message to Exception so ``args`` and ``repr()``
        # are populated even when only keyword arguments were passed.
        super().__init__(self.message)

    def __str__(self):
        return self.message


class BuilderMissingDataException(BuilderException):
    """Signals that the payload lacks data needed to build a post."""

    message = "Payload contains missing data"


class BuilderDuplicateException(BuilderException):
    """Signals that the payload describes an entry that is a duplicate."""

    message = "Payload contains duplicate entry"


class BuilderParseException(BuilderException):
    """Signals that the payload could not be parsed."""

    message = "Failed to parse payload"

View file

@ -39,6 +39,18 @@ class FeedBuilder(PostBuilder):
rule__type = RuleTypeChoices.feed
def build(self):
    """
    Build a post for every entry in the payload and store the result of the
    duplicate handler's check on ``self.instances``.
    """
    with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
        # A payload without an "entries" key simply yields no posts
        posts = [
            self.build_post(entry) for entry in self.payload.get("entries", [])
        ]

        self.instances = duplicate_handler.check(posts)
def build_post(self, entry):
field_mapping = {
"id": "remote_identifier",
"title": "title",
@ -48,41 +60,37 @@ class FeedBuilder(PostBuilder):
"author": "author",
}
tz = pytz.timezone(self.stream.rule.timezone)
instances = []
data = {"rule_id": self.stream.rule.pk}
with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
entries = self.payload.get("entries", [])
for field, model_field in field_mapping.items():
if not field in entry:
continue
for entry in entries:
data = {"rule_id": self.stream.rule.pk}
value = truncate_text(Post, model_field, entry[field])
for field, model_field in field_mapping.items():
if not field in entry:
continue
if field == "published_parsed":
data[model_field] = build_publication_date(value, tz)
elif field == "summary":
data[model_field] = self.sanitize_fragment(value)
else:
data[model_field] = value
value = truncate_text(Post, model_field, entry[field])
content_details = self.get_content_details(entry)
if field == "published_parsed":
data[model_field] = build_publication_date(value, tz)
elif field == "summary":
data[model_field] = self.sanitize_fragment(value)
else:
data[model_field] = value
# use content details key if it contains more information
if not "body" in data or len(data["body"]) < len(content_details):
data["body"] = content_details
if "content" in entry:
content = self.get_content(entry["content"])
body = data.get("body", "")
return Post(**data)
if not body or len(body) < len(content):
data["body"] = content
def get_content_details(self, entry):
content_items = entry.get("content")
instances.append(Post(**data))
if not content_items:
return ""
self.instances = duplicate_handler.check(instances)
def get_content(self, items):
content = "\n ".join([item.get("value") for item in items])
return self.sanitize_fragment(content)
content_details = "\n ".join([item.get("value") for item in content_items])
return self.sanitize_fragment(content_details)
class FeedStream(PostStream):

View file

@ -28,6 +28,10 @@ from newsreader.news.collection.constants import (
WHITELISTED_TAGS,
)
from newsreader.news.collection.exceptions import (
BuilderDuplicateException,
BuilderException,
BuilderMissingDataException,
BuilderParseException,
StreamDeniedException,
StreamException,
StreamParseException,
@ -122,99 +126,136 @@ class RedditBuilder(PostBuilder):
if not "data" in self.payload or not "children" in self.payload["data"]:
return
posts = self.payload["data"]["children"]
rule = self.stream.rule
for post in posts:
if not "data" in post or post["kind"] != REDDIT_POST:
continue
data = post["data"]
remote_identifier = data["id"]
title = truncate_text(Post, "title", data["title"])
author = truncate_text(Post, "author", data["author"])
post_url_fragment = data["permalink"]
direct_url = data["url"]
is_text_post = data["is_self"]
if remote_identifier in results:
continue
if is_text_post:
uncleaned_body = data["selftext_html"]
unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
body = self.sanitize_fragment(unescaped_body) if unescaped_body else ""
elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
body = format_html(
"<div><img alt='{title}' src='{url}' loading='lazy' /></div>",
url=direct_url,
title=title,
)
elif data["is_video"]:
video_info = data["secure_media"]["reddit_video"]
body = format_html(
"<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
url=video_info["fallback_url"],
)
elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
extension = next(
extension.replace(".", "")
for extension in REDDIT_VIDEO_EXTENSIONS
if direct_url.endswith(extension)
)
if extension == "gifv":
body = format_html(
"<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
url=direct_url.replace(extension, "mp4"),
)
else:
body = format_html(
"<div><video controls muted><source src='{url}' type='video/{extension}' /></video></div>",
url=direct_url,
extension=extension,
)
else:
body = format_html(
"<div><a target='_blank' rel='noopener noreferrer' alt='{title}' href='{url}' class='link'>Direct url</a></div>",
url=direct_url,
title=title,
)
entries = self.payload["data"]["children"]
for entry in entries:
try:
parsed_date = datetime.fromtimestamp(post["data"]["created_utc"])
created_date = pytz.utc.localize(parsed_date)
except (OverflowError, OSError):
logging.warning(
f"Failed parsing timestamp from {REDDIT_URL}{post_url_fragment}"
)
created_date = timezone.now()
post_data = {
"remote_identifier": remote_identifier,
"title": title,
"body": body,
"author": author,
"url": f"{REDDIT_URL}{post_url_fragment}",
"publication_date": created_date,
"rule": rule,
}
if remote_identifier in self.existing_posts:
existing_post = self.existing_posts[remote_identifier]
for key, value in post_data.items():
setattr(existing_post, key, value)
results[existing_post.remote_identifier] = existing_post
post = self.build_post(entry)
except BuilderException:
logger.exception("Failed building post")
continue
results[remote_identifier] = Post(**post_data)
identifier = post.remote_identifier
results[identifier] = post
self.instances = results.values()
def build_post(self, entry):
    """
    Build an unsaved ``Post`` from a single entry of a reddit listing.

    Raises:
        BuilderDuplicateException: the entry's id is in ``self.existing_posts``.
        BuilderParseException: the entry is not of kind ``REDDIT_POST`` or its
            ``created_utc`` value cannot be converted to a date.
        BuilderMissingDataException: the entry has no data or lacks one of
            the keys read below.
    """
    rule = self.stream.rule
    entry_data = entry.get("data", {})
    remote_identifier = entry_data.get("id", "")
    kind = entry.get("kind")

    if remote_identifier in self.existing_posts:
        raise BuilderDuplicateException(payload=entry)
    elif kind != REDDIT_POST:
        raise BuilderParseException(
            message=f"Payload is not an reddit post, its of kind {kind}",
            payload=entry,
        )
    elif not entry_data:
        raise BuilderMissingDataException(
            message=f"Post {remote_identifier} did not contain any data",
            payload=entry,
        )

    try:
        title = entry_data["title"]
        author = entry_data["author"]

        post_url_fragment = entry_data["permalink"]
        direct_url = entry_data["url"]

        is_text = entry_data["is_self"]
        is_video = entry_data["is_video"]
    except KeyError as e:
        raise BuilderMissingDataException(payload=entry) from e

    title = truncate_text(Post, "title", title)
    author = truncate_text(Post, "author", author)

    # The first matching branch decides how the body is rendered
    if is_text:
        body = self.get_text_post(entry_data)
    elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
        body = self.get_image_post(title, direct_url)
    elif is_video:
        body = self.get_native_video_post(entry_data)
    elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
        body = self.get_video_post(direct_url)
    else:
        body = self.get_url_post(title, direct_url)

    try:
        # Convert the epoch value straight to an aware UTC datetime. A naive
        # fromtimestamp() call yields *local* time, which would be off by the
        # UTC offset once labelled as UTC on a non-UTC host.
        created_date = datetime.fromtimestamp(
            entry_data["created_utc"], tz=pytz.utc
        )
    except (OverflowError, OSError, ValueError, TypeError) as e:
        # ValueError/TypeError cover out-of-range and non-numeric timestamps
        raise BuilderParseException(payload=entry) from e
    except KeyError as e:
        raise BuilderMissingDataException(payload=entry) from e

    post_entry = {
        "remote_identifier": remote_identifier,
        "title": title,
        "body": body,
        "author": author,
        "url": f"{REDDIT_URL}{post_url_fragment}",
        "publication_date": created_date,
        "rule": rule,
    }

    return Post(**post_entry)
def get_text_post(self, entry):
    """
    Return the sanitized body of a text post, or an empty string when the
    post has no ``selftext_html`` content.

    Raises ``BuilderMissingDataException`` when the key is absent altogether.
    """
    try:
        raw_html = entry["selftext_html"]
    except KeyError as e:
        raise BuilderMissingDataException(payload=entry) from e

    if not raw_html:
        return ""

    decoded_html = unescape(raw_html)

    if not decoded_html:
        return ""

    return self.sanitize_fragment(decoded_html)
def get_image_post(self, title, url):
    """
    Return an HTML fragment with an image pointing at ``url`` that uses
    ``title`` as its alt text.
    """
    template = "<div><img alt='{title}' src='{url}' loading='lazy' /></div>"

    return format_html(template, title=title, url=url)
def get_native_video_post(self, entry):
    """
    Return an HTML video fragment for a video described by the entry's
    ``secure_media`` data.

    Raises ``BuilderMissingDataException`` when the fallback url cannot be
    found in the entry.
    """
    try:
        video_url = entry["secure_media"]["reddit_video"]["fallback_url"]
    except (KeyError, TypeError) as e:
        # TypeError covers a null "secure_media" or "reddit_video" value;
        # without it the error would escape the BuilderException handling.
        raise BuilderMissingDataException(payload=entry) from e

    return format_html(
        "<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
        url=video_url,
    )
def get_video_post(self, url):
    """
    Return an HTML video fragment for a url ending in one of
    ``REDDIT_VIDEO_EXTENSIONS``.

    Urls ending in ``gifv`` are rewritten to their ``mp4`` counterpart and
    served as ``video/mp4``; other extensions are used as the video subtype.
    The caller is expected to have checked that the url matches one of the
    extensions, otherwise ``next`` raises ``StopIteration``.
    """
    extension = next(
        extension.replace(".", "")
        for extension in REDDIT_VIDEO_EXTENSIONS
        if url.endswith(extension)
    )

    if extension == "gifv":
        # Swap only the trailing extension; str.replace would also rewrite
        # any earlier "gifv" occurrence inside the url.
        mp4_url = f"{url[:-len(extension)]}mp4"

        return format_html(
            "<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
            url=mp4_url,
        )

    return format_html(
        "<div><video controls muted><source src='{url}' type='video/{extension}' /></video></div>",
        url=url,
        extension=extension,
    )
def get_url_post(self, title, url):
    """
    Return an HTML fragment containing a "Direct url" link to ``url`` which
    opens in a new tab.
    """
    template = "<div><a target='_blank' rel='noopener noreferrer' alt='{title}' href='{url}' class='link'>Direct url</a></div>"

    return format_html(template, title=title, url=url)
class RedditStream(PostStream):
rule_type = RuleTypeChoices.subreddit

View file

@ -6,19 +6,21 @@
<form class="form rules-form">
{% csrf_token %}
<section class="section form__section form__section--actions">
<div class="form__actions">
<a class="link button button--confirm" href="{% url "news:collection:feed-create" %}">{% trans "Add a feed" %}</a>
<a class="link button button--confirm" href="{% url "news:collection:import" %}">{% trans "Import feeds" %}</a>
<a class="link button button--reddit" href="{% url "news:collection:subreddit-create" %}">{% trans "Add a subreddit" %}</a>
<a class="link button button--twitter" href="{% url "news:collection:twitter-timeline-create" %}">{% trans "Add a Twitter profile" %}</a>
</div>
</section>
<section class="section form__section form__section--actions">
<fieldset class="fieldset form__fieldset">
<input type="submit" class="button button--primary" formaction="{% url "news:collection:rules-enable" %}" formmethod="post" value="{% trans "Enable" %}" />
<input type="submit" class="button button--primary" formaction="{% url "news:collection:rules-disable" %}" formmethod="post" value="{% trans "Disable" %}" />
<input type="submit" class="button button--error" formaction="{% url "news:collection:rules-delete" %}" formmethod="post" value="{% trans "Delete" %}"/>
</fieldset>
<div class="form__actions">
<a class="link button button--confirm" href="{% url "news:collection:feed-create" %}">{% trans "Add a feed" %}</a>
<a class="link button button--reddit" href="{% url "news:collection:subreddit-create" %}">{% trans "Add a subreddit" %}</a>
<a class="link button button--twitter" href="{% url "news:collection:twitter-timeline-create" %}">{% trans "Add a Twitter profile" %}</a>
<a class="link button button--confirm" href="{% url "news:collection:import" %}">{% trans "Import rules" %}</a>
</div>
</section>
<section class="section form__section">

View file

@ -1,4 +1,4 @@
from datetime import date, datetime, time
from datetime import datetime
from unittest.mock import Mock
from django.test import TestCase
@ -21,277 +21,233 @@ class FeedBuilderTestCase(TestCase):
def setUp(self):
self.maxDiff = None
def test_basic_entry(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(simple_mock, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=7, second=37)
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date)
self.assertEquals(Post.objects.count(), 1)
self.assertEquals(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
def test_multiple_entries(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(multiple_mock, mock_stream) as builder:
with FeedBuilder(multiple_mock, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 3)
self.assertEqual(Post.objects.count(), 3)
post = posts[0]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=32, second=38)
publication_date = datetime(
2019, 5, 20, hour=16, minute=32, second=38, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
publication_date.strftime("%Y-%m-%d %H:%M:%S"),
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
)
self.assertEquals(
self.assertEqual(
post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
)
self.assertEquals(
self.assertEqual(
post.title, "Birmingham head teacher threatened over LGBT lessons"
)
post = posts[1]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=7, second=37)
publication_date = datetime(
2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
publication_date.strftime("%Y-%m-%d %H:%M:%S"),
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(
self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
def test_entries_without_remote_identifier(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_identifier, mock_stream) as builder:
with FeedBuilder(mock_without_identifier, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=7, second=37)
publication_date = datetime(
2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date)
self.assertEquals(post.remote_identifier, None)
self.assertEquals(
self.assertEqual(post.publication_date, publication_date)
self.assertEqual(post.remote_identifier, None)
self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
post = posts[1]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=12, minute=19, second=19)
publication_date = datetime(
2019, 5, 20, hour=12, minute=19, second=19, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date)
self.assertEquals(post.remote_identifier, None)
self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739")
self.assertEquals(post.title, "Huawei's Android loss: How it affects you")
self.assertEqual(post.publication_date, publication_date)
self.assertEqual(post.remote_identifier, None)
self.assertEqual(post.url, "https://www.bbc.co.uk/news/technology-48334739")
self.assertEqual(post.title, "Huawei's Android loss: How it affects you")
def test_entry_without_publication_date(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_publish_date, mock_stream) as builder:
with FeedBuilder(mock_without_publish_date, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
)
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
post = posts[1]
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
)
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
def test_entry_without_url(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_url, mock_stream) as builder:
with FeedBuilder(mock_without_url, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
post = posts[1]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
def test_entry_without_body(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_body, mock_stream) as builder:
with FeedBuilder(mock_without_body, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(
self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
)
self.assertEquals(post.body, "")
self.assertEqual(post.body, "")
post = posts[1]
self.assertEquals(
self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(post.body, "")
self.assertEqual(post.body, "")
def test_entry_without_author(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_author, mock_stream) as builder:
with FeedBuilder(mock_without_author, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(post.author, None)
self.assertEqual(post.author, None)
post = posts[1]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
self.assertEquals(post.author, None)
self.assertEqual(post.author, None)
def test_empty_entries(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_entries, mock_stream) as builder:
with FeedBuilder(mock_without_entries, mock_stream) as builder:
builder.build()
builder.save()
self.assertEquals(Post.objects.count(), 0)
self.assertEqual(Post.objects.count(), 0)
def test_update_entries(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
@ -303,36 +259,35 @@ class FeedBuilderTestCase(TestCase):
remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule
)
with builder(mock_with_update_entries, mock_stream) as builder:
with FeedBuilder(mock_with_update_entries, mock_stream) as builder:
builder.build()
builder.save()
self.assertEquals(Post.objects.count(), 3)
self.assertEqual(Post.objects.count(), 3)
existing_first_post.refresh_from_db()
existing_second_post.refresh_from_db()
self.assertEquals(
self.assertEqual(
existing_first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif",
)
self.assertEquals(
self.assertEqual(
existing_second_post.title, "Huawei's Android loss: How it affects you"
)
def test_html_sanitizing(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_html, mock_stream) as builder:
with FeedBuilder(mock_with_html, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertTrue("<article>" in post.body)
self.assertTrue("<h1>" in post.body)
@ -345,64 +300,60 @@ class FeedBuilderTestCase(TestCase):
self.assertTrue("<iframe>" not in post.body)
def test_long_author_text_is_truncated(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_long_author, mock_stream) as builder:
with FeedBuilder(mock_with_long_author, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.author), 40)
self.assertEqual(len(post.author), 40)
def test_long_title_text_is_truncated(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_long_title, mock_stream) as builder:
with FeedBuilder(mock_with_long_title, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.title), 200)
self.assertEqual(len(post.title), 200)
self.assertTrue(post.title.endswith(""))
def test_long_title_exotic_title(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_long_exotic_title, mock_stream) as builder:
with FeedBuilder(mock_with_long_exotic_title, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.title), 200)
self.assertEqual(len(post.title), 200)
self.assertTrue(post.title.endswith(""))
def test_content_detail_is_prioritized_if_longer(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_longer_content_detail, mock_stream) as builder:
with FeedBuilder(mock_with_longer_content_detail, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertFalse(
"Foreign Minister Mohammad Javad Zarif says the US" in post.body
@ -410,33 +361,31 @@ class FeedBuilderTestCase(TestCase):
self.assertTrue("Federal Communications Commission" in post.body)
def test_content_detail_is_not_prioritized_if_shorter(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_shorter_content_detail, mock_stream) as builder:
with FeedBuilder(mock_with_shorter_content_detail, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertTrue(
"Foreign Minister Mohammad Javad Zarif says the US" in post.body
)
def test_content_detail_is_concatinated(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_multiple_content_detail, mock_stream) as builder:
with FeedBuilder(mock_with_multiple_content_detail, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(post.body, "Yippie\n Ya\n Yee")
self.assertEqual(post.body, "Yippie\n Ya\n Yee")

View file

@ -86,52 +86,6 @@ class RedditBuilderTestCase(TestCase):
self.assertEquals(Post.objects.count(), 0)
def test_update_posts(self):
subreddit = SubredditFactory()
existing_post = RedditPostFactory(
remote_identifier="hm0qct",
author="Old author",
title="Old title",
body="Old body",
url="https://bbc.com/",
rule=subreddit,
)
builder = RedditBuilder
mock_stream = Mock(rule=subreddit)
with builder(simple_mock, mock_stream) as builder:
builder.build()
builder.save()
posts = {post.remote_identifier: post for post in Post.objects.all()}
self.assertCountEqual(
("hm0qct", "hna75r", "hngs71", "hngsj8", "hnd7cy"), posts.keys()
)
existing_post.refresh_from_db()
self.assertEquals(existing_post.remote_identifier, "hm0qct")
self.assertEquals(existing_post.author, "AutoModerator")
self.assertEquals(
existing_post.title,
"Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
)
self.assertIn(
"This megathread is also to hear opinions from anyone just starting out "
"with Linux or those that have used Linux (GNU or otherwise) for a long time.",
existing_post.body,
)
self.assertEquals(
existing_post.publication_date,
pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
)
self.assertEquals(
existing_post.url,
"https://www.reddit.com/r/linux/comments/hm0qct/linux_experiencesrants_or_educationcertifications/",
)
def test_html_sanitizing(self):
builder = RedditBuilder
@ -225,17 +179,6 @@ class RedditBuilderTestCase(TestCase):
("hm0qct", "hna75r", "hngs71", "hngsj8", "hnd7cy"), posts.keys()
)
duplicate_post.refresh_from_db()
self.assertEquals(
duplicate_post.publication_date,
pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
)
self.assertEquals(
duplicate_post.title,
"Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
)
def test_image_post(self):
builder = RedditBuilder

View file

@ -2185,3 +2185,202 @@ unsanitized_mock = [
},
}
]
broken_mock = [
{
"contributors": None,
"coordinates": None,
"created_at": "Fri Aug 07 00:17:05 +0000 2020",
"display_text_range": [11, 59],
"entities": {
"hashtags": [],
"symbols": [],
"urls": [
{
"display_url": "youtu.be/rDy7tPf6CT8",
"expanded_url": "https://youtu.be/rDy7tPf6CT8",
"indices": [36, 59],
"url": "https://t.co/trAcIxBMlX",
}
],
"user_mentions": [
{
"id": 975844884606275587,
"id_str": "975844884606275587",
"indices": [0, 10],
"name": "ArieNeo",
"screen_name": "ArieNeoSC",
}
],
},
"favorite_count": 19,
"favorited": False,
# Note the missing full_text key here
"geo": None,
"id": 1291528756373286914,
"id_str": "1291528756373286914",
"in_reply_to_screen_name": "ArieNeoSC",
"in_reply_to_status_id": 1291507356313038850,
"in_reply_to_status_id_str": "1291507356313038850",
"in_reply_to_user_id": 975844884606275587,
"in_reply_to_user_id_str": "975844884606275587",
"is_quote_status": False,
"lang": "en",
"place": None,
"possibly_sensitive": False,
"retweet_count": 5,
"retweeted": False,
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
"truncated": False,
"user": {
"contributors_enabled": False,
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
"default_profile": False,
"default_profile_image": False,
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
"entities": {
"description": {"urls": []},
"url": {
"urls": [
{
"display_url": "robertsspaceindustries.com",
"expanded_url": "http://www.robertsspaceindustries.com",
"indices": [0, 23],
"url": "https://t.co/iqO6apof3y",
}
]
},
},
"favourites_count": 4588,
"follow_request_sent": None,
"followers_count": 106169,
"following": None,
"friends_count": 201,
"geo_enabled": False,
"has_extended_profile": False,
"id": 803542770,
"id_str": "803542770",
"is_translation_enabled": False,
"is_translator": False,
"lang": None,
"listed_count": 890,
"location": "Roberts Space Industries",
"name": "Star Citizen",
"notifications": None,
"profile_background_color": "131516",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
"profile_background_tile": False,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
"profile_link_color": "0A5485",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "EFEFEF",
"profile_text_color": "333333",
"profile_use_background_image": True,
"protected": False,
"screen_name": "RobertsSpaceInd",
"statuses_count": 6210,
"time_zone": None,
"translator_type": "none",
"url": "https://t.co/iqO6apof3y",
"utc_offset": None,
"verified": True,
},
},
{
"contributors": None,
"coordinates": None,
"created_at": "Wed Jul 29 19:01:47 +0000 2020",
"display_text_range": [10, 98],
"entities": {
"hashtags": [],
"symbols": [],
"urls": [],
"user_mentions": [
{
"id": 435221600,
"id_str": "435221600",
"indices": [0, 9],
"name": "Christopher Blough",
"screen_name": "RelicCcb",
}
],
},
"favorite_count": 1,
"favorited": False,
"full_text": "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing.",
"geo": None,
"id": 1288550304095416320,
"id_str": "1288550304095416320",
"in_reply_to_screen_name": "RelicCcb",
"in_reply_to_status_id": 1288475147951898625,
"in_reply_to_status_id_str": "1288475147951898625",
"in_reply_to_user_id": 435221600,
"in_reply_to_user_id_str": "435221600",
"is_quote_status": False,
"lang": "en",
"place": None,
"retweet_count": 0,
"retweeted": False,
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
"truncated": False,
"user": {
"contributors_enabled": False,
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
"default_profile": False,
"default_profile_image": False,
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
"entities": {
"description": {"urls": []},
"url": {
"urls": [
{
"display_url": "robertsspaceindustries.com",
"expanded_url": "http://www.robertsspaceindustries.com",
"indices": [0, 23],
"url": "https://t.co/iqO6apof3y",
}
]
},
},
"favourites_count": 4588,
"follow_request_sent": None,
"followers_count": 106169,
"following": None,
"friends_count": 201,
"geo_enabled": False,
"has_extended_profile": False,
"id": 803542770,
"id_str": "803542770",
"is_translation_enabled": False,
"is_translator": False,
"lang": None,
"listed_count": 890,
"location": "Roberts Space Industries",
"name": "Star Citizen",
"notifications": None,
"profile_background_color": "131516",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
"profile_background_tile": False,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
"profile_link_color": "0A5485",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "EFEFEF",
"profile_text_color": "333333",
"profile_use_background_image": True,
"protected": False,
"screen_name": "RobertsSpaceInd",
"statuses_count": 6210,
"time_zone": None,
"translator_type": "none",
"url": "https://t.co/iqO6apof3y",
"utc_offset": None,
"verified": True,
},
},
]

View file

@ -10,6 +10,7 @@ from ftfy import fix_text
from newsreader.news.collection.tests.factories import TwitterTimelineFactory
from newsreader.news.collection.tests.twitter.builder.mocks import (
broken_mock,
gif_mock,
image_mock,
quoted_mock,
@ -410,3 +411,21 @@ class TwitterBuilderTestCase(TestCase):
builder.save()
self.assertEquals(Post.objects.count(), 2)
def test_bad_post(self):
    """
    Posts that miss required data are skipped by the builder, only the
    remaining post ends up being saved
    """
    timeline = TwitterTimelineFactory(screen_name="RobertsSpaceInd")
    stream = Mock(rule=timeline)

    with TwitterBuilder(broken_mock, stream) as builder:
        builder.build()
        builder.save()

    saved_identifiers = Post.objects.values_list("remote_identifier", flat=True)

    self.assertCountEqual(saved_identifiers, ["1288550304095416320"])

View file

@ -22,6 +22,10 @@ from newsreader.news.collection.base import (
)
from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices
from newsreader.news.collection.exceptions import (
BuilderDuplicateException,
BuilderException,
BuilderMissingDataException,
BuilderParseException,
StreamDeniedException,
StreamException,
StreamNotFoundException,
@ -48,43 +52,69 @@ class TwitterBuilder(PostBuilder):
# Build one Post per entry in self.payload and expose them on self.instances.
# Entries for which build_post raises a BuilderException are logged and skipped.
# NOTE(review): this span is a rendered diff with old and new lines interleaved
# and indentation stripped. The lines that read post["id_str"], check
# self.existing_posts, and compute `url` / `body` look like leftovers of the
# pre-refactor loop body (that work now lives in build_post) - confirm against
# the real file before relying on them.
def build(self):
results = {}
rule = self.stream.rule
for post in self.payload:
remote_identifier = post["id_str"]
if remote_identifier in self.existing_posts:
try:
post = self.build_post(post)
except BuilderException:
# Only BuilderException (and subclasses) is handled here; any other
# exception raised by build_post propagates out of build().
logger.exception("Failed building post")
continue
url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
body = urlize(post["full_text"], nofollow=True)
# Keyed by remote identifier, so a later entry with the same identifier
# replaces an earlier one.
identifier = post.remote_identifier
results[identifier] = post
self.instances = results.values()
# Build a single (unsaved) Post from one tweet payload `data`.
# Raises:
#   BuilderDuplicateException   - the tweet's id_str is already in self.existing_posts
#   BuilderMissingDataException - a required key is missing (KeyError) in the tweet,
#                                 its media entities, or its retweeted/quoted status
#   BuilderParseException       - OverflowError / OSError while parsing the payload
# NOTE(review): this span is a rendered diff with old and new lines interleaved
# and indentation stripped. Lines that index `post[...]` instead of `data[...]`
# appear to be the removed pre-refactor lines - confirm against the real file.
def build_post(self, data):
# NOTE(review): a missing "id_str" yields "" here instead of raising
# BuilderMissingDataException - confirm this is intended.
remote_identifier = data.get("id_str", "")
rule = self.stream.rule
if remote_identifier in self.existing_posts:
raise BuilderDuplicateException(payload=data)
try:
body = urlize(data["full_text"], nofollow=True)
title = truncate_text(
Post, "title", self.sanitize_fragment(post["full_text"])
Post, "title", self.sanitize_fragment(data["full_text"])
)
publication_date = pytz.utc.localize(
datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
)
except KeyError as e:
raise BuilderMissingDataException(payload=data) from e
# NOTE(review): datetime.strptime raises ValueError when created_at does not
# match the format string. ValueError is not caught below, so it would not be
# converted into a BuilderException and would escape build() - consider
# adding ValueError to this tuple.
except (OverflowError, OSError) as e:
raise BuilderParseException(payload=data) from e
if "extended_entities" in post:
try:
media_entities = self.get_media_entities(post)
body += media_entities
except KeyError:
logger.exception(f"Failed parsing media_entities for {url}")
url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
if "retweeted_status" in post:
original_post = post["retweeted_status"]
# Append the output of get_media_entities to the body; a KeyError while
# reading the media entities is re-raised as BuilderMissingDataException.
if "extended_entities" in data:
try:
media_entities = self.get_media_entities(data)
body += media_entities
except KeyError as e:
raise BuilderMissingDataException(
message="Failed parsing data for media entities", payload=data
) from e
# Append the urlize'd full_text of a retweeted and/or quoted tweet to the body.
try:
if "retweeted_status" in data:
original_post = data["retweeted_status"]
original_tweet = urlize(original_post["full_text"], nofollow=True)
body = f"{body} <br><div>Original tweet: {original_tweet}</div>"
if "quoted_status" in post:
original_post = post["quoted_status"]
if "quoted_status" in data:
original_post = data["quoted_status"]
original_tweet = urlize(original_post["full_text"], nofollow=True)
body = f"{body} <br><div>Quoted tweet: {original_tweet}</div>"
except KeyError as e:
raise BuilderMissingDataException(
message="Failed parsing data for original tweet", payload=data
) from e
body = self.sanitize_fragment(body)
body = self.sanitize_fragment(body)
data = {
return Post(
**{
"remote_identifier": remote_identifier,
"title": fix_text(title),
"body": fix_text(body),
# NOTE(review): hunk boundary below - the fields between "body" and "url"
# are not visible in this rendering.
@ -93,13 +123,10 @@ class TwitterBuilder(PostBuilder):
"url": url,
"rule": rule,
}
)
# NOTE(review): the two lines below look like the removed tail of the
# pre-refactor build() (that logic now sits in build()) - confirm.
results[remote_identifier] = Post(**data)
self.instances = results.values()
def get_media_entities(self, post):
media_entities = post["extended_entities"]["media"]
def get_media_entities(self, data):
media_entities = data["extended_entities"]["media"]
formatted_entities = ""
for media_entity in media_entities:

View file

@ -70,6 +70,8 @@
& img, video {
padding: 10px 0;
width: max-content;
max-width: 100%;
}

View file

@ -16,7 +16,7 @@
{% if request.user.is_authenticated %}
<li class="nav__item"><a href="{% url 'index' %}">Home</a></li>
<li class="nav__item"><a href="{% url 'news:core:categories' %}">Categories</a></li>
<li class="nav__item"><a href="{% url 'news:collection:rules' %}">Feeds</a></li>
<li class="nav__item"><a href="{% url 'news:collection:rules' %}">Sources</a></li>
<li class="nav__item"><a href="{% url 'accounts:settings:home' %}">Settings</a></li>
{% if request.user.is_superuser %}
<li class="nav__item"><a href="{% url 'admin:index' %}">Admin</a></li>