0.3.3
- Update static configuration
- Builder refactor
- Fix for images stretching too far
This commit is contained in:
parent b6921a20e7
commit 9e5e05c056
15 changed files with 568 additions and 341 deletions
@@ -3427,7 +3427,6 @@
    "is_active": true,
    "date_joined": "2019-07-18T18:52:36.080Z",
    "email": "sonny@bakker.nl",
    "task": 10,
    "reddit_refresh_token": null,
    "reddit_access_token": null,
    "groups": [],
16  src/newsreader/news/collection/exceptions/__init__.py  Normal file
@@ -0,0 +1,16 @@
from newsreader.news.collection.exceptions.builder import (
    BuilderDuplicateException,
    BuilderException,
    BuilderMissingDataException,
    BuilderParseException,
)
from newsreader.news.collection.exceptions.stream import (
    StreamConnectionException,
    StreamDeniedException,
    StreamException,
    StreamForbiddenException,
    StreamNotFoundException,
    StreamParseException,
    StreamTimeOutException,
    StreamTooManyException,
)
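The new package gathers every builder and stream exception behind a single import path. A minimal sketch of the intended consumption pattern, based on how the builders below use these classes (run_collection, payload, and stream are placeholders, not part of the commit):

from newsreader.news.collection.exceptions import (
    BuilderException,
    StreamException,
)

def run_collection(builder_cls, payload, stream):
    # Hypothetical driver: per-entry builder failures are handled inside
    # build(); anything that still escapes is an expected collection error.
    try:
        with builder_cls(payload, stream) as builder:
            builder.build()
            builder.save()
    except (BuilderException, StreamException) as exc:
        print(f"Collection run failed: {exc}")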
21  src/newsreader/news/collection/exceptions/builder.py  Normal file
@@ -0,0 +1,21 @@
class BuilderException(Exception):
    message = "Builder exception"

    def __init__(self, payload=None, message=None):
        self.payload = payload
        self.message = message if message else self.message

    def __str__(self):
        return self.message


class BuilderMissingDataException(BuilderException):
    message = "Payload contains missing data"


class BuilderDuplicateException(BuilderException):
    message = "Payload contains duplicate entry"


class BuilderParseException(BuilderException):
    message = "Failed to parse payload"
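Each subclass only overrides the class-level message; the constructor stores the offending payload and lets callers substitute a more specific message. A quick illustration with made-up values:

exc = BuilderMissingDataException(payload={"id": "hm0qct"})
print(str(exc))     # "Payload contains missing data" (class default)
print(exc.payload)  # {'id': 'hm0qct'}

exc = BuilderParseException(message="Bad timestamp", payload={})
print(str(exc))     # "Bad timestamp" (an explicit message wins over the default)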
@@ -39,6 +39,18 @@ class FeedBuilder(PostBuilder):
     rule_type = RuleTypeChoices.feed
 
     def build(self):
+        instances = []
+
+        with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
+            entries = self.payload.get("entries", [])
+
+            for entry in entries:
+                post = self.build_post(entry)
+                instances.append(post)
+
+            self.instances = duplicate_handler.check(instances)
+
+    def build_post(self, entry):
         field_mapping = {
             "id": "remote_identifier",
             "title": "title",
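With build() reduced to iterate-and-delegate, a single malformed entry can fail inside build_post without aborting the whole batch, and build_post can be exercised in isolation. The calling convention is unchanged, as the tests further down show (payload and mock_stream are placeholders):

with FeedBuilder(payload, mock_stream) as builder:
    builder.build()  # collects Post instances, deduplicated by FeedDuplicateHandler
    builder.save()   # persists builder.instances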
@@ -48,41 +60,37 @@ class FeedBuilder(PostBuilder):
             "author": "author",
         }
         tz = pytz.timezone(self.stream.rule.timezone)
-        instances = []
-
-        with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
-            entries = self.payload.get("entries", [])
-
-            for entry in entries:
-                data = {"rule_id": self.stream.rule.pk}
-
-                for field, model_field in field_mapping.items():
-                    if not field in entry:
-                        continue
-
-                    value = truncate_text(Post, model_field, entry[field])
-
-                    if field == "published_parsed":
-                        data[model_field] = build_publication_date(value, tz)
-                    elif field == "summary":
-                        data[model_field] = self.sanitize_fragment(value)
-                    else:
-                        data[model_field] = value
-
-                if "content" in entry:
-                    content = self.get_content(entry["content"])
-                    body = data.get("body", "")
-
-                    if not body or len(body) < len(content):
-                        data["body"] = content
-
-                instances.append(Post(**data))
-
-        self.instances = duplicate_handler.check(instances)
-
-    def get_content(self, items):
-        content = "\n ".join([item.get("value") for item in items])
-        return self.sanitize_fragment(content)
+        data = {"rule_id": self.stream.rule.pk}
+
+        for field, model_field in field_mapping.items():
+            if not field in entry:
+                continue
+
+            value = truncate_text(Post, model_field, entry[field])
+
+            if field == "published_parsed":
+                data[model_field] = build_publication_date(value, tz)
+            elif field == "summary":
+                data[model_field] = self.sanitize_fragment(value)
+            else:
+                data[model_field] = value
+
+        content_details = self.get_content_details(entry)
+
+        # use content details key if it contains more information
+        if not "body" in data or len(data["body"]) < len(content_details):
+            data["body"] = content_details
+
+        return Post(**data)
+
+    def get_content_details(self, entry):
+        content_items = entry.get("content")
+
+        if not content_items:
+            return ""
+
+        content_details = "\n ".join([item.get("value") for item in content_items])
+        return self.sanitize_fragment(content_details)
 
 
 class FeedStream(PostStream):
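The body-selection rule survives the refactor: prefer the concatenated content entries over the summary-derived body whenever they carry more text. Distilled into a standalone function for reference (pick_body is illustrative, not part of the codebase):

def pick_body(summary_body, content_details):
    # Mirrors the builder's comparison: keep the mapped body unless the
    # content entries are longer, or no body was mapped at all.
    if not summary_body or len(summary_body) < len(content_details):
        return content_details
    return summary_body

assert pick_body("", "longer content") == "longer content"
assert pick_body("a much longer summary", "short") == "a much longer summary"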
@@ -28,6 +28,10 @@ from newsreader.news.collection.constants import (
     WHITELISTED_TAGS,
 )
 from newsreader.news.collection.exceptions import (
+    BuilderDuplicateException,
+    BuilderException,
+    BuilderMissingDataException,
+    BuilderParseException,
     StreamDeniedException,
     StreamException,
     StreamParseException,
@@ -122,99 +126,136 @@ class RedditBuilder(PostBuilder):
         if not "data" in self.payload or not "children" in self.payload["data"]:
             return
 
-        posts = self.payload["data"]["children"]
-        rule = self.stream.rule
-
-        for post in posts:
-            if not "data" in post or post["kind"] != REDDIT_POST:
-                continue
-
-            data = post["data"]
-
-            remote_identifier = data["id"]
-            title = truncate_text(Post, "title", data["title"])
-            author = truncate_text(Post, "author", data["author"])
-            post_url_fragment = data["permalink"]
-            direct_url = data["url"]
-            is_text_post = data["is_self"]
-
-            if remote_identifier in results:
-                continue
-
-            if is_text_post:
-                uncleaned_body = data["selftext_html"]
-                unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
-                body = self.sanitize_fragment(unescaped_body) if unescaped_body else ""
-            elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
-                body = format_html(
-                    "<div><img alt='{title}' src='{url}' loading='lazy' /></div>",
-                    url=direct_url,
-                    title=title,
-                )
-            elif data["is_video"]:
-                video_info = data["secure_media"]["reddit_video"]
-
-                body = format_html(
-                    "<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
-                    url=video_info["fallback_url"],
-                )
-            elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
-                extension = next(
-                    extension.replace(".", "")
-                    for extension in REDDIT_VIDEO_EXTENSIONS
-                    if direct_url.endswith(extension)
-                )
-
-                if extension == "gifv":
-                    body = format_html(
-                        "<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
-                        url=direct_url.replace(extension, "mp4"),
-                    )
-                else:
-                    body = format_html(
-                        "<div><video controls muted><source src='{url}' type='video/{extension}' /></video></div>",
-                        url=direct_url,
-                        extension=extension,
-                    )
-            else:
-                body = format_html(
-                    "<div><a target='_blank' rel='noopener noreferrer' alt='{title}' href='{url}' class='link'>Direct url</a></div>",
-                    url=direct_url,
-                    title=title,
-                )
-
-            try:
-                parsed_date = datetime.fromtimestamp(post["data"]["created_utc"])
-                created_date = pytz.utc.localize(parsed_date)
-            except (OverflowError, OSError):
-                logging.warning(
-                    f"Failed parsing timestamp from {REDDIT_URL}{post_url_fragment}"
-                )
-                created_date = timezone.now()
-
-            post_data = {
-                "remote_identifier": remote_identifier,
-                "title": title,
-                "body": body,
-                "author": author,
-                "url": f"{REDDIT_URL}{post_url_fragment}",
-                "publication_date": created_date,
-                "rule": rule,
-            }
-
-            if remote_identifier in self.existing_posts:
-                existing_post = self.existing_posts[remote_identifier]
-
-                for key, value in post_data.items():
-                    setattr(existing_post, key, value)
-
-                results[existing_post.remote_identifier] = existing_post
+        entries = self.payload["data"]["children"]
+
+        for entry in entries:
+            try:
+                post = self.build_post(entry)
+            except BuilderException:
+                logger.exception("Failed building post")
                 continue
 
-            results[remote_identifier] = Post(**post_data)
+            identifier = post.remote_identifier
+            results[identifier] = post
 
         self.instances = results.values()
 
+    def build_post(self, entry):
+        rule = self.stream.rule
+        entry_data = entry.get("data", {})
+        remote_identifier = entry_data.get("id", "")
+        kind = entry.get("kind")
+
+        if remote_identifier in self.existing_posts:
+            raise BuilderDuplicateException(payload=entry)
+        elif kind != REDDIT_POST:
+            raise BuilderParseException(
+                message=f"Payload is not a Reddit post, it is of kind {kind}",
+                payload=entry,
+            )
+        elif not entry_data:
+            raise BuilderMissingDataException(
+                message=f"Post {remote_identifier} did not contain any data",
+                payload=entry,
+            )
+
+        try:
+            title = entry_data["title"]
+            author = entry_data["author"]
+            post_url_fragment = entry_data["permalink"]
+            direct_url = entry_data["url"]
+            is_text = entry_data["is_self"]
+            is_video = entry_data["is_video"]
+        except KeyError as e:
+            raise BuilderMissingDataException(payload=entry) from e
+
+        title = truncate_text(Post, "title", title)
+        author = truncate_text(Post, "author", author)
+
+        if is_text:
+            body = self.get_text_post(entry_data)
+        elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
+            body = self.get_image_post(title, direct_url)
+        elif is_video:
+            body = self.get_native_video_post(entry_data)
+        elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
+            body = self.get_video_post(direct_url)
+        else:
+            body = self.get_url_post(title, direct_url)
+
+        try:
+            parsed_date = datetime.fromtimestamp(entry_data["created_utc"])
+            created_date = pytz.utc.localize(parsed_date)
+        except (OverflowError, OSError) as e:
+            raise BuilderParseException(payload=entry) from e
+        except KeyError as e:
+            raise BuilderMissingDataException(payload=entry) from e
+
+        post_entry = {
+            "remote_identifier": remote_identifier,
+            "title": title,
+            "body": body,
+            "author": author,
+            "url": f"{REDDIT_URL}{post_url_fragment}",
+            "publication_date": created_date,
+            "rule": rule,
+        }
+
+        return Post(**post_entry)
+
+    def get_text_post(self, entry):
+        try:
+            uncleaned_body = entry["selftext_html"]
+        except KeyError as e:
+            raise BuilderMissingDataException(payload=entry) from e
+
+        unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
+        return self.sanitize_fragment(unescaped_body) if unescaped_body else ""
+
+    def get_image_post(self, title, url):
+        return format_html(
+            "<div><img alt='{title}' src='{url}' loading='lazy' /></div>",
+            url=url,
+            title=title,
+        )
+
+    def get_native_video_post(self, entry):
+        try:
+            video_info = entry["secure_media"]["reddit_video"]
+        except KeyError as e:
+            raise BuilderMissingDataException(payload=entry) from e
+
+        return format_html(
+            "<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
+            url=video_info["fallback_url"],
+        )
+
+    def get_video_post(self, url):
+        extension = next(
+            extension.replace(".", "")
+            for extension in REDDIT_VIDEO_EXTENSIONS
+            if url.endswith(extension)
+        )
+
+        if extension == "gifv":
+            return format_html(
+                "<div><video controls muted><source src='{url}' type='video/mp4' /></video></div>",
+                url=url.replace(extension, "mp4"),
+            )
+
+        return format_html(
+            "<div><video controls muted><source src='{url}' type='video/{extension}' /></video></div>",
+            url=url,
+            extension=extension,
+        )
+
+    def get_url_post(self, title, url):
+        return format_html(
+            "<div><a target='_blank' rel='noopener noreferrer' alt='{title}' href='{url}' class='link'>Direct url</a></div>",
+            url=url,
+            title=title,
+        )
+
 
 class RedditStream(PostStream):
     rule_type = RuleTypeChoices.subreddit
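The gifv branch in get_video_post exists because Reddit and Imgur serve .gifv links as MP4 containers, so swapping the extension yields a playable <video> source. A self-contained approximation of that decision (the REDDIT_VIDEO_EXTENSIONS tuple below is an assumed shape; the real one lives in the constants module):

REDDIT_VIDEO_EXTENSIONS = (".mp4", ".webm", ".gifv")  # assumed shape

def video_source(url):
    # Mirrors get_video_post: find the matching extension, special-case gifv.
    extension = next(
        ext.replace(".", "") for ext in REDDIT_VIDEO_EXTENSIONS if url.endswith(ext)
    )
    if extension == "gifv":
        return url.replace("gifv", "mp4"), "video/mp4"
    return url, f"video/{extension}"

print(video_source("https://i.imgur.com/abc.gifv"))
# -> ('https://i.imgur.com/abc.mp4', 'video/mp4')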
@@ -6,19 +6,21 @@
 <form class="form rules-form">
     {% csrf_token %}
 
-    <section class="section form__section form__section--actions">
-        <div class="form__actions">
-            <a class="link button button--confirm" href="{% url "news:collection:feed-create" %}">{% trans "Add a feed" %}</a>
-            <a class="link button button--confirm" href="{% url "news:collection:import" %}">{% trans "Import feeds" %}</a>
-            <a class="link button button--reddit" href="{% url "news:collection:subreddit-create" %}">{% trans "Add a subreddit" %}</a>
-            <a class="link button button--twitter" href="{% url "news:collection:twitter-timeline-create" %}">{% trans "Add a Twitter profile" %}</a>
-        </div>
-    </section>
-
     <section class="section form__section form__section--actions">
         <fieldset class="fieldset form__fieldset">
             <input type="submit" class="button button--primary" formaction="{% url "news:collection:rules-enable" %}" formmethod="post" value="{% trans "Enable" %}" />
             <input type="submit" class="button button--primary" formaction="{% url "news:collection:rules-disable" %}" formmethod="post" value="{% trans "Disable" %}" />
             <input type="submit" class="button button--error" formaction="{% url "news:collection:rules-delete" %}" formmethod="post" value="{% trans "Delete" %}"/>
         </fieldset>
+
+        <div class="form__actions">
+            <a class="link button button--confirm" href="{% url "news:collection:feed-create" %}">{% trans "Add a feed" %}</a>
+            <a class="link button button--reddit" href="{% url "news:collection:subreddit-create" %}">{% trans "Add a subreddit" %}</a>
+            <a class="link button button--twitter" href="{% url "news:collection:twitter-timeline-create" %}">{% trans "Add a Twitter profile" %}</a>
+            <a class="link button button--confirm" href="{% url "news:collection:import" %}">{% trans "Import rules" %}</a>
+        </div>
     </section>
 
     <section class="section form__section">
@@ -1,4 +1,4 @@
-from datetime import date, datetime, time
+from datetime import datetime
 from unittest.mock import Mock
 
 from django.test import TestCase
@@ -21,277 +21,233 @@ class FeedBuilderTestCase(TestCase):
     def setUp(self):
         self.maxDiff = None
 
-    def test_basic_entry(self):
-        builder = FeedBuilder
-        rule = FeedFactory()
-        mock_stream = Mock(rule=rule)
-
-        with builder(simple_mock, mock_stream) as builder:
-            builder.build()
-            builder.save()
-
-        post = Post.objects.get()
-
-        publication_date = datetime.combine(
-            date(2019, 5, 20), time(hour=16, minute=7, second=37)
-        )
-        aware_date = pytz.utc.localize(publication_date)
-
-        self.assertEquals(post.publication_date, aware_date)
-        self.assertEquals(Post.objects.count(), 1)
-
-        self.assertEquals(
-            post.remote_identifier,
-            "https://www.bbc.co.uk/news/world-us-canada-48338168",
-        )
-
-        self.assertEquals(
-            post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
-        )
-
-        self.assertEquals(
-            post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
-        )
-
     def test_multiple_entries(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(multiple_mock, mock_stream) as builder:
+        with FeedBuilder(multiple_mock, mock_stream) as builder:
             builder.build()
             builder.save()
 
         posts = Post.objects.order_by("-publication_date")
-        self.assertEquals(Post.objects.count(), 3)
+        self.assertEqual(Post.objects.count(), 3)
 
         post = posts[0]
 
-        publication_date = datetime.combine(
-            date(2019, 5, 20), time(hour=16, minute=32, second=38)
+        publication_date = datetime(
+            2019, 5, 20, hour=16, minute=32, second=38, tzinfo=pytz.utc
         )
-        aware_date = pytz.utc.localize(publication_date)
 
-        self.assertEquals(
+        self.assertEqual(
             post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
-            aware_date.strftime("%Y-%m-%d %H:%M:%S"),
+            publication_date.strftime("%Y-%m-%d %H:%M:%S"),
         )
 
-        self.assertEquals(
+        self.assertEqual(
             post.remote_identifier,
             "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
         )
 
-        self.assertEquals(
+        self.assertEqual(
             post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
         )
 
-        self.assertEquals(
+        self.assertEqual(
             post.title, "Birmingham head teacher threatened over LGBT lessons"
         )
 
         post = posts[1]
 
-        publication_date = datetime.combine(
-            date(2019, 5, 20), time(hour=16, minute=7, second=37)
+        publication_date = datetime(
+            2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
         )
-        aware_date = pytz.utc.localize(publication_date)
 
-        self.assertEquals(
+        self.assertEqual(
             post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
-            aware_date.strftime("%Y-%m-%d %H:%M:%S"),
+            publication_date.strftime("%Y-%m-%d %H:%M:%S"),
         )
 
-        self.assertEquals(
+        self.assertEqual(
             post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )
 
-        self.assertEquals(
+        self.assertEqual(
             post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
         )
 
-        self.assertEquals(
+        self.assertEqual(
             post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
         )
 
     def test_entries_without_remote_identifier(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_without_identifier, mock_stream) as builder:
+        with FeedBuilder(mock_without_identifier, mock_stream) as builder:
             builder.build()
             builder.save()
 
         posts = Post.objects.order_by("-publication_date")
-        self.assertEquals(Post.objects.count(), 2)
+        self.assertEqual(Post.objects.count(), 2)
 
         post = posts[0]
 
-        publication_date = datetime.combine(
-            date(2019, 5, 20), time(hour=16, minute=7, second=37)
+        publication_date = datetime(
+            2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
        )
-        aware_date = pytz.utc.localize(publication_date)
 
-        self.assertEquals(post.publication_date, aware_date)
-        self.assertEquals(post.remote_identifier, None)
-        self.assertEquals(
+        self.assertEqual(post.publication_date, publication_date)
+        self.assertEqual(post.remote_identifier, None)
+        self.assertEqual(
             post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
         )
-        self.assertEquals(
+        self.assertEqual(
             post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
         )
 
         post = posts[1]
 
-        publication_date = datetime.combine(
-            date(2019, 5, 20), time(hour=12, minute=19, second=19)
+        publication_date = datetime(
+            2019, 5, 20, hour=12, minute=19, second=19, tzinfo=pytz.utc
         )
-        aware_date = pytz.utc.localize(publication_date)
 
-        self.assertEquals(post.publication_date, aware_date)
-        self.assertEquals(post.remote_identifier, None)
-        self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739")
-        self.assertEquals(post.title, "Huawei's Android loss: How it affects you")
+        self.assertEqual(post.publication_date, publication_date)
+        self.assertEqual(post.remote_identifier, None)
+        self.assertEqual(post.url, "https://www.bbc.co.uk/news/technology-48334739")
+        self.assertEqual(post.title, "Huawei's Android loss: How it affects you")
 
     def test_entry_without_publication_date(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_without_publish_date, mock_stream) as builder:
+        with FeedBuilder(mock_without_publish_date, mock_stream) as builder:
             builder.build()
             builder.save()
 
         posts = Post.objects.order_by("-publication_date")
-        self.assertEquals(Post.objects.count(), 2)
+        self.assertEqual(Post.objects.count(), 2)
 
         post = posts[0]
 
-        self.assertEquals(
+        self.assertEqual(
             post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
         )
-        self.assertEquals(post.created, timezone.now())
-        self.assertEquals(
+        self.assertEqual(post.created, timezone.now())
+        self.assertEqual(
             post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )
 
         post = posts[1]
 
-        self.assertEquals(
+        self.assertEqual(
             post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
         )
-        self.assertEquals(post.created, timezone.now())
-        self.assertEquals(
+        self.assertEqual(post.created, timezone.now())
+        self.assertEqual(
             post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
         )
 
     def test_entry_without_url(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_without_url, mock_stream) as builder:
+        with FeedBuilder(mock_without_url, mock_stream) as builder:
             builder.build()
             builder.save()
 
         posts = Post.objects.order_by("-publication_date")
-        self.assertEquals(Post.objects.count(), 2)
+        self.assertEqual(Post.objects.count(), 2)
 
         post = posts[0]
 
-        self.assertEquals(post.created, timezone.now())
-        self.assertEquals(
+        self.assertEqual(post.created, timezone.now())
+        self.assertEqual(
             post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )
 
         post = posts[1]
 
-        self.assertEquals(post.created, timezone.now())
-        self.assertEquals(
+        self.assertEqual(post.created, timezone.now())
+        self.assertEqual(
             post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
         )
 
     def test_entry_without_body(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_without_body, mock_stream) as builder:
+        with FeedBuilder(mock_without_body, mock_stream) as builder:
             builder.build()
             builder.save()
 
         posts = Post.objects.order_by("-publication_date")
 
-        self.assertEquals(Post.objects.count(), 2)
+        self.assertEqual(Post.objects.count(), 2)
 
         post = posts[0]
 
-        self.assertEquals(
+        self.assertEqual(
             post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
         )
-        self.assertEquals(
+        self.assertEqual(
             post.remote_identifier,
             "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
         )
-        self.assertEquals(post.body, "")
+        self.assertEqual(post.body, "")
 
         post = posts[1]
 
-        self.assertEquals(
+        self.assertEqual(
             post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
         )
-        self.assertEquals(
+        self.assertEqual(
             post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )
-        self.assertEquals(post.body, "")
+        self.assertEqual(post.body, "")
 
     def test_entry_without_author(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_without_author, mock_stream) as builder:
+        with FeedBuilder(mock_without_author, mock_stream) as builder:
             builder.build()
             builder.save()
 
         posts = Post.objects.order_by("-publication_date")
-        self.assertEquals(Post.objects.count(), 2)
+        self.assertEqual(Post.objects.count(), 2)
 
         post = posts[0]
 
-        self.assertEquals(post.created, timezone.now())
-        self.assertEquals(
+        self.assertEqual(post.created, timezone.now())
+        self.assertEqual(
             post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )
-        self.assertEquals(post.author, None)
+        self.assertEqual(post.author, None)
 
         post = posts[1]
 
-        self.assertEquals(post.created, timezone.now())
-        self.assertEquals(
+        self.assertEqual(post.created, timezone.now())
+        self.assertEqual(
             post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
         )
-        self.assertEquals(post.author, None)
+        self.assertEqual(post.author, None)
 
     def test_empty_entries(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_without_entries, mock_stream) as builder:
+        with FeedBuilder(mock_without_entries, mock_stream) as builder:
             builder.build()
             builder.save()
 
-        self.assertEquals(Post.objects.count(), 0)
+        self.assertEqual(Post.objects.count(), 0)
 
     def test_update_entries(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
@@ -303,36 +259,35 @@ class FeedBuilderTestCase(TestCase):
             remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule
         )
 
-        with builder(mock_with_update_entries, mock_stream) as builder:
+        with FeedBuilder(mock_with_update_entries, mock_stream) as builder:
             builder.build()
             builder.save()
 
-        self.assertEquals(Post.objects.count(), 3)
+        self.assertEqual(Post.objects.count(), 3)
 
         existing_first_post.refresh_from_db()
         existing_second_post.refresh_from_db()
 
-        self.assertEquals(
+        self.assertEqual(
             existing_first_post.title,
             "Trump's 'genocidal taunts' will not end Iran - Zarif",
         )
 
-        self.assertEquals(
+        self.assertEqual(
             existing_second_post.title, "Huawei's Android loss: How it affects you"
         )
 
     def test_html_sanitizing(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_with_html, mock_stream) as builder:
+        with FeedBuilder(mock_with_html, mock_stream) as builder:
             builder.build()
             builder.save()
 
         post = Post.objects.get()
 
-        self.assertEquals(Post.objects.count(), 1)
+        self.assertEqual(Post.objects.count(), 1)
 
         self.assertTrue("<article>" in post.body)
         self.assertTrue("<h1>" in post.body)
@@ -345,64 +300,60 @@ class FeedBuilderTestCase(TestCase):
         self.assertTrue("<iframe>" not in post.body)
 
     def test_long_author_text_is_truncated(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_with_long_author, mock_stream) as builder:
+        with FeedBuilder(mock_with_long_author, mock_stream) as builder:
             builder.build()
             builder.save()
 
         post = Post.objects.get()
 
-        self.assertEquals(Post.objects.count(), 1)
+        self.assertEqual(Post.objects.count(), 1)
 
-        self.assertEquals(len(post.author), 40)
+        self.assertEqual(len(post.author), 40)
 
     def test_long_title_text_is_truncated(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_with_long_title, mock_stream) as builder:
+        with FeedBuilder(mock_with_long_title, mock_stream) as builder:
             builder.build()
             builder.save()
 
         post = Post.objects.get()
 
-        self.assertEquals(Post.objects.count(), 1)
+        self.assertEqual(Post.objects.count(), 1)
 
-        self.assertEquals(len(post.title), 200)
+        self.assertEqual(len(post.title), 200)
         self.assertTrue(post.title.endswith("…"))
 
     def test_long_title_exotic_title(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_with_long_exotic_title, mock_stream) as builder:
+        with FeedBuilder(mock_with_long_exotic_title, mock_stream) as builder:
             builder.build()
             builder.save()
 
         post = Post.objects.get()
 
-        self.assertEquals(Post.objects.count(), 1)
+        self.assertEqual(Post.objects.count(), 1)
 
-        self.assertEquals(len(post.title), 200)
+        self.assertEqual(len(post.title), 200)
         self.assertTrue(post.title.endswith("…"))
 
     def test_content_detail_is_prioritized_if_longer(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_with_longer_content_detail, mock_stream) as builder:
+        with FeedBuilder(mock_with_longer_content_detail, mock_stream) as builder:
             builder.build()
             builder.save()
 
         post = Post.objects.get()
 
-        self.assertEquals(Post.objects.count(), 1)
+        self.assertEqual(Post.objects.count(), 1)
 
         self.assertFalse(
             "Foreign Minister Mohammad Javad Zarif says the US" in post.body
@@ -410,33 +361,31 @@ class FeedBuilderTestCase(TestCase):
         self.assertTrue("Federal Communications Commission" in post.body)
 
     def test_content_detail_is_not_prioritized_if_shorter(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_with_shorter_content_detail, mock_stream) as builder:
+        with FeedBuilder(mock_with_shorter_content_detail, mock_stream) as builder:
             builder.build()
             builder.save()
 
         post = Post.objects.get()
 
-        self.assertEquals(Post.objects.count(), 1)
+        self.assertEqual(Post.objects.count(), 1)
 
         self.assertTrue(
             "Foreign Minister Mohammad Javad Zarif says the US" in post.body
         )
 
     def test_content_detail_is_concatinated(self):
-        builder = FeedBuilder
         rule = FeedFactory()
         mock_stream = Mock(rule=rule)
 
-        with builder(mock_with_multiple_content_detail, mock_stream) as builder:
+        with FeedBuilder(mock_with_multiple_content_detail, mock_stream) as builder:
             builder.build()
             builder.save()
 
         post = Post.objects.get()
 
-        self.assertEquals(Post.objects.count(), 1)
+        self.assertEqual(Post.objects.count(), 1)
 
-        self.assertEquals(post.body, "Yippie\n Ya\n Yee")
+        self.assertEqual(post.body, "Yippie\n Ya\n Yee")
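The mechanical rename running through this test case is worth a note: assertEquals is a deprecated alias of assertEqual in unittest (it warns on modern Python and was removed in 3.12), so the new spelling silences warning noise without changing any assertion:

import unittest

class AliasDemo(unittest.TestCase):
    def test_alias(self):
        self.assertEqual(1 + 1, 2)  # canonical spelling
        # self.assertEquals(1 + 1, 2) would assert the same thing but emit a
        # DeprecationWarning (and fail with AttributeError on Python 3.12+).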
@@ -86,52 +86,6 @@ class RedditBuilderTestCase(TestCase):
 
         self.assertEquals(Post.objects.count(), 0)
 
-    def test_update_posts(self):
-        subreddit = SubredditFactory()
-        existing_post = RedditPostFactory(
-            remote_identifier="hm0qct",
-            author="Old author",
-            title="Old title",
-            body="Old body",
-            url="https://bbc.com/",
-            rule=subreddit,
-        )
-
-        builder = RedditBuilder
-        mock_stream = Mock(rule=subreddit)
-
-        with builder(simple_mock, mock_stream) as builder:
-            builder.build()
-            builder.save()
-
-        posts = {post.remote_identifier: post for post in Post.objects.all()}
-
-        self.assertCountEqual(
-            ("hm0qct", "hna75r", "hngs71", "hngsj8", "hnd7cy"), posts.keys()
-        )
-
-        existing_post.refresh_from_db()
-
-        self.assertEquals(existing_post.remote_identifier, "hm0qct")
-        self.assertEquals(existing_post.author, "AutoModerator")
-        self.assertEquals(
-            existing_post.title,
-            "Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
-        )
-        self.assertIn(
-            "This megathread is also to hear opinions from anyone just starting out "
-            "with Linux or those that have used Linux (GNU or otherwise) for a long time.",
-            existing_post.body,
-        )
-        self.assertEquals(
-            existing_post.publication_date,
-            pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
-        )
-        self.assertEquals(
-            existing_post.url,
-            "https://www.reddit.com/r/linux/comments/hm0qct/linux_experiencesrants_or_educationcertifications/",
-        )
-
     def test_html_sanitizing(self):
         builder = RedditBuilder
 
@@ -225,17 +179,6 @@ class RedditBuilderTestCase(TestCase):
             ("hm0qct", "hna75r", "hngs71", "hngsj8", "hnd7cy"), posts.keys()
         )
 
-        duplicate_post.refresh_from_db()
-
-        self.assertEquals(
-            duplicate_post.publication_date,
-            pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
-        )
-        self.assertEquals(
-            duplicate_post.title,
-            "Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
-        )
-
     def test_image_post(self):
         builder = RedditBuilder
 
@@ -2185,3 +2185,202 @@ unsanitized_mock = [
         },
     }
 ]
+
+broken_mock = [
+    {
+        "contributors": None,
+        "coordinates": None,
+        "created_at": "Fri Aug 07 00:17:05 +0000 2020",
+        "display_text_range": [11, 59],
+        "entities": {
+            "hashtags": [],
+            "symbols": [],
+            "urls": [
+                {
+                    "display_url": "youtu.be/rDy7tPf6CT8",
+                    "expanded_url": "https://youtu.be/rDy7tPf6CT8",
+                    "indices": [36, 59],
+                    "url": "https://t.co/trAcIxBMlX",
+                }
+            ],
+            "user_mentions": [
+                {
+                    "id": 975844884606275587,
+                    "id_str": "975844884606275587",
+                    "indices": [0, 10],
+                    "name": "ArieNeo",
+                    "screen_name": "ArieNeoSC",
+                }
+            ],
+        },
+        "favorite_count": 19,
+        "favorited": False,
+        # Note the missing full_text key here
+        "geo": None,
+        "id": 1291528756373286914,
+        "id_str": "1291528756373286914",
+        "in_reply_to_screen_name": "ArieNeoSC",
+        "in_reply_to_status_id": 1291507356313038850,
+        "in_reply_to_status_id_str": "1291507356313038850",
+        "in_reply_to_user_id": 975844884606275587,
+        "in_reply_to_user_id_str": "975844884606275587",
+        "is_quote_status": False,
+        "lang": "en",
+        "place": None,
+        "possibly_sensitive": False,
+        "retweet_count": 5,
+        "retweeted": False,
+        "source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
+        "truncated": False,
+        "user": {
+            "contributors_enabled": False,
+            "created_at": "Wed Sep 05 00:58:11 +0000 2012",
+            "default_profile": False,
+            "default_profile_image": False,
+            "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
+            "entities": {
+                "description": {"urls": []},
+                "url": {
+                    "urls": [
+                        {
+                            "display_url": "robertsspaceindustries.com",
+                            "expanded_url": "http://www.robertsspaceindustries.com",
+                            "indices": [0, 23],
+                            "url": "https://t.co/iqO6apof3y",
+                        }
+                    ]
+                },
+            },
+            "favourites_count": 4588,
+            "follow_request_sent": None,
+            "followers_count": 106169,
+            "following": None,
+            "friends_count": 201,
+            "geo_enabled": False,
+            "has_extended_profile": False,
+            "id": 803542770,
+            "id_str": "803542770",
+            "is_translation_enabled": False,
+            "is_translator": False,
+            "lang": None,
+            "listed_count": 890,
+            "location": "Roberts Space Industries",
+            "name": "Star Citizen",
+            "notifications": None,
+            "profile_background_color": "131516",
+            "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
+            "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
+            "profile_background_tile": False,
+            "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
+            "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+            "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+            "profile_link_color": "0A5485",
+            "profile_sidebar_border_color": "FFFFFF",
+            "profile_sidebar_fill_color": "EFEFEF",
+            "profile_text_color": "333333",
+            "profile_use_background_image": True,
+            "protected": False,
+            "screen_name": "RobertsSpaceInd",
+            "statuses_count": 6210,
+            "time_zone": None,
+            "translator_type": "none",
+            "url": "https://t.co/iqO6apof3y",
+            "utc_offset": None,
+            "verified": True,
+        },
+    },
+    {
+        "contributors": None,
+        "coordinates": None,
+        "created_at": "Wed Jul 29 19:01:47 +0000 2020",
+        "display_text_range": [10, 98],
+        "entities": {
+            "hashtags": [],
+            "symbols": [],
+            "urls": [],
+            "user_mentions": [
+                {
+                    "id": 435221600,
+                    "id_str": "435221600",
+                    "indices": [0, 9],
+                    "name": "Christopher Blough",
+                    "screen_name": "RelicCcb",
+                }
+            ],
+        },
+        "favorite_count": 1,
+        "favorited": False,
+        "full_text": "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing.",
+        "geo": None,
+        "id": 1288550304095416320,
+        "id_str": "1288550304095416320",
+        "in_reply_to_screen_name": "RelicCcb",
+        "in_reply_to_status_id": 1288475147951898625,
+        "in_reply_to_status_id_str": "1288475147951898625",
+        "in_reply_to_user_id": 435221600,
+        "in_reply_to_user_id_str": "435221600",
+        "is_quote_status": False,
+        "lang": "en",
+        "place": None,
+        "retweet_count": 0,
+        "retweeted": False,
+        "source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
+        "truncated": False,
+        "user": {
+            "contributors_enabled": False,
+            "created_at": "Wed Sep 05 00:58:11 +0000 2012",
+            "default_profile": False,
+            "default_profile_image": False,
+            "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
+            "entities": {
+                "description": {"urls": []},
+                "url": {
+                    "urls": [
+                        {
+                            "display_url": "robertsspaceindustries.com",
+                            "expanded_url": "http://www.robertsspaceindustries.com",
+                            "indices": [0, 23],
+                            "url": "https://t.co/iqO6apof3y",
+                        }
+                    ]
+                },
+            },
+            "favourites_count": 4588,
+            "follow_request_sent": None,
+            "followers_count": 106169,
+            "following": None,
+            "friends_count": 201,
+            "geo_enabled": False,
+            "has_extended_profile": False,
+            "id": 803542770,
+            "id_str": "803542770",
+            "is_translation_enabled": False,
+            "is_translator": False,
+            "lang": None,
+            "listed_count": 890,
+            "location": "Roberts Space Industries",
+            "name": "Star Citizen",
+            "notifications": None,
+            "profile_background_color": "131516",
+            "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
+            "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
+            "profile_background_tile": False,
+            "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
+            "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+            "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+            "profile_link_color": "0A5485",
+            "profile_sidebar_border_color": "FFFFFF",
+            "profile_sidebar_fill_color": "EFEFEF",
+            "profile_text_color": "333333",
+            "profile_use_background_image": True,
+            "protected": False,
+            "screen_name": "RobertsSpaceInd",
+            "statuses_count": 6210,
+            "time_zone": None,
+            "translator_type": "none",
+            "url": "https://t.co/iqO6apof3y",
+            "utc_offset": None,
+            "verified": True,
+        },
+    },
+]
@@ -10,6 +10,7 @@ from ftfy import fix_text
 
 from newsreader.news.collection.tests.factories import TwitterTimelineFactory
 from newsreader.news.collection.tests.twitter.builder.mocks import (
+    broken_mock,
     gif_mock,
     image_mock,
     quoted_mock,
@@ -410,3 +411,21 @@ class TwitterBuilderTestCase(TestCase):
             builder.save()
 
         self.assertEquals(Post.objects.count(), 2)
+
+    def test_bad_post(self):
+        """
+        Tests that the builder ignores posts that are missing data
+        """
+        builder = TwitterBuilder
+
+        profile = TwitterTimelineFactory(screen_name="RobertsSpaceInd")
+        mock_stream = Mock(rule=profile)
+
+        with builder(broken_mock, mock_stream) as builder:
+            builder.build()
+            builder.save()
+
+        self.assertCountEqual(
+            Post.objects.values_list("remote_identifier", flat=True),
+            ["1288550304095416320"],
+        )
@@ -22,6 +22,10 @@ from newsreader.news.collection.base import (
 )
 from newsreader.news.collection.choices import RuleTypeChoices, TwitterPostTypeChoices
 from newsreader.news.collection.exceptions import (
+    BuilderDuplicateException,
+    BuilderException,
+    BuilderMissingDataException,
+    BuilderParseException,
     StreamDeniedException,
     StreamException,
     StreamNotFoundException,
@@ -48,43 +52,69 @@ class TwitterBuilder(PostBuilder):
 
     def build(self):
         results = {}
-        rule = self.stream.rule
 
         for post in self.payload:
-            remote_identifier = post["id_str"]
-
-            if remote_identifier in self.existing_posts:
+            try:
+                post = self.build_post(post)
+            except BuilderException:
+                logger.exception("Failed building post")
                 continue
 
-            url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
-            body = urlize(post["full_text"], nofollow=True)
+            identifier = post.remote_identifier
+            results[identifier] = post
+
+        self.instances = results.values()
+
+    def build_post(self, data):
+        remote_identifier = data.get("id_str", "")
+        rule = self.stream.rule
+
+        if remote_identifier in self.existing_posts:
+            raise BuilderDuplicateException(payload=data)
+
+        try:
+            body = urlize(data["full_text"], nofollow=True)
             title = truncate_text(
-                Post, "title", self.sanitize_fragment(post["full_text"])
+                Post, "title", self.sanitize_fragment(data["full_text"])
             )
 
             publication_date = pytz.utc.localize(
-                datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
+                datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
             )
+        except KeyError as e:
+            raise BuilderMissingDataException(payload=data) from e
+        except (OverflowError, OSError) as e:
+            raise BuilderParseException(payload=data) from e
 
-            if "extended_entities" in post:
-                try:
-                    media_entities = self.get_media_entities(post)
-                    body += media_entities
-                except KeyError:
-                    logger.exception(f"Failed parsing media_entities for {url}")
+        url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
 
-            if "retweeted_status" in post:
-                original_post = post["retweeted_status"]
+        if "extended_entities" in data:
+            try:
+                media_entities = self.get_media_entities(data)
+                body += media_entities
+            except KeyError as e:
+                raise BuilderMissingDataException(
+                    message="Failed parsing data for media entities", payload=data
+                ) from e
+
+        try:
+            if "retweeted_status" in data:
+                original_post = data["retweeted_status"]
                 original_tweet = urlize(original_post["full_text"], nofollow=True)
                 body = f"{body} <br><div>Original tweet: {original_tweet}</div>"
-            if "quoted_status" in post:
-                original_post = post["quoted_status"]
+            if "quoted_status" in data:
+                original_post = data["quoted_status"]
                 original_tweet = urlize(original_post["full_text"], nofollow=True)
                 body = f"{body} <br><div>Quoted tweet: {original_tweet}</div>"
+        except KeyError as e:
+            raise BuilderMissingDataException(
+                message="Failed parsing data for original tweet", payload=data
+            ) from e
 
-            body = self.sanitize_fragment(body)
+        body = self.sanitize_fragment(body)
 
-            data = {
+        return Post(
+            **{
                 "remote_identifier": remote_identifier,
                 "title": fix_text(title),
                 "body": fix_text(body),
@@ -93,13 +123,10 @@ class TwitterBuilder(PostBuilder):
                 "url": url,
                 "rule": rule,
             }
+        )
 
-            results[remote_identifier] = Post(**data)
-
-        self.instances = results.values()
-
-    def get_media_entities(self, post):
-        media_entities = post["extended_entities"]["media"]
+    def get_media_entities(self, data):
+        media_entities = data["extended_entities"]["media"]
         formatted_entities = ""
 
         for media_entity in media_entities:
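build_post now funnels every missing key into BuilderMissingDataException, which is exactly what the new broken_mock fixture (missing full_text) exercises. The created_at layout it parses is Twitter's classic timestamp format; a quick standalone check using a value from the fixture:

from datetime import datetime

import pytz

created_at = "Fri Aug 07 00:17:05 +0000 2020"  # taken from broken_mock
parsed = pytz.utc.localize(
    datetime.strptime(created_at, "%a %b %d %H:%M:%S +0000 %Y")
)
print(parsed.isoformat())  # 2020-08-07T00:17:05+00:00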
@@ -70,6 +70,8 @@
 
     & img, video {
         padding: 10px 0;
 
+        width: max-content;
+        max-width: 100%;
     }
@@ -16,7 +16,7 @@
 {% if request.user.is_authenticated %}
     <li class="nav__item"><a href="{% url 'index' %}">Home</a></li>
     <li class="nav__item"><a href="{% url 'news:core:categories' %}">Categories</a></li>
-    <li class="nav__item"><a href="{% url 'news:collection:rules' %}">Feeds</a></li>
+    <li class="nav__item"><a href="{% url 'news:collection:rules' %}">Sources</a></li>
    <li class="nav__item"><a href="{% url 'accounts:settings:home' %}">Settings</a></li>
     {% if request.user.is_superuser %}
         <li class="nav__item"><a href="{% url 'admin:index' %}">Admin</a></li>