diff --git a/src/newsreader/news/collection/exceptions/__init__.py b/src/newsreader/news/collection/exceptions/__init__.py
new file mode 100644
index 0000000..35ce72d
--- /dev/null
+++ b/src/newsreader/news/collection/exceptions/__init__.py
@@ -0,0 +1,16 @@
+from newsreader.news.collection.exceptions.builder import (
+ BuilderDuplicateException,
+ BuilderException,
+ BuilderMissingDataException,
+ BuilderParseException,
+)
+from newsreader.news.collection.exceptions.stream import (
+ StreamConnectionException,
+ StreamDeniedException,
+ StreamException,
+ StreamForbiddenException,
+ StreamNotFoundException,
+ StreamParseException,
+ StreamTimeOutException,
+ StreamTooManyException,
+)
diff --git a/src/newsreader/news/collection/exceptions/builder.py b/src/newsreader/news/collection/exceptions/builder.py
new file mode 100644
index 0000000..6fb2d60
--- /dev/null
+++ b/src/newsreader/news/collection/exceptions/builder.py
@@ -0,0 +1,21 @@
+class BuilderException(Exception):
+ message = "Builder exception"
+
+ def __init__(self, payload=None, message=None):
+ self.payload = payload
+ self.message = message if message else self.message
+
+ def __str__(self):
+ return self.message
+
+
+class BuilderMissingDataException(BuilderException):
+ message = "Payload contains missing data"
+
+
+class BuilderDuplicateException(BuilderException):
+ message = "Payload contains duplicate entry"
+
+
+class BuilderParseException(BuilderException):
+ message = "Failed to parse payload"
diff --git a/src/newsreader/news/collection/exceptions.py b/src/newsreader/news/collection/exceptions/stream.py
similarity index 100%
rename from src/newsreader/news/collection/exceptions.py
rename to src/newsreader/news/collection/exceptions/stream.py
diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py
index ae6cd42..379f18e 100644
--- a/src/newsreader/news/collection/feed.py
+++ b/src/newsreader/news/collection/feed.py
@@ -39,6 +39,18 @@ class FeedBuilder(PostBuilder):
rule__type = RuleTypeChoices.feed
def build(self):
+ instances = []
+
+ with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
+ entries = self.payload.get("entries", [])
+
+ for entry in entries:
+ post = self.build_post(entry)
+ instances.append(post)
+
+ self.instances = duplicate_handler.check(instances)
+
+ def build_post(self, entry):
field_mapping = {
"id": "remote_identifier",
"title": "title",
@@ -48,41 +60,37 @@ class FeedBuilder(PostBuilder):
"author": "author",
}
tz = pytz.timezone(self.stream.rule.timezone)
- instances = []
+ data = {"rule_id": self.stream.rule.pk}
- with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
- entries = self.payload.get("entries", [])
+ for field, model_field in field_mapping.items():
+ if not field in entry:
+ continue
- for entry in entries:
- data = {"rule_id": self.stream.rule.pk}
+ value = truncate_text(Post, model_field, entry[field])
- for field, model_field in field_mapping.items():
- if not field in entry:
- continue
+ if field == "published_parsed":
+ data[model_field] = build_publication_date(value, tz)
+ elif field == "summary":
+ data[model_field] = self.sanitize_fragment(value)
+ else:
+ data[model_field] = value
- value = truncate_text(Post, model_field, entry[field])
+ content_details = self.get_content_details(entry)
- if field == "published_parsed":
- data[model_field] = build_publication_date(value, tz)
- elif field == "summary":
- data[model_field] = self.sanitize_fragment(value)
- else:
- data[model_field] = value
+ # use content details key if it contains more information
+ if not "body" in data or len(data["body"]) < len(content_details):
+ data["body"] = content_details
- if "content" in entry:
- content = self.get_content(entry["content"])
- body = data.get("body", "")
+ return Post(**data)
- if not body or len(body) < len(content):
- data["body"] = content
+ def get_content_details(self, entry):
+ content_items = entry.get("content")
- instances.append(Post(**data))
+ if not content_items:
+ return ""
- self.instances = duplicate_handler.check(instances)
-
- def get_content(self, items):
- content = "\n ".join([item.get("value") for item in items])
- return self.sanitize_fragment(content)
+ content_details = "\n ".join([item.get("value") for item in content_items])
+ return self.sanitize_fragment(content_details)
class FeedStream(PostStream):
diff --git a/src/newsreader/news/collection/reddit.py b/src/newsreader/news/collection/reddit.py
index daeb85f..1fbffe2 100644
--- a/src/newsreader/news/collection/reddit.py
+++ b/src/newsreader/news/collection/reddit.py
@@ -28,6 +28,10 @@ from newsreader.news.collection.constants import (
WHITELISTED_TAGS,
)
from newsreader.news.collection.exceptions import (
+ BuilderDuplicateException,
+ BuilderException,
+ BuilderMissingDataException,
+ BuilderParseException,
StreamDeniedException,
StreamException,
StreamParseException,
@@ -122,99 +126,136 @@ class RedditBuilder(PostBuilder):
if not "data" in self.payload or not "children" in self.payload["data"]:
return
- posts = self.payload["data"]["children"]
- rule = self.stream.rule
-
- for post in posts:
- if not "data" in post or post["kind"] != REDDIT_POST:
- continue
-
- data = post["data"]
-
- remote_identifier = data["id"]
- title = truncate_text(Post, "title", data["title"])
- author = truncate_text(Post, "author", data["author"])
- post_url_fragment = data["permalink"]
- direct_url = data["url"]
- is_text_post = data["is_self"]
-
- if remote_identifier in results:
- continue
-
- if is_text_post:
- uncleaned_body = data["selftext_html"]
- unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
- body = self.sanitize_fragment(unescaped_body) if unescaped_body else ""
- elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
- body = format_html(
- "

",
- url=direct_url,
- title=title,
- )
- elif data["is_video"]:
- video_info = data["secure_media"]["reddit_video"]
-
- body = format_html(
- "",
- url=video_info["fallback_url"],
- )
- elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
- extension = next(
- extension.replace(".", "")
- for extension in REDDIT_VIDEO_EXTENSIONS
- if direct_url.endswith(extension)
- )
-
- if extension == "gifv":
- body = format_html(
- "",
- url=direct_url.replace(extension, "mp4"),
- )
- else:
- body = format_html(
- "",
- url=direct_url,
- extension=extension,
- )
- else:
- body = format_html(
- "",
- url=direct_url,
- title=title,
- )
+ entries = self.payload["data"]["children"]
+ for entry in entries:
try:
- parsed_date = datetime.fromtimestamp(post["data"]["created_utc"])
- created_date = pytz.utc.localize(parsed_date)
- except (OverflowError, OSError):
- logging.warning(
- f"Failed parsing timestamp from {REDDIT_URL}{post_url_fragment}"
- )
- created_date = timezone.now()
-
- post_data = {
- "remote_identifier": remote_identifier,
- "title": title,
- "body": body,
- "author": author,
- "url": f"{REDDIT_URL}{post_url_fragment}",
- "publication_date": created_date,
- "rule": rule,
- }
-
- if remote_identifier in self.existing_posts:
- existing_post = self.existing_posts[remote_identifier]
-
- for key, value in post_data.items():
- setattr(existing_post, key, value)
-
- results[existing_post.remote_identifier] = existing_post
+ post = self.build_post(entry)
+ except BuilderException:
+ logging.exception("Failed building post")
continue
- results[remote_identifier] = Post(**post_data)
+ identifier = post.remote_identifier
+ results[identifier] = post
self.instances = results.values()
+ def build_post(self, entry):
+ rule = self.stream.rule
+ entry_data = entry.get("data", {})
+ remote_identifier = entry_data.get("id", "")
+ kind = entry.get("kind")
+
+ if remote_identifier in self.existing_posts:
+ raise BuilderDuplicateException(payload=entry)
+ elif kind != REDDIT_POST:
+ raise BuilderParseException(
+ message=f"Payload is not a reddit post, it is of kind {kind}",
+ payload=entry,
+ )
+ elif not entry_data:
+ raise BuilderMissingDataException(
+ message=f"Post {remote_identifier} did not contain any data",
+ payload=entry,
+ )
+
+ try:
+ title = entry_data["title"]
+ author = entry_data["author"]
+ post_url_fragment = entry_data["permalink"]
+ direct_url = entry_data["url"]
+ is_text = entry_data["is_self"]
+ is_video = entry_data["is_video"]
+ except KeyError as e:
+ raise BuilderMissingDataException(payload=entry) from e
+
+ title = truncate_text(Post, "title", title)
+ author = truncate_text(Post, "author", author)
+
+ if is_text:
+ body = self.get_text_post(entry_data)
+ elif direct_url.endswith(REDDIT_IMAGE_EXTENSIONS):
+ body = self.get_image_post(title, direct_url)
+ elif is_video:
+ body = self.get_native_video_post(entry_data)
+ elif direct_url.endswith(REDDIT_VIDEO_EXTENSIONS):
+ body = self.get_video_post(direct_url)
+ else:
+ body = self.get_url_post(title, direct_url)
+
+ try:
+ parsed_date = datetime.fromtimestamp(entry_data["created_utc"])
+ created_date = pytz.utc.localize(parsed_date)
+ except (OverflowError, OSError) as e:
+ raise BuilderParseException(payload=entry) from e
+ except KeyError as e:
+ raise BuilderMissingDataException(payload=entry) from e
+
+ post_entry = {
+ "remote_identifier": remote_identifier,
+ "title": title,
+ "body": body,
+ "author": author,
+ "url": f"{REDDIT_URL}{post_url_fragment}",
+ "publication_date": created_date,
+ "rule": rule,
+ }
+
+ return Post(**post_entry)
+
+ def get_text_post(self, entry):
+ try:
+ uncleaned_body = entry["selftext_html"]
+ except KeyError as e:
+ raise BuilderMissingDataException(payload=entry) from e
+
+ unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
+ return self.sanitize_fragment(unescaped_body) if unescaped_body else ""
+
+ def get_image_post(self, title, url):
+ return format_html(
+ '<img src="{url}" alt="{title}">',
+ url=url,
+ title=title,
+ )
+
+ def get_native_video_post(self, entry):
+ try:
+ video_info = entry["secure_media"]["reddit_video"]
+ except KeyError as e:
+ raise BuilderMissingDataException(payload=entry) from e
+
+ return format_html(
+ '<video src="{url}" controls></video>',
+ url=video_info["fallback_url"],
+ )
+
+ def get_video_post(self, url):
+ extension = next(
+ extension.replace(".", "")
+ for extension in REDDIT_VIDEO_EXTENSIONS
+ if url.endswith(extension)
+ )
+
+ if extension == "gifv":
+ return format_html(
+ '<video src="{url}" controls></video>',
+ url=url.replace(extension, "mp4"),
+ )
+
+ return format_html(
+ '<video controls><source src="{url}" type="video/{extension}"></video>',
+ url=url,
+ extension=extension,
+ )
+
+ def get_url_post(self, title, url):
+ return format_html(
+ '<a href="{url}">{title}</a>',
+ url=url,
+ title=title,
+ )
+
class RedditStream(PostStream):
rule_type = RuleTypeChoices.subreddit
diff --git a/src/newsreader/news/collection/tests/feed/builder/tests.py b/src/newsreader/news/collection/tests/feed/builder/tests.py
index 571a7cd..7f4edf0 100644
--- a/src/newsreader/news/collection/tests/feed/builder/tests.py
+++ b/src/newsreader/news/collection/tests/feed/builder/tests.py
@@ -1,4 +1,4 @@
-from datetime import date, datetime, time
+from datetime import datetime
from unittest.mock import Mock
from django.test import TestCase
@@ -21,277 +21,233 @@ class FeedBuilderTestCase(TestCase):
def setUp(self):
self.maxDiff = None
- def test_basic_entry(self):
- builder = FeedBuilder
- rule = FeedFactory()
- mock_stream = Mock(rule=rule)
-
- with builder(simple_mock, mock_stream) as builder:
- builder.build()
- builder.save()
-
- post = Post.objects.get()
-
- publication_date = datetime.combine(
- date(2019, 5, 20), time(hour=16, minute=7, second=37)
- )
- aware_date = pytz.utc.localize(publication_date)
-
- self.assertEquals(post.publication_date, aware_date)
- self.assertEquals(Post.objects.count(), 1)
-
- self.assertEquals(
- post.remote_identifier,
- "https://www.bbc.co.uk/news/world-us-canada-48338168",
- )
-
- self.assertEquals(
- post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
- )
-
- self.assertEquals(
- post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
- )
-
def test_multiple_entries(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(multiple_mock, mock_stream) as builder:
+ with FeedBuilder(multiple_mock, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
- self.assertEquals(Post.objects.count(), 3)
+ self.assertEqual(Post.objects.count(), 3)
post = posts[0]
- publication_date = datetime.combine(
- date(2019, 5, 20), time(hour=16, minute=32, second=38)
+ publication_date = datetime(
+ 2019, 5, 20, hour=16, minute=32, second=38, tzinfo=pytz.utc
)
- aware_date = pytz.utc.localize(publication_date)
- self.assertEquals(
+ self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
- aware_date.strftime("%Y-%m-%d %H:%M:%S"),
+ publication_date.strftime("%Y-%m-%d %H:%M:%S"),
)
- self.assertEquals(
+ self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
)
- self.assertEquals(
+ self.assertEqual(
post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
)
- self.assertEquals(
+ self.assertEqual(
post.title, "Birmingham head teacher threatened over LGBT lessons"
)
post = posts[1]
- publication_date = datetime.combine(
- date(2019, 5, 20), time(hour=16, minute=7, second=37)
+ publication_date = datetime(
+ 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
)
- aware_date = pytz.utc.localize(publication_date)
- self.assertEquals(
+ self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
- aware_date.strftime("%Y-%m-%d %H:%M:%S"),
+ publication_date.strftime("%Y-%m-%d %H:%M:%S"),
)
- self.assertEquals(
+ self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
- self.assertEquals(
+ self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
- self.assertEquals(
+ self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
def test_entries_without_remote_identifier(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(mock_without_identifier, mock_stream) as builder:
+ with FeedBuilder(mock_without_identifier, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
- self.assertEquals(Post.objects.count(), 2)
+ self.assertEqual(Post.objects.count(), 2)
post = posts[0]
- publication_date = datetime.combine(
- date(2019, 5, 20), time(hour=16, minute=7, second=37)
+ publication_date = datetime(
+ 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
)
- aware_date = pytz.utc.localize(publication_date)
- self.assertEquals(post.publication_date, aware_date)
- self.assertEquals(post.remote_identifier, None)
- self.assertEquals(
+ self.assertEqual(post.publication_date, publication_date)
+ self.assertEqual(post.remote_identifier, None)
+ self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
- self.assertEquals(
+ self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
post = posts[1]
- publication_date = datetime.combine(
- date(2019, 5, 20), time(hour=12, minute=19, second=19)
+ publication_date = datetime(
+ 2019, 5, 20, hour=12, minute=19, second=19, tzinfo=pytz.utc
)
- aware_date = pytz.utc.localize(publication_date)
- self.assertEquals(post.publication_date, aware_date)
- self.assertEquals(post.remote_identifier, None)
- self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739")
- self.assertEquals(post.title, "Huawei's Android loss: How it affects you")
+ self.assertEqual(post.publication_date, publication_date)
+ self.assertEqual(post.remote_identifier, None)
+ self.assertEqual(post.url, "https://www.bbc.co.uk/news/technology-48334739")
+ self.assertEqual(post.title, "Huawei's Android loss: How it affects you")
def test_entry_without_publication_date(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(mock_without_publish_date, mock_stream) as builder:
+ with FeedBuilder(mock_without_publish_date, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
- self.assertEquals(Post.objects.count(), 2)
+ self.assertEqual(Post.objects.count(), 2)
post = posts[0]
- self.assertEquals(
+ self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
)
- self.assertEquals(post.created, timezone.now())
- self.assertEquals(
+ self.assertEqual(post.created, timezone.now())
+ self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
post = posts[1]
- self.assertEquals(
+ self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
)
- self.assertEquals(post.created, timezone.now())
- self.assertEquals(
+ self.assertEqual(post.created, timezone.now())
+ self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
def test_entry_without_url(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(mock_without_url, mock_stream) as builder:
+ with FeedBuilder(mock_without_url, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
- self.assertEquals(Post.objects.count(), 2)
+ self.assertEqual(Post.objects.count(), 2)
post = posts[0]
- self.assertEquals(post.created, timezone.now())
- self.assertEquals(
+ self.assertEqual(post.created, timezone.now())
+ self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
post = posts[1]
- self.assertEquals(post.created, timezone.now())
- self.assertEquals(
+ self.assertEqual(post.created, timezone.now())
+ self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
def test_entry_without_body(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(mock_without_body, mock_stream) as builder:
+ with FeedBuilder(mock_without_body, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
- self.assertEquals(Post.objects.count(), 2)
+ self.assertEqual(Post.objects.count(), 2)
post = posts[0]
- self.assertEquals(
+ self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
)
- self.assertEquals(
+ self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
)
- self.assertEquals(post.body, "")
+ self.assertEqual(post.body, "")
post = posts[1]
- self.assertEquals(
+ self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
)
- self.assertEquals(
+ self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
- self.assertEquals(post.body, "")
+ self.assertEqual(post.body, "")
def test_entry_without_author(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(mock_without_author, mock_stream) as builder:
+ with FeedBuilder(mock_without_author, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
- self.assertEquals(Post.objects.count(), 2)
+ self.assertEqual(Post.objects.count(), 2)
post = posts[0]
- self.assertEquals(post.created, timezone.now())
- self.assertEquals(
+ self.assertEqual(post.created, timezone.now())
+ self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
- self.assertEquals(post.author, None)
+ self.assertEqual(post.author, None)
post = posts[1]
- self.assertEquals(post.created, timezone.now())
- self.assertEquals(
+ self.assertEqual(post.created, timezone.now())
+ self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
- self.assertEquals(post.author, None)
+ self.assertEqual(post.author, None)
def test_empty_entries(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(mock_without_entries, mock_stream) as builder:
+ with FeedBuilder(mock_without_entries, mock_stream) as builder:
builder.build()
builder.save()
- self.assertEquals(Post.objects.count(), 0)
+ self.assertEqual(Post.objects.count(), 0)
def test_update_entries(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
@@ -303,36 +259,35 @@ class FeedBuilderTestCase(TestCase):
remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule
)
- with builder(mock_with_update_entries, mock_stream) as builder:
+ with FeedBuilder(mock_with_update_entries, mock_stream) as builder:
builder.build()
builder.save()
- self.assertEquals(Post.objects.count(), 3)
+ self.assertEqual(Post.objects.count(), 3)
existing_first_post.refresh_from_db()
existing_second_post.refresh_from_db()
- self.assertEquals(
+ self.assertEqual(
existing_first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif",
)
- self.assertEquals(
+ self.assertEqual(
existing_second_post.title, "Huawei's Android loss: How it affects you"
)
def test_html_sanitizing(self):
- builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
- with builder(mock_with_html, mock_stream) as builder:
+ with FeedBuilder(mock_with_html, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
- self.assertEquals(Post.objects.count(), 1)
+ self.assertEqual(Post.objects.count(), 1)
self.assertTrue("" in post.body)
self.assertTrue("" in post.body)
@@ -345,64 +300,60 @@ class FeedBuilderTestCase(TestCase):
self.assertTrue("