Refactor FeedBuilder

This commit is contained in:
Sonny Bakker 2020-10-08 22:51:36 +02:00
parent 1a7279c533
commit 90553168df
2 changed files with 144 additions and 163 deletions

View file

@ -39,6 +39,23 @@ class FeedBuilder(PostBuilder):
rule__type = RuleTypeChoices.feed
def build(self):
instances = []
with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
entries = self.payload.get("entries", [])
for entry in entries:
try:
post = self.build_post(entry)
except KeyError:
logger.exception(f"Failed building post")
continue
instances.append(post)
self.instances = duplicate_handler.check(instances)
def build_post(self, entry):
field_mapping = {
"id": "remote_identifier",
"title": "title",
@ -48,12 +65,6 @@ class FeedBuilder(PostBuilder):
"author": "author",
}
tz = pytz.timezone(self.stream.rule.timezone)
instances = []
with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
entries = self.payload.get("entries", [])
for entry in entries:
data = {"rule_id": self.stream.rule.pk}
for field, model_field in field_mapping.items():
@ -69,20 +80,22 @@ class FeedBuilder(PostBuilder):
else:
data[model_field] = value
if "content" in entry:
content = self.get_content(entry["content"])
body = data.get("body", "")
content_details = self.get_content_details(entry)
if not body or len(body) < len(content):
data["body"] = content
# use content details key if it contains more information
if not "body" in data or len(data["body"]) < len(content_details):
data["body"] = content_details
instances.append(Post(**data))
return Post(**data)
self.instances = duplicate_handler.check(instances)
def get_content_details(self, entry):
content_items = entry.get("content")
def get_content(self, items):
content = "\n ".join([item.get("value") for item in items])
return self.sanitize_fragment(content)
if not content_items:
return ""
content_details = "\n ".join([item.get("value") for item in content_items])
return self.sanitize_fragment(content_details)
class FeedStream(PostStream):

View file

@ -1,5 +1,6 @@
from datetime import date, datetime, time
from unittest.mock import Mock
from datetime import datetime
from unittest.mock import Mock, patch
from uuid import uuid4
from django.test import TestCase
from django.utils import timezone
@ -21,277 +22,233 @@ class FeedBuilderTestCase(TestCase):
def setUp(self):
self.maxDiff = None
def test_basic_entry(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(simple_mock, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=7, second=37)
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date)
self.assertEquals(Post.objects.count(), 1)
self.assertEquals(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
def test_multiple_entries(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(multiple_mock, mock_stream) as builder:
with FeedBuilder(multiple_mock, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 3)
self.assertEqual(Post.objects.count(), 3)
post = posts[0]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=32, second=38)
publication_date = datetime(
2019, 5, 20, hour=16, minute=32, second=38, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
publication_date.strftime("%Y-%m-%d %H:%M:%S"),
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
)
self.assertEquals(
self.assertEqual(
post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
)
self.assertEquals(
self.assertEqual(
post.title, "Birmingham head teacher threatened over LGBT lessons"
)
post = posts[1]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=7, second=37)
publication_date = datetime(
2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
publication_date.strftime("%Y-%m-%d %H:%M:%S"),
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(
self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
def test_entries_without_remote_identifier(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_identifier, mock_stream) as builder:
with FeedBuilder(mock_without_identifier, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=16, minute=7, second=37)
publication_date = datetime(
2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date)
self.assertEquals(post.remote_identifier, None)
self.assertEquals(
self.assertEqual(post.publication_date, publication_date)
self.assertEqual(post.remote_identifier, None)
self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
post = posts[1]
publication_date = datetime.combine(
date(2019, 5, 20), time(hour=12, minute=19, second=19)
publication_date = datetime(
2019, 5, 20, hour=12, minute=19, second=19, tzinfo=pytz.utc
)
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date)
self.assertEquals(post.remote_identifier, None)
self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739")
self.assertEquals(post.title, "Huawei's Android loss: How it affects you")
self.assertEqual(post.publication_date, publication_date)
self.assertEqual(post.remote_identifier, None)
self.assertEqual(post.url, "https://www.bbc.co.uk/news/technology-48334739")
self.assertEqual(post.title, "Huawei's Android loss: How it affects you")
def test_entry_without_publication_date(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_publish_date, mock_stream) as builder:
with FeedBuilder(mock_without_publish_date, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
)
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
post = posts[1]
self.assertEquals(
self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
)
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
def test_entry_without_url(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_url, mock_stream) as builder:
with FeedBuilder(mock_without_url, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
post = posts[1]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
def test_entry_without_body(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_body, mock_stream) as builder:
with FeedBuilder(mock_without_body, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(
self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
)
self.assertEquals(post.body, "")
self.assertEqual(post.body, "")
post = posts[1]
self.assertEquals(
self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
)
self.assertEquals(
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(post.body, "")
self.assertEqual(post.body, "")
def test_entry_without_author(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_author, mock_stream) as builder:
with FeedBuilder(mock_without_author, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2)
self.assertEqual(Post.objects.count(), 2)
post = posts[0]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168",
)
self.assertEquals(post.author, None)
self.assertEqual(post.author, None)
post = posts[1]
self.assertEquals(post.created, timezone.now())
self.assertEquals(
self.assertEqual(post.created, timezone.now())
self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
self.assertEquals(post.author, None)
self.assertEqual(post.author, None)
def test_empty_entries(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_without_entries, mock_stream) as builder:
with FeedBuilder(mock_without_entries, mock_stream) as builder:
builder.build()
builder.save()
self.assertEquals(Post.objects.count(), 0)
self.assertEqual(Post.objects.count(), 0)
def test_update_entries(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
@ -303,36 +260,35 @@ class FeedBuilderTestCase(TestCase):
remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule
)
with builder(mock_with_update_entries, mock_stream) as builder:
with FeedBuilder(mock_with_update_entries, mock_stream) as builder:
builder.build()
builder.save()
self.assertEquals(Post.objects.count(), 3)
self.assertEqual(Post.objects.count(), 3)
existing_first_post.refresh_from_db()
existing_second_post.refresh_from_db()
self.assertEquals(
self.assertEqual(
existing_first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif",
)
self.assertEquals(
self.assertEqual(
existing_second_post.title, "Huawei's Android loss: How it affects you"
)
def test_html_sanitizing(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_html, mock_stream) as builder:
with FeedBuilder(mock_with_html, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertTrue("<article>" in post.body)
self.assertTrue("<h1>" in post.body)
@ -345,64 +301,60 @@ class FeedBuilderTestCase(TestCase):
self.assertTrue("<iframe>" not in post.body)
def test_long_author_text_is_truncated(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_long_author, mock_stream) as builder:
with FeedBuilder(mock_with_long_author, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.author), 40)
self.assertEqual(len(post.author), 40)
def test_long_title_text_is_truncated(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_long_title, mock_stream) as builder:
with FeedBuilder(mock_with_long_title, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.title), 200)
self.assertEqual(len(post.title), 200)
self.assertTrue(post.title.endswith(""))
def test_long_title_exotic_title(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_long_exotic_title, mock_stream) as builder:
with FeedBuilder(mock_with_long_exotic_title, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.title), 200)
self.assertEqual(len(post.title), 200)
self.assertTrue(post.title.endswith(""))
def test_content_detail_is_prioritized_if_longer(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_longer_content_detail, mock_stream) as builder:
with FeedBuilder(mock_with_longer_content_detail, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertFalse(
"Foreign Minister Mohammad Javad Zarif says the US" in post.body
@ -410,33 +362,49 @@ class FeedBuilderTestCase(TestCase):
self.assertTrue("Federal Communications Commission" in post.body)
def test_content_detail_is_not_prioritized_if_shorter(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_shorter_content_detail, mock_stream) as builder:
with FeedBuilder(mock_with_shorter_content_detail, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertTrue(
"Foreign Minister Mohammad Javad Zarif says the US" in post.body
)
def test_content_detail_is_concatinated(self):
builder = FeedBuilder
rule = FeedFactory()
mock_stream = Mock(rule=rule)
with builder(mock_with_multiple_content_detail, mock_stream) as builder:
with FeedBuilder(mock_with_multiple_content_detail, mock_stream) as builder:
builder.build()
builder.save()
post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1)
self.assertEqual(Post.objects.count(), 1)
self.assertEquals(post.body, "Yippie\n Ya\n Yee")
self.assertEqual(post.body, "Yippie\n Ya\n Yee")
@patch("newsreader.news.collection.feed.FeedBuilder.build_post")
def test_post_key_error(self, mocked_build_post):
rule = FeedFactory()
mock_stream = Mock(rule=rule)
identifier = str(uuid4())
build_post = FeedPostFactory.build(rule=rule, remote_identifier=identifier)
mocked_build_post.side_effect = (KeyError, build_post)
with FeedBuilder({"entries": [{}, {}]}, mock_stream) as builder:
builder.build()
builder.save()
posts = Post.objects.values_list("remote_identifier", flat=True)
self.assertCountEqual(posts, (identifier,))