Refactor FeedBuilder

This commit is contained in:
Sonny Bakker 2020-10-08 22:51:36 +02:00
parent 1a7279c533
commit 90553168df
2 changed files with 144 additions and 163 deletions

View file

@ -39,6 +39,23 @@ class FeedBuilder(PostBuilder):
rule__type = RuleTypeChoices.feed rule__type = RuleTypeChoices.feed
def build(self):
    """Build a post for every payload entry and keep the de-duplicated result.

    Each item under the payload's ``"entries"`` key is handed to
    ``build_post``. An entry for which ``build_post`` raises ``KeyError`` is
    logged with its traceback and skipped, so one bad entry does not discard
    the others. The collected posts are passed to the duplicate handler's
    ``check`` and whatever it returns is stored on ``self.instances``.
    """
    instances = []

    with FeedDuplicateHandler(self.stream.rule) as duplicate_handler:
        # A payload without an "entries" key is treated as an empty feed.
        entries = self.payload.get("entries", [])

        for entry in entries:
            try:
                post = self.build_post(entry)
            except KeyError:
                # Plain literal: the message has nothing to interpolate, so
                # the f-prefix was dropped (emitted text is unchanged).
                logger.exception("Failed building post")
                continue

            instances.append(post)

        # NOTE(review): indentation is lost in this view; check() is assumed
        # to run while the handler's context is still open - confirm.
        self.instances = duplicate_handler.check(instances)
def build_post(self, entry):
field_mapping = { field_mapping = {
"id": "remote_identifier", "id": "remote_identifier",
"title": "title", "title": "title",
@ -48,41 +65,37 @@ class FeedBuilder(PostBuilder):
"author": "author", "author": "author",
} }
tz = pytz.timezone(self.stream.rule.timezone) tz = pytz.timezone(self.stream.rule.timezone)
instances = [] data = {"rule_id": self.stream.rule.pk}
with FeedDuplicateHandler(self.stream.rule) as duplicate_handler: for field, model_field in field_mapping.items():
entries = self.payload.get("entries", []) if not field in entry:
continue
for entry in entries: value = truncate_text(Post, model_field, entry[field])
data = {"rule_id": self.stream.rule.pk}
for field, model_field in field_mapping.items(): if field == "published_parsed":
if not field in entry: data[model_field] = build_publication_date(value, tz)
continue elif field == "summary":
data[model_field] = self.sanitize_fragment(value)
else:
data[model_field] = value
value = truncate_text(Post, model_field, entry[field]) content_details = self.get_content_details(entry)
if field == "published_parsed": # use content details key if it contains more information
data[model_field] = build_publication_date(value, tz) if not "body" in data or len(data["body"]) < len(content_details):
elif field == "summary": data["body"] = content_details
data[model_field] = self.sanitize_fragment(value)
else:
data[model_field] = value
if "content" in entry: return Post(**data)
content = self.get_content(entry["content"])
body = data.get("body", "")
if not body or len(body) < len(content): def get_content_details(self, entry):
data["body"] = content content_items = entry.get("content")
instances.append(Post(**data)) if not content_items:
return ""
self.instances = duplicate_handler.check(instances) content_details = "\n ".join([item.get("value") for item in content_items])
return self.sanitize_fragment(content_details)
def get_content(self, items):
    """Join the ``"value"`` of every content item and sanitize the result.

    ``items`` is an iterable of mappings (the caller passes
    ``entry["content"]``). The values are joined with ``"\\n "`` and the
    joined text is returned after passing through ``self.sanitize_fragment``.

    Items that have no ``"value"`` are skipped: ``str.join`` raises
    ``TypeError`` on ``None``, which would otherwise abort the build for the
    whole payload.
    """
    values = (item.get("value") for item in items)
    content = "\n ".join(value for value in values if value is not None)
    return self.sanitize_fragment(content)
class FeedStream(PostStream): class FeedStream(PostStream):

View file

@ -1,5 +1,6 @@
from datetime import date, datetime, time from datetime import datetime
from unittest.mock import Mock from unittest.mock import Mock, patch
from uuid import uuid4
from django.test import TestCase from django.test import TestCase
from django.utils import timezone from django.utils import timezone
@ -21,277 +22,233 @@ class FeedBuilderTestCase(TestCase):
def setUp(self): def setUp(self):
self.maxDiff = None self.maxDiff = None
def test_basic_entry(self):
    """Building and saving ``simple_mock`` stores exactly one post.

    Checks the stored post's publication date (compared as a UTC-aware
    datetime), remote identifier, url and title.
    """
    rule = FeedFactory()
    mock_stream = Mock(rule=rule)

    # Use the class directly instead of rebinding a local from class to
    # instance inside the ``with`` statement.
    with FeedBuilder(simple_mock, mock_stream) as builder:
        builder.build()
        builder.save()

    post = Post.objects.get()

    publication_date = datetime.combine(
        date(2019, 5, 20), time(hour=16, minute=7, second=37)
    )
    aware_date = pytz.utc.localize(publication_date)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(post.publication_date, aware_date)
    self.assertEqual(Post.objects.count(), 1)

    self.assertEqual(
        post.remote_identifier,
        "https://www.bbc.co.uk/news/world-us-canada-48338168",
    )

    self.assertEqual(
        post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
    )

    self.assertEqual(
        post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
    )
def test_multiple_entries(self): def test_multiple_entries(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(multiple_mock, mock_stream) as builder: with FeedBuilder(multiple_mock, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
posts = Post.objects.order_by("-publication_date") posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 3) self.assertEqual(Post.objects.count(), 3)
post = posts[0] post = posts[0]
publication_date = datetime.combine( publication_date = datetime(
date(2019, 5, 20), time(hour=16, minute=32, second=38) 2019, 5, 20, hour=16, minute=32, second=38, tzinfo=pytz.utc
) )
aware_date = pytz.utc.localize(publication_date)
self.assertEquals( self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
aware_date.strftime("%Y-%m-%d %H:%M:%S"), publication_date.strftime("%Y-%m-%d %H:%M:%S"),
) )
self.assertEquals( self.assertEqual(
post.remote_identifier, post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080", "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
) )
self.assertEquals( self.assertEqual(
post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080" post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
) )
self.assertEquals( self.assertEqual(
post.title, "Birmingham head teacher threatened over LGBT lessons" post.title, "Birmingham head teacher threatened over LGBT lessons"
) )
post = posts[1] post = posts[1]
publication_date = datetime.combine( publication_date = datetime(
date(2019, 5, 20), time(hour=16, minute=7, second=37) 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
) )
aware_date = pytz.utc.localize(publication_date)
self.assertEquals( self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"), post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
aware_date.strftime("%Y-%m-%d %H:%M:%S"), publication_date.strftime("%Y-%m-%d %H:%M:%S"),
) )
self.assertEquals( self.assertEqual(
post.remote_identifier, post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168", "https://www.bbc.co.uk/news/world-us-canada-48338168",
) )
self.assertEquals( self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
) )
self.assertEquals( self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
) )
def test_entries_without_remote_identifier(self): def test_entries_without_remote_identifier(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_without_identifier, mock_stream) as builder: with FeedBuilder(mock_without_identifier, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
posts = Post.objects.order_by("-publication_date") posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2) self.assertEqual(Post.objects.count(), 2)
post = posts[0] post = posts[0]
publication_date = datetime.combine( publication_date = datetime(
date(2019, 5, 20), time(hour=16, minute=7, second=37) 2019, 5, 20, hour=16, minute=7, second=37, tzinfo=pytz.utc
) )
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date) self.assertEqual(post.publication_date, publication_date)
self.assertEquals(post.remote_identifier, None) self.assertEqual(post.remote_identifier, None)
self.assertEquals( self.assertEqual(
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168" post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
) )
self.assertEquals( self.assertEqual(
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
) )
post = posts[1] post = posts[1]
publication_date = datetime.combine( publication_date = datetime(
date(2019, 5, 20), time(hour=12, minute=19, second=19) 2019, 5, 20, hour=12, minute=19, second=19, tzinfo=pytz.utc
) )
aware_date = pytz.utc.localize(publication_date)
self.assertEquals(post.publication_date, aware_date) self.assertEqual(post.publication_date, publication_date)
self.assertEquals(post.remote_identifier, None) self.assertEqual(post.remote_identifier, None)
self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739") self.assertEqual(post.url, "https://www.bbc.co.uk/news/technology-48334739")
self.assertEquals(post.title, "Huawei's Android loss: How it affects you") self.assertEqual(post.title, "Huawei's Android loss: How it affects you")
def test_entry_without_publication_date(self): def test_entry_without_publication_date(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_without_publish_date, mock_stream) as builder: with FeedBuilder(mock_without_publish_date, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
posts = Post.objects.order_by("-publication_date") posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2) self.assertEqual(Post.objects.count(), 2)
post = posts[0] post = posts[0]
self.assertEquals( self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
) )
self.assertEquals(post.created, timezone.now()) self.assertEqual(post.created, timezone.now())
self.assertEquals( self.assertEqual(
post.remote_identifier, post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168", "https://www.bbc.co.uk/news/world-us-canada-48338168",
) )
post = posts[1] post = posts[1]
self.assertEquals( self.assertEqual(
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30" post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
) )
self.assertEquals(post.created, timezone.now()) self.assertEqual(post.created, timezone.now())
self.assertEquals( self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
) )
def test_entry_without_url(self): def test_entry_without_url(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_without_url, mock_stream) as builder: with FeedBuilder(mock_without_url, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
posts = Post.objects.order_by("-publication_date") posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2) self.assertEqual(Post.objects.count(), 2)
post = posts[0] post = posts[0]
self.assertEquals(post.created, timezone.now()) self.assertEqual(post.created, timezone.now())
self.assertEquals( self.assertEqual(
post.remote_identifier, post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168", "https://www.bbc.co.uk/news/world-us-canada-48338168",
) )
post = posts[1] post = posts[1]
self.assertEquals(post.created, timezone.now()) self.assertEqual(post.created, timezone.now())
self.assertEquals( self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
) )
def test_entry_without_body(self): def test_entry_without_body(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_without_body, mock_stream) as builder: with FeedBuilder(mock_without_body, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
posts = Post.objects.order_by("-publication_date") posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2) self.assertEqual(Post.objects.count(), 2)
post = posts[0] post = posts[0]
self.assertEquals( self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
) )
self.assertEquals( self.assertEqual(
post.remote_identifier, post.remote_identifier,
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080", "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
) )
self.assertEquals(post.body, "") self.assertEqual(post.body, "")
post = posts[1] post = posts[1]
self.assertEquals( self.assertEqual(
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00" post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
) )
self.assertEquals( self.assertEqual(
post.remote_identifier, post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168", "https://www.bbc.co.uk/news/world-us-canada-48338168",
) )
self.assertEquals(post.body, "") self.assertEqual(post.body, "")
def test_entry_without_author(self): def test_entry_without_author(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_without_author, mock_stream) as builder: with FeedBuilder(mock_without_author, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
posts = Post.objects.order_by("-publication_date") posts = Post.objects.order_by("-publication_date")
self.assertEquals(Post.objects.count(), 2) self.assertEqual(Post.objects.count(), 2)
post = posts[0] post = posts[0]
self.assertEquals(post.created, timezone.now()) self.assertEqual(post.created, timezone.now())
self.assertEquals( self.assertEqual(
post.remote_identifier, post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168", "https://www.bbc.co.uk/news/world-us-canada-48338168",
) )
self.assertEquals(post.author, None) self.assertEqual(post.author, None)
post = posts[1] post = posts[1]
self.assertEquals(post.created, timezone.now()) self.assertEqual(post.created, timezone.now())
self.assertEquals( self.assertEqual(
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
) )
self.assertEquals(post.author, None) self.assertEqual(post.author, None)
def test_empty_entries(self): def test_empty_entries(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_without_entries, mock_stream) as builder: with FeedBuilder(mock_without_entries, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
self.assertEquals(Post.objects.count(), 0) self.assertEqual(Post.objects.count(), 0)
def test_update_entries(self): def test_update_entries(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
@ -303,36 +260,35 @@ class FeedBuilderTestCase(TestCase):
remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule
) )
with builder(mock_with_update_entries, mock_stream) as builder: with FeedBuilder(mock_with_update_entries, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
self.assertEquals(Post.objects.count(), 3) self.assertEqual(Post.objects.count(), 3)
existing_first_post.refresh_from_db() existing_first_post.refresh_from_db()
existing_second_post.refresh_from_db() existing_second_post.refresh_from_db()
self.assertEquals( self.assertEqual(
existing_first_post.title, existing_first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif", "Trump's 'genocidal taunts' will not end Iran - Zarif",
) )
self.assertEquals( self.assertEqual(
existing_second_post.title, "Huawei's Android loss: How it affects you" existing_second_post.title, "Huawei's Android loss: How it affects you"
) )
def test_html_sanitizing(self): def test_html_sanitizing(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_with_html, mock_stream) as builder: with FeedBuilder(mock_with_html, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1) self.assertEqual(Post.objects.count(), 1)
self.assertTrue("<article>" in post.body) self.assertTrue("<article>" in post.body)
self.assertTrue("<h1>" in post.body) self.assertTrue("<h1>" in post.body)
@ -345,64 +301,60 @@ class FeedBuilderTestCase(TestCase):
self.assertTrue("<iframe>" not in post.body) self.assertTrue("<iframe>" not in post.body)
def test_long_author_text_is_truncated(self): def test_long_author_text_is_truncated(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_with_long_author, mock_stream) as builder: with FeedBuilder(mock_with_long_author, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1) self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.author), 40) self.assertEqual(len(post.author), 40)
def test_long_title_text_is_truncated(self): def test_long_title_text_is_truncated(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_with_long_title, mock_stream) as builder: with FeedBuilder(mock_with_long_title, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1) self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.title), 200) self.assertEqual(len(post.title), 200)
self.assertTrue(post.title.endswith("")) self.assertTrue(post.title.endswith(""))
def test_long_title_exotic_title(self): def test_long_title_exotic_title(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_with_long_exotic_title, mock_stream) as builder: with FeedBuilder(mock_with_long_exotic_title, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1) self.assertEqual(Post.objects.count(), 1)
self.assertEquals(len(post.title), 200) self.assertEqual(len(post.title), 200)
self.assertTrue(post.title.endswith("")) self.assertTrue(post.title.endswith(""))
def test_content_detail_is_prioritized_if_longer(self): def test_content_detail_is_prioritized_if_longer(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_with_longer_content_detail, mock_stream) as builder: with FeedBuilder(mock_with_longer_content_detail, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1) self.assertEqual(Post.objects.count(), 1)
self.assertFalse( self.assertFalse(
"Foreign Minister Mohammad Javad Zarif says the US" in post.body "Foreign Minister Mohammad Javad Zarif says the US" in post.body
@ -410,33 +362,49 @@ class FeedBuilderTestCase(TestCase):
self.assertTrue("Federal Communications Commission" in post.body) self.assertTrue("Federal Communications Commission" in post.body)
def test_content_detail_is_not_prioritized_if_shorter(self): def test_content_detail_is_not_prioritized_if_shorter(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_with_shorter_content_detail, mock_stream) as builder: with FeedBuilder(mock_with_shorter_content_detail, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1) self.assertEqual(Post.objects.count(), 1)
self.assertTrue( self.assertTrue(
"Foreign Minister Mohammad Javad Zarif says the US" in post.body "Foreign Minister Mohammad Javad Zarif says the US" in post.body
) )
def test_content_detail_is_concatinated(self): def test_content_detail_is_concatinated(self):
builder = FeedBuilder
rule = FeedFactory() rule = FeedFactory()
mock_stream = Mock(rule=rule) mock_stream = Mock(rule=rule)
with builder(mock_with_multiple_content_detail, mock_stream) as builder: with FeedBuilder(mock_with_multiple_content_detail, mock_stream) as builder:
builder.build() builder.build()
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
self.assertEquals(Post.objects.count(), 1) self.assertEqual(Post.objects.count(), 1)
self.assertEquals(post.body, "Yippie\n Ya\n Yee") self.assertEqual(post.body, "Yippie\n Ya\n Yee")
@patch("newsreader.news.collection.feed.FeedBuilder.build_post")
def test_post_key_error(self, mocked_build_post):
    """An entry whose ``build_post`` call raises ``KeyError`` is left out.

    ``build_post`` is patched to raise ``KeyError`` on its first call and to
    return an unsaved post on its second. With a payload of two entries, only
    the identifier of that second post is expected in the database.
    """
    feed_rule = FeedFactory()
    stream = Mock(rule=feed_rule)

    remote_id = str(uuid4())
    unsaved_post = FeedPostFactory.build(
        rule=feed_rule, remote_identifier=remote_id
    )

    # First call fails, second call succeeds.
    mocked_build_post.side_effect = (KeyError, unsaved_post)

    payload = {"entries": [{}, {}]}

    with FeedBuilder(payload, stream) as builder:
        builder.build()
        builder.save()

    stored_identifiers = Post.objects.values_list("remote_identifier", flat=True)

    self.assertCountEqual(stored_identifiers, (remote_id,))