- Update logging
- Update FeedDuplicateHandler
sonny 2020-06-18 20:29:48 +02:00
parent 00f6427c57
commit 2be35bce53
7 changed files with 280 additions and 171 deletions


@@ -3,10 +3,10 @@ python-linting:
   allow_failure: true
   image: python:3.7.4-slim-stretch
   before_script:
-    - pip install poetry
+    - pip install poetry --quiet
     - poetry config cache-dir ~/.cache/poetry
     - poetry config virtualenvs.in-project true
-    - poetry install --no-interaction
+    - poetry install --no-interaction --quiet
   script:
     - poetry run isort src/ --check-only --recursive
     - poetry run black src/ --line-length 88 --check


@@ -6,10 +6,10 @@ python-tests:
     - memcached:1.5.22
   image: python:3.7.4-slim-stretch
   before_script:
-    - pip install poetry
+    - pip install poetry --quiet
     - poetry config cache-dir .cache/poetry
     - poetry config virtualenvs.in-project true
-    - poetry install --no-interaction
+    - poetry install --no-interaction --quiet
   script:
     - poetry run coverage run src/manage.py test newsreader
     - poetry run coverage report


@@ -103,7 +103,7 @@ CACHES = {
 # https://docs.djangoproject.com/en/2.2/topics/logging/#configuring-logging
 LOGGING = {
     "version": 1,
-    "disable_existing_loggers": False,
+    "disable_existing_loggers": True,
     "filters": {
         "require_debug_false": {"()": "django.utils.log.RequireDebugFalse"},
         "require_debug_true": {"()": "django.utils.log.RequireDebugTrue"},
@@ -114,7 +114,11 @@ LOGGING = {
             "format": "[{server_time}] {message}",
             "style": "{",
         },
-        "syslog": {"class": "logging.Formatter", "format": "{message}", "style": "{"},
+        "syslog": {
+            "class": "logging.Formatter",
+            "format": "[newsreader] {message}",
+            "style": "{",
+        },
     },
     "handlers": {
         "console": {
@@ -124,6 +128,7 @@ LOGGING = {
         },
         "django.server": {
             "level": "INFO",
+            "filters": ["require_debug_true"],
             "class": "logging.StreamHandler",
             "formatter": "django.server",
         },
@@ -157,7 +162,6 @@ LOGGING = {
             "level": "INFO",
             "propagate": False,
         },
-        "celery": {"handlers": ["syslog", "console"], "level": "INFO"},
         "celery.task": {"handlers": ["syslog", "console"], "level": "INFO"},
     },
 }
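The reworked "syslog" formatter tags every message with a "[newsreader]" prefix, which keeps this app's entries greppable in a shared syslog stream. A minimal standalone sketch of the effect, using plain logging rather than the project's settings module:

    import logging

    # same formatter the new config declares: "{message}" templating, static prefix
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("[newsreader] {message}", style="{"))

    logger = logging.getLogger("celery.task")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    logger.info("feed collected")  # prints: [newsreader] feed collected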


@@ -52,9 +52,7 @@ class FeedBuilder(Builder):
         entries = data.get("entries", [])
         instances = self.build(entries, stream.rule)

-        posts = duplicate_handler.check(instances)
-        self.instances = [post for post in posts]
+        self.instances = duplicate_handler.check(instances)

     def build(self, entries, rule):
         field_mapping = {
@@ -196,22 +194,27 @@ class FeedDuplicateHandler:
     def check(self, instances):
         deduplicated_instances = self.deduplicate_instances(instances)
+        checked_instances = []

         for instance in deduplicated_instances:
             if instance.remote_identifier in self.existing_identifiers:
                 existing_post = self.handle_duplicate_identifier(instance)
-                yield existing_post
+                checked_instances.append(existing_post)
                 continue
             elif self.in_database(instance):
                 existing_post = self.get_duplicate_in_database(instance)

                 if self.in_time_slot(instance, existing_post):
-                    yield self.update_existing_post(instance, existing_post)
+                    checked_instances.append(
+                        self.update_existing_post(instance, existing_post)
+                    )
                     continue

-            yield instance
+            checked_instances.append(instance)
+
+        return checked_instances

     def in_database(self, post):
         values = {field: getattr(post, field, None) for field in self.duplicate_fields}
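A note on the check() change above: it now materialises results into a list instead of yielding them. A self-contained sketch (toy stand-ins, not the real FeedDuplicateHandler) of why that matters to callers:

    def check_as_generator(instances):
        # old behaviour: results stream out once and are gone
        for instance in instances:
            yield instance

    def check_as_list(instances):
        # new behaviour: results are collected and returned
        checked_instances = []
        for instance in instances:
            checked_instances.append(instance)
        return checked_instances

    gen = check_as_generator([1, 2, 3])
    assert list(gen) == [1, 2, 3]
    assert list(gen) == []  # a generator is exhausted after one pass

    posts = check_as_list([1, 2, 3])
    assert len(posts) == 3  # len() works and the list can be re-read
    assert posts == [1, 2, 3]

This is why the updated tests can call len(posts) directly instead of draining a generator with list(posts_gen) first.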
@@ -229,23 +232,29 @@ class FeedDuplicateHandler:
         return True

     def deduplicate_instances(self, instances):
+        sorted_instances = sorted(
+            instances, key=lambda instance: instance.publication_date, reverse=True
+        )
         deduplicated_instances = []

-        for instance in instances:
+        for instance in sorted_instances:
+            instance_identifier = instance.remote_identifier
+            duplicate = False
             values = {
                 field: getattr(instance, field, None) for field in self.duplicate_fields
             }
-            duplicate = False

             for deduplicated_instance in deduplicated_instances:
                 deduplicated_identifier = deduplicated_instance.remote_identifier
-                instance_identifier = instance.remote_identifier
                 has_identifiers = deduplicated_identifier and instance_identifier
+                is_same_identifier = (
+                    has_identifiers and deduplicated_identifier == instance_identifier
+                )
+                is_duplicate = self.is_duplicate(deduplicated_instance, values)

-                if self.is_duplicate(deduplicated_instance, values):
-                    duplicate = True
-                    break
-                elif has_identifiers and deduplicated_identifier == instance_identifier:
+                if is_duplicate or is_same_identifier:
                     duplicate = True
                     break
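The sort added to deduplicate_instances() means the newest copy of a duplicate is the one that survives, since the list is walked newest-first and the first occurrence wins. A standalone sketch of that ordering rule, simplified to identifier-only matching (the real handler also compares the configured duplicate_fields):

    from dataclasses import dataclass
    from datetime import datetime, timedelta

    @dataclass
    class Item:
        remote_identifier: str
        publication_date: datetime

    now = datetime(2019, 10, 30, 12, 30)
    items = [
        Item("28f79ae4", now - timedelta(days=7)),  # older duplicate
        Item("28f79ae4", now),                      # newest duplicate: survives
    ]

    # newest first, then first occurrence wins
    sorted_items = sorted(items, key=lambda i: i.publication_date, reverse=True)
    seen, deduplicated = set(), []
    for item in sorted_items:
        if item.remote_identifier in seen:
            continue
        seen.add(item.remote_identifier)
        deduplicated.append(item)

    assert len(deduplicated) == 1
    assert deduplicated[0].publication_date == now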


@@ -16,6 +16,7 @@ from newsreader.news.core.tests.factories import PostFactory
 from .mocks import *


+@freeze_time("2019-10-30 12:30:00")
 class FeedBuilderTestCase(TestCase):
     def setUp(self):
         self.maxDiff = None
@@ -30,8 +31,10 @@ class FeedBuilderTestCase(TestCase):
         post = Post.objects.get()

-        d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
-        aware_date = pytz.utc.localize(d)
+        publication_date = datetime.combine(
+            date(2019, 5, 20), time(hour=16, minute=7, second=37)
+        )
+        aware_date = pytz.utc.localize(publication_date)

         self.assertEquals(post.publication_date, aware_date)
         self.assertEquals(Post.objects.count(), 1)
@@ -57,49 +60,60 @@ class FeedBuilderTestCase(TestCase):
         with builder((multiple_mock, mock_stream)) as builder:
             builder.save()

-        posts = Post.objects.order_by("id")
+        posts = Post.objects.order_by("-publication_date")

         self.assertEquals(Post.objects.count(), 3)

-        first_post = posts[0]
-        second_post = posts[1]
+        post = posts[0]

-        d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
-        aware_date = pytz.utc.localize(d)
+        publication_date = datetime.combine(
+            date(2019, 5, 20), time(hour=16, minute=32, second=38)
+        )
+        aware_date = pytz.utc.localize(publication_date)

-        self.assertEquals(first_post.publication_date, aware_date)
         self.assertEquals(
-            first_post.remote_identifier,
+            post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
+            aware_date.strftime("%Y-%m-%d %H:%M:%S"),
+        )
+        self.assertEquals(
+            post.remote_identifier,
+            "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+        )
+        self.assertEquals(
+            post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
+        )
+        self.assertEquals(
+            post.title, "Birmingham head teacher threatened over LGBT lessons"
+        )
+
+        post = posts[1]
+
+        publication_date = datetime.combine(
+            date(2019, 5, 20), time(hour=16, minute=7, second=37)
+        )
+        aware_date = pytz.utc.localize(publication_date)
+
+        self.assertEquals(
+            post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
+            aware_date.strftime("%Y-%m-%d %H:%M:%S"),
+        )
+        self.assertEquals(
+            post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )
         self.assertEquals(
-            first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
+            post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
         )
         self.assertEquals(
-            first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
+            post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
         )

-        d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19))
-        aware_date = pytz.utc.localize(d)
-
-        self.assertEquals(second_post.publication_date, aware_date)
-        self.assertEquals(
-            second_post.remote_identifier,
-            "https://www.bbc.co.uk/news/technology-48334739",
-        )
-        self.assertEquals(
-            second_post.url, "https://www.bbc.co.uk/news/technology-48334739"
-        )
-        self.assertEquals(
-            second_post.title, "Huawei's Android loss: How it affects you"
-        )
-
-    def test_entry_without_remote_identifier(self):
+    def test_entries_without_remote_identifier(self):
         builder = FeedBuilder
         rule = CollectionRuleFactory()
         mock_stream = MagicMock(rule=rule)
@@ -107,27 +121,37 @@ class FeedBuilderTestCase(TestCase):
         with builder((mock_without_identifier, mock_stream)) as builder:
             builder.save()

-        posts = Post.objects.order_by("id")
+        posts = Post.objects.order_by("-publication_date")

         self.assertEquals(Post.objects.count(), 2)

-        first_post = posts[0]
+        post = posts[0]

-        d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
-        aware_date = pytz.utc.localize(d)
+        publication_date = datetime.combine(
+            date(2019, 5, 20), time(hour=16, minute=7, second=37)
+        )
+        aware_date = pytz.utc.localize(publication_date)

-        self.assertEquals(first_post.publication_date, aware_date)
-        self.assertEquals(first_post.remote_identifier, None)
+        self.assertEquals(post.publication_date, aware_date)
+        self.assertEquals(post.remote_identifier, None)
         self.assertEquals(
-            first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
+            post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
         )
         self.assertEquals(
-            first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
+            post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
         )
+
+        post = posts[1]
+
+        publication_date = datetime.combine(
+            date(2019, 5, 20), time(hour=12, minute=19, second=19)
+        )
+        aware_date = pytz.utc.localize(publication_date)
+
+        self.assertEquals(post.publication_date, aware_date)
+        self.assertEquals(post.remote_identifier, None)
+        self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739")
+        self.assertEquals(post.title, "Huawei's Android loss: How it affects you")

+    @freeze_time("2019-10-30 12:30:00")
     def test_entry_without_publication_date(self):
         builder = FeedBuilder
         rule = CollectionRuleFactory()
@@ -136,25 +160,30 @@ class FeedBuilderTestCase(TestCase):
         with builder((mock_without_publish_date, mock_stream)) as builder:
             builder.save()

-        posts = Post.objects.order_by("id")
+        posts = Post.objects.order_by("-publication_date")

         self.assertEquals(Post.objects.count(), 2)

-        first_post = posts[0]
-        second_post = posts[1]
+        post = posts[0]

-        self.assertEquals(first_post.created, timezone.now())
         self.assertEquals(
-            first_post.remote_identifier,
+            post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
+        )
+        self.assertEquals(post.created, timezone.now())
+        self.assertEquals(
+            post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )

-        self.assertEquals(second_post.created, timezone.now())
+        post = posts[1]
+
         self.assertEquals(
-            second_post.remote_identifier,
-            "https://www.bbc.co.uk/news/technology-48334739",
+            post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
+        )
+        self.assertEquals(post.created, timezone.now())
+        self.assertEquals(
+            post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
         )

+    @freeze_time("2019-10-30 12:30:00")
     def test_entry_without_url(self):
         builder = FeedBuilder
         rule = CollectionRuleFactory()
@@ -163,25 +192,24 @@ class FeedBuilderTestCase(TestCase):
         with builder((mock_without_url, mock_stream)) as builder:
             builder.save()

-        posts = Post.objects.order_by("id")
+        posts = Post.objects.order_by("-publication_date")

         self.assertEquals(Post.objects.count(), 2)

-        first_post = posts[0]
-        second_post = posts[1]
+        post = posts[0]

-        self.assertEquals(first_post.created, timezone.now())
+        self.assertEquals(post.created, timezone.now())
         self.assertEquals(
-            first_post.remote_identifier,
+            post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )

-        self.assertEquals(second_post.created, timezone.now())
+        post = posts[1]
+        self.assertEquals(post.created, timezone.now())
         self.assertEquals(
-            second_post.remote_identifier,
-            "https://www.bbc.co.uk/news/technology-48334739",
+            post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
         )

+    @freeze_time("2019-10-30 12:30:00")
     def test_entry_without_body(self):
         builder = FeedBuilder
         rule = CollectionRuleFactory()
@@ -190,25 +218,32 @@ class FeedBuilderTestCase(TestCase):
         with builder((mock_without_body, mock_stream)) as builder:
             builder.save()

-        posts = Post.objects.order_by("id")
+        posts = Post.objects.order_by("-publication_date")

         self.assertEquals(Post.objects.count(), 2)

-        first_post = posts[0]
-        second_post = posts[1]
+        post = posts[0]

-        self.assertEquals(first_post.created, timezone.now())
         self.assertEquals(
-            first_post.remote_identifier,
-            "https://www.bbc.co.uk/news/world-us-canada-48338168",
+            post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
         )
-
-        self.assertEquals(second_post.created, timezone.now())
         self.assertEquals(
-            second_post.remote_identifier,
+            post.remote_identifier,
             "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
         )
+        self.assertEquals(post.body, "")
+
+        post = posts[1]
+
+        self.assertEquals(
+            post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
+        )
+        self.assertEquals(
+            post.remote_identifier,
+            "https://www.bbc.co.uk/news/world-us-canada-48338168",
+        )
+        self.assertEquals(post.body, "")

+    @freeze_time("2019-10-30 12:30:00")
     def test_entry_without_author(self):
         builder = FeedBuilder
         rule = CollectionRuleFactory()
@@ -217,23 +252,25 @@ class FeedBuilderTestCase(TestCase):
         with builder((mock_without_author, mock_stream)) as builder:
             builder.save()

-        posts = Post.objects.order_by("id")
+        posts = Post.objects.order_by("-publication_date")

         self.assertEquals(Post.objects.count(), 2)

-        first_post = posts[0]
-        second_post = posts[1]
+        post = posts[0]

-        self.assertEquals(first_post.created, timezone.now())
+        self.assertEquals(post.created, timezone.now())
         self.assertEquals(
-            first_post.remote_identifier,
+            post.remote_identifier,
             "https://www.bbc.co.uk/news/world-us-canada-48338168",
         )
+        self.assertEquals(post.author, None)

-        self.assertEquals(second_post.created, timezone.now())
+        post = posts[1]
+        self.assertEquals(post.created, timezone.now())
         self.assertEquals(
-            second_post.remote_identifier,
-            "https://www.bbc.co.uk/news/technology-48334739",
+            post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
         )
+        self.assertEquals(post.author, None)

     def test_empty_entries(self):
         builder = FeedBuilder
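The @freeze_time(...) decorators added throughout these tests rely on freezegun pinning the clock, so assertions against the literal timestamp "2019-10-30 12:30:00" stay stable across runs. A small self-contained sketch of the pattern (freezegun's public API; the demo function is only illustrative, and Django's timezone.now() is pinned the same way):

    from datetime import datetime, timezone

    from freezegun import freeze_time

    @freeze_time("2019-10-30 12:30:00")
    def demo():
        # while frozen, "now" is pinned to the decorator's timestamp (UTC)
        now = datetime.now(timezone.utc)
        assert now == datetime(2019, 10, 30, 12, 30, tzinfo=timezone.utc)

    demo()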


@@ -11,110 +11,137 @@ from newsreader.news.core.models import Post
 from newsreader.news.core.tests.factories import PostFactory


+@freeze_time("2019-10-30 12:30:00")
 class FeedDuplicateHandlerTestCase(TestCase):
     def setUp(self):
         self.maxDiff = None

     def test_duplicate_entries_with_remote_identifiers(self):
         rule = CollectionRuleFactory()
         existing_post = PostFactory.create(
             remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule
         )

-        new_post = PostFactory.build(
+        new_posts = PostFactory.build_batch(
             remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
-            title="title got updated",
+            publication_date=timezone.now() - timedelta(days=7),
+            rule=rule,
+            size=5,
+        )
+        last_post = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            publication_date=timezone.now(),
             rule=rule,
         )

         with FeedDuplicateHandler(rule) as duplicate_handler:
-            posts_gen = duplicate_handler.check([new_post])
-            posts = list(posts_gen)
+            posts = duplicate_handler.check((*new_posts, last_post))

         self.assertEquals(len(posts), 1)

         post = posts[0]

-        existing_post.refresh_from_db()
-
-        self.assertEquals(existing_post.pk, post.pk)
-        self.assertEquals(post.publication_date, new_post.publication_date)
-        self.assertEquals(post.title, new_post.title)
-        self.assertEquals(post.body, new_post.body)
-        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(
+            post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+            last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+        )
+        self.assertEquals(post.title, last_post.title)
+        self.assertEquals(post.body, last_post.body)
+        self.assertEquals(post.rule, last_post.rule)
         self.assertEquals(post.read, False)

+    @freeze_time("2019-10-30 12:30:00")
     def test_duplicate_entries_with_different_remote_identifiers(self):
         rule = CollectionRuleFactory()
-        publication_date = timezone.now()

-        existing_post = PostFactory.create(
+        existing_post = PostFactory(
             remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
             url="https://bbc.com",
             title="New post",
             body="Body",
-            publication_date=publication_date,
+            publication_date=timezone.now() - timedelta(minutes=10),
             rule=rule,
         )

-        new_post = PostFactory.build(
+        new_posts = PostFactory.build_batch(
             remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
             url="https://bbc.com",
             title="New post",
             body="Body",
-            publication_date=publication_date,
+            publication_date=timezone.now() - timedelta(minutes=5),
+            rule=rule,
+            size=5,
+        )
+        last_post = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
+            url="https://bbc.com",
+            title="New post",
+            body="Body",
+            publication_date=timezone.now(),
             rule=rule,
         )

         with FeedDuplicateHandler(rule) as duplicate_handler:
-            posts_gen = duplicate_handler.check([new_post])
-            posts = list(posts_gen)
+            posts = duplicate_handler.check((*new_posts, last_post))

         self.assertEquals(len(posts), 1)

-        existing_post.refresh_from_db()
-
         post = posts[0]

-        self.assertEquals(existing_post.pk, post.pk)
-        self.assertEquals(post.title, new_post.title)
-        self.assertEquals(post.body, new_post.body)
-        self.assertEquals(post.rule, new_post.rule)
-        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(
+            post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+            last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+        )
+        self.assertEquals(post.title, last_post.title)
+        self.assertEquals(post.body, last_post.body)
+        self.assertEquals(post.rule, last_post.rule)
         self.assertEquals(post.read, False)

     def test_duplicate_entries_in_recent_database(self):
-        publication_date = timezone.now()
         rule = CollectionRuleFactory()
-        existing_post = PostFactory.create(
+
+        existing_post = PostFactory(
             url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
-            publication_date=publication_date,
-            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            publication_date=timezone.now() - timedelta(minutes=10),
+            remote_identifier=None,
             rule=rule,
         )

-        new_post = PostFactory.build(
+        new_posts = PostFactory.build_batch(
             url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
-            publication_date=publication_date,
+            publication_date=timezone.now() - timedelta(minutes=5),
+            remote_identifier=None,
+            rule=rule,
+            size=5,
+        )
+        last_post = PostFactory.build(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=timezone.now(),
             remote_identifier=None,
             rule=rule,
         )

         with FeedDuplicateHandler(rule) as duplicate_handler:
-            posts_gen = duplicate_handler.check([new_post])
-            posts = list(posts_gen)
+            posts = duplicate_handler.check((*new_posts, last_post))

         self.assertEquals(len(posts), 1)

-        existing_post.refresh_from_db()
-
         post = posts[0]

-        self.assertEquals(existing_post.pk, post.pk)
-        self.assertEquals(post.title, new_post.title)
-        self.assertEquals(post.body, new_post.body)
-        self.assertEquals(post.rule, new_post.rule)
-        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(
+            post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+            last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+        )
+        self.assertEquals(post.title, last_post.title)
+        self.assertEquals(post.body, last_post.body)
+        self.assertEquals(post.rule, last_post.rule)
         self.assertEquals(post.read, False)

     def test_multiple_existing_entries_with_identifier(self):
@@ -124,15 +151,20 @@ class FeedDuplicateHandlerTestCase(TestCase):
             remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, size=5
         )

-        new_post = PostFactory.build(
+        new_posts = PostFactory.build_batch(
             remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
-            title="This is a new one",
+            publication_date=timezone.now() - timedelta(hours=5),
+            rule=rule,
+            size=5,
+        )
+        last_post = PostFactory.build(
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            publication_date=timezone.now() - timedelta(minutes=5),
             rule=rule,
         )

         with FeedDuplicateHandler(rule) as duplicate_handler:
-            posts_gen = duplicate_handler.check([new_post])
-            posts = list(posts_gen)
+            posts = duplicate_handler.check((*new_posts, last_post))

         self.assertEquals(len(posts), 1)
@@ -145,77 +177,101 @@ class FeedDuplicateHandlerTestCase(TestCase):
         post = posts[0]

-        self.assertEquals(post.title, new_post.title)
-        self.assertEquals(post.body, new_post.body)
-        self.assertEquals(post.publication_date, new_post.publication_date)
-        self.assertEquals(post.rule, new_post.rule)
+        self.assertEquals(
+            post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+            last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+        )
+        self.assertEquals(post.title, last_post.title)
+        self.assertEquals(post.body, last_post.body)
+        self.assertEquals(post.rule, last_post.rule)
         self.assertEquals(post.read, False)

+    @freeze_time("2019-10-30 12:30:00")
     def test_duplicate_entries_outside_time_slot(self):
-        publication_date = timezone.now()
         rule = CollectionRuleFactory()
-        existing_post = PostFactory.create(
+
+        existing_post = PostFactory(
             url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
-            publication_date=publication_date,
-            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            publication_date=timezone.now(),
+            remote_identifier=None,
             rule=rule,
         )

-        new_post = PostFactory.build(
+        new_posts = PostFactory.build_batch(
             url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
             title="Birmingham head teacher threatened over LGBT lessons",
             body="Google's move to end business ties with Huawei will affect current devices",
-            publication_date=publication_date + timedelta(minutes=12),
+            publication_date=timezone.now() + timedelta(minutes=12),
+            remote_identifier=None,
+            rule=rule,
+            size=5,
+        )
+        last_post = PostFactory.build(
+            url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
+            title="Birmingham head teacher threatened over LGBT lessons",
+            body="Google's move to end business ties with Huawei will affect current devices",
+            publication_date=timezone.now() + timedelta(minutes=13),
             remote_identifier=None,
             rule=rule,
         )

         with FeedDuplicateHandler(rule) as duplicate_handler:
-            posts_gen = duplicate_handler.check([new_post])
-            posts = list(posts_gen)
+            posts = duplicate_handler.check((*new_posts, last_post))

         self.assertEquals(len(posts), 1)

-        existing_post.refresh_from_db()
-
         post = posts[0]

         self.assertEquals(post.pk, None)
-        self.assertEquals(post.title, new_post.title)
-        self.assertEquals(post.body, new_post.body)
-        self.assertEquals(post.rule, new_post.rule)
-        self.assertEquals(post.publication_date, new_post.publication_date)
+        self.assertEquals(
+            post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+            last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+        )
+        self.assertEquals(post.title, last_post.title)
+        self.assertEquals(post.body, last_post.body)
+        self.assertEquals(post.rule, last_post.rule)
         self.assertEquals(post.read, False)

     def test_duplicate_entries_in_collected_entries(self):
         rule = CollectionRuleFactory()

         post_1 = PostFactory.build(
-            title="title got updated", body="body", url="https://bbc.com", rule=rule
+            title="title got updated",
+            body="body",
+            url="https://bbc.com",
+            publication_date=timezone.now(),
+            rule=rule,
         )
         duplicate_post_1 = PostFactory.build(
-            title="title got updated", body="body", url="https://bbc.com", rule=rule
+            title="title got updated",
+            body="body",
+            url="https://bbc.com",
+            publication_date=timezone.now() - timedelta(minutes=5),
+            rule=rule,
         )
         post_2 = PostFactory.build(
-            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            publication_date=timezone.now(),
         )
         duplicate_post_2 = PostFactory.build(
-            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
+            remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
+            publication_date=timezone.now() - timedelta(minutes=5),
         )

         collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2)

         with FeedDuplicateHandler(rule) as duplicate_handler:
-            posts_gen = duplicate_handler.check(collected_posts)
-            posts = list(posts_gen)
+            posts = duplicate_handler.check(collected_posts)

         self.assertEquals(len(posts), 2)

         post = posts[0]

-        self.assertEquals(post_1.publication_date, post.publication_date)
+        self.assertEquals(
+            post_1.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+            post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+        )
         self.assertEquals(post_1.title, post.title)
         self.assertEquals(post_1.body, post.body)
         self.assertEquals(post_1.rule, post.rule)
@@ -223,7 +279,10 @@ class FeedDuplicateHandlerTestCase(TestCase):
         post = posts[1]

-        self.assertEquals(post_2.publication_date, post.publication_date)
+        self.assertEquals(
+            post_2.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+            post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
+        )
         self.assertEquals(post_2.title, post.title)
         self.assertEquals(post_2.body, post.body)
         self.assertEquals(post_2.rule, post.rule)


@@ -8,7 +8,7 @@ from newsreader.news.collection.models import CollectionRule

 class Post(TimeStampedModel):
     title = models.CharField(max_length=200, blank=True, null=True)
-    body = models.TextField(blank=True, null=True)
+    body = models.TextField(blank=True)
     author = models.CharField(max_length=40, blank=True, null=True)
     publication_date = models.DateTimeField(default=timezone.now)
     url = models.URLField(max_length=1024, blank=True, null=True)
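Dropping null=True from body changes the database schema, so a change like this would normally ship with a migration; none is shown in this commit's file list. A hypothetical sketch of roughly what makemigrations would generate (app label and parent migration are assumptions, and any existing NULL bodies would need backfilling to "" first):

    from django.db import migrations, models

    class Migration(migrations.Migration):

        # assumed app label and parent migration, for illustration only
        dependencies = [("core", "0001_initial")]

        operations = [
            migrations.AlterField(
                model_name="post",
                name="body",
                field=models.TextField(blank=True),
            ),
        ]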