0.2.3.6
- Update logging - Update FeedDuplicateHandler
This commit is contained in:
parent
00f6427c57
commit
2be35bce53
7 changed files with 280 additions and 171 deletions
|
|
@@ -3,10 +3,10 @@ python-linting:
|
||||||
allow_failure: true
|
allow_failure: true
|
||||||
image: python:3.7.4-slim-stretch
|
image: python:3.7.4-slim-stretch
|
||||||
before_script:
|
before_script:
|
||||||
- pip install poetry
|
- pip install poetry --quiet
|
||||||
- poetry config cache-dir ~/.cache/poetry
|
- poetry config cache-dir ~/.cache/poetry
|
||||||
- poetry config virtualenvs.in-project true
|
- poetry config virtualenvs.in-project true
|
||||||
- poetry install --no-interaction
|
- poetry install --no-interaction --quiet
|
||||||
script:
|
script:
|
||||||
- poetry run isort src/ --check-only --recursive
|
- poetry run isort src/ --check-only --recursive
|
||||||
- poetry run black src/ --line-length 88 --check
|
- poetry run black src/ --line-length 88 --check
|
||||||
|
|
|
||||||
|
|
@@ -6,10 +6,10 @@ python-tests:
|
||||||
- memcached:1.5.22
|
- memcached:1.5.22
|
||||||
image: python:3.7.4-slim-stretch
|
image: python:3.7.4-slim-stretch
|
||||||
before_script:
|
before_script:
|
||||||
- pip install poetry
|
- pip install poetry --quiet
|
||||||
- poetry config cache-dir .cache/poetry
|
- poetry config cache-dir .cache/poetry
|
||||||
- poetry config virtualenvs.in-project true
|
- poetry config virtualenvs.in-project true
|
||||||
- poetry install --no-interaction
|
- poetry install --no-interaction --quiet
|
||||||
script:
|
script:
|
||||||
- poetry run coverage run src/manage.py test newsreader
|
- poetry run coverage run src/manage.py test newsreader
|
||||||
- poetry run coverage report
|
- poetry run coverage report
|
||||||
|
|
|
||||||
|
|
@@ -103,7 +103,7 @@ CACHES = {
|
||||||
# https://docs.djangoproject.com/en/2.2/topics/logging/#configuring-logging
|
# https://docs.djangoproject.com/en/2.2/topics/logging/#configuring-logging
|
||||||
LOGGING = {
|
LOGGING = {
|
||||||
"version": 1,
|
"version": 1,
|
||||||
"disable_existing_loggers": False,
|
"disable_existing_loggers": True,
|
||||||
"filters": {
|
"filters": {
|
||||||
"require_debug_false": {"()": "django.utils.log.RequireDebugFalse"},
|
"require_debug_false": {"()": "django.utils.log.RequireDebugFalse"},
|
||||||
"require_debug_true": {"()": "django.utils.log.RequireDebugTrue"},
|
"require_debug_true": {"()": "django.utils.log.RequireDebugTrue"},
|
||||||
|
|
@@ -114,7 +114,11 @@ LOGGING = {
|
||||||
"format": "[{server_time}] {message}",
|
"format": "[{server_time}] {message}",
|
||||||
"style": "{",
|
"style": "{",
|
||||||
},
|
},
|
||||||
"syslog": {"class": "logging.Formatter", "format": "{message}", "style": "{"},
|
"syslog": {
|
||||||
|
"class": "logging.Formatter",
|
||||||
|
"format": "[newsreader] {message}",
|
||||||
|
"style": "{",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"handlers": {
|
"handlers": {
|
||||||
"console": {
|
"console": {
|
||||||
|
|
@@ -124,6 +128,7 @@ LOGGING = {
|
||||||
},
|
},
|
||||||
"django.server": {
|
"django.server": {
|
||||||
"level": "INFO",
|
"level": "INFO",
|
||||||
|
"filters": ["require_debug_true"],
|
||||||
"class": "logging.StreamHandler",
|
"class": "logging.StreamHandler",
|
||||||
"formatter": "django.server",
|
"formatter": "django.server",
|
||||||
},
|
},
|
||||||
|
|
@@ -157,7 +162,6 @@ LOGGING = {
|
||||||
"level": "INFO",
|
"level": "INFO",
|
||||||
"propagate": False,
|
"propagate": False,
|
||||||
},
|
},
|
||||||
"celery": {"handlers": ["syslog", "console"], "level": "INFO"},
|
|
||||||
"celery.task": {"handlers": ["syslog", "console"], "level": "INFO"},
|
"celery.task": {"handlers": ["syslog", "console"], "level": "INFO"},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -52,9 +52,7 @@ class FeedBuilder(Builder):
|
||||||
entries = data.get("entries", [])
|
entries = data.get("entries", [])
|
||||||
|
|
||||||
instances = self.build(entries, stream.rule)
|
instances = self.build(entries, stream.rule)
|
||||||
posts = duplicate_handler.check(instances)
|
self.instances = duplicate_handler.check(instances)
|
||||||
|
|
||||||
self.instances = [post for post in posts]
|
|
||||||
|
|
||||||
def build(self, entries, rule):
|
def build(self, entries, rule):
|
||||||
field_mapping = {
|
field_mapping = {
|
||||||
|
|
@@ -196,22 +194,27 @@ class FeedDuplicateHandler:
|
||||||
|
|
||||||
def check(self, instances):
|
def check(self, instances):
|
||||||
deduplicated_instances = self.deduplicate_instances(instances)
|
deduplicated_instances = self.deduplicate_instances(instances)
|
||||||
|
checked_instances = []
|
||||||
|
|
||||||
for instance in deduplicated_instances:
|
for instance in deduplicated_instances:
|
||||||
if instance.remote_identifier in self.existing_identifiers:
|
if instance.remote_identifier in self.existing_identifiers:
|
||||||
existing_post = self.handle_duplicate_identifier(instance)
|
existing_post = self.handle_duplicate_identifier(instance)
|
||||||
|
|
||||||
yield existing_post
|
checked_instances.append(existing_post)
|
||||||
|
|
||||||
continue
|
continue
|
||||||
elif self.in_database(instance):
|
elif self.in_database(instance):
|
||||||
existing_post = self.get_duplicate_in_database(instance)
|
existing_post = self.get_duplicate_in_database(instance)
|
||||||
|
|
||||||
if self.in_time_slot(instance, existing_post):
|
if self.in_time_slot(instance, existing_post):
|
||||||
yield self.update_existing_post(instance, existing_post)
|
checked_instances.append(
|
||||||
|
self.update_existing_post(instance, existing_post)
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield instance
|
checked_instances.append(instance)
|
||||||
|
|
||||||
|
return checked_instances
|
||||||
|
|
||||||
def in_database(self, post):
|
def in_database(self, post):
|
||||||
values = {field: getattr(post, field, None) for field in self.duplicate_fields}
|
values = {field: getattr(post, field, None) for field in self.duplicate_fields}
|
||||||
|
|
@@ -229,23 +232,29 @@ class FeedDuplicateHandler:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def deduplicate_instances(self, instances):
|
def deduplicate_instances(self, instances):
|
||||||
|
sorted_instances = sorted(
|
||||||
|
instances, key=lambda instance: instance.publication_date, reverse=True
|
||||||
|
)
|
||||||
deduplicated_instances = []
|
deduplicated_instances = []
|
||||||
|
|
||||||
for instance in instances:
|
for instance in sorted_instances:
|
||||||
|
instance_identifier = instance.remote_identifier
|
||||||
|
duplicate = False
|
||||||
|
|
||||||
values = {
|
values = {
|
||||||
field: getattr(instance, field, None) for field in self.duplicate_fields
|
field: getattr(instance, field, None) for field in self.duplicate_fields
|
||||||
}
|
}
|
||||||
duplicate = False
|
|
||||||
|
|
||||||
for deduplicated_instance in deduplicated_instances:
|
for deduplicated_instance in deduplicated_instances:
|
||||||
deduplicated_identifier = deduplicated_instance.remote_identifier
|
deduplicated_identifier = deduplicated_instance.remote_identifier
|
||||||
instance_identifier = instance.remote_identifier
|
|
||||||
has_identifiers = deduplicated_identifier and instance_identifier
|
has_identifiers = deduplicated_identifier and instance_identifier
|
||||||
|
|
||||||
if self.is_duplicate(deduplicated_instance, values):
|
is_same_identifier = (
|
||||||
duplicate = True
|
has_identifiers and deduplicated_identifier == instance_identifier
|
||||||
break
|
)
|
||||||
elif has_identifiers and deduplicated_identifier == instance_identifier:
|
is_duplicate = self.is_duplicate(deduplicated_instance, values)
|
||||||
|
|
||||||
|
if is_duplicate or is_same_identifier:
|
||||||
duplicate = True
|
duplicate = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -16,6 +16,7 @@ from newsreader.news.core.tests.factories import PostFactory
|
||||||
from .mocks import *
|
from .mocks import *
|
||||||
|
|
||||||
|
|
||||||
|
@freeze_time("2019-10-30 12:30:00")
|
||||||
class FeedBuilderTestCase(TestCase):
|
class FeedBuilderTestCase(TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.maxDiff = None
|
self.maxDiff = None
|
||||||
|
|
@@ -30,8 +31,10 @@ class FeedBuilderTestCase(TestCase):
|
||||||
|
|
||||||
post = Post.objects.get()
|
post = Post.objects.get()
|
||||||
|
|
||||||
d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
|
publication_date = datetime.combine(
|
||||||
aware_date = pytz.utc.localize(d)
|
date(2019, 5, 20), time(hour=16, minute=7, second=37)
|
||||||
|
)
|
||||||
|
aware_date = pytz.utc.localize(publication_date)
|
||||||
|
|
||||||
self.assertEquals(post.publication_date, aware_date)
|
self.assertEquals(post.publication_date, aware_date)
|
||||||
self.assertEquals(Post.objects.count(), 1)
|
self.assertEquals(Post.objects.count(), 1)
|
||||||
|
|
@@ -57,49 +60,60 @@ class FeedBuilderTestCase(TestCase):
|
||||||
with builder((multiple_mock, mock_stream)) as builder:
|
with builder((multiple_mock, mock_stream)) as builder:
|
||||||
builder.save()
|
builder.save()
|
||||||
|
|
||||||
posts = Post.objects.order_by("id")
|
posts = Post.objects.order_by("-publication_date")
|
||||||
self.assertEquals(Post.objects.count(), 3)
|
self.assertEquals(Post.objects.count(), 3)
|
||||||
|
|
||||||
first_post = posts[0]
|
post = posts[0]
|
||||||
second_post = posts[1]
|
|
||||||
|
|
||||||
d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
|
publication_date = datetime.combine(
|
||||||
aware_date = pytz.utc.localize(d)
|
date(2019, 5, 20), time(hour=16, minute=32, second=38)
|
||||||
|
)
|
||||||
self.assertEquals(first_post.publication_date, aware_date)
|
aware_date = pytz.utc.localize(publication_date)
|
||||||
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.remote_identifier,
|
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
post.remote_identifier,
|
||||||
|
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
post.title, "Birmingham head teacher threatened over LGBT lessons"
|
||||||
|
)
|
||||||
|
|
||||||
|
post = posts[1]
|
||||||
|
|
||||||
|
publication_date = datetime.combine(
|
||||||
|
date(2019, 5, 20), time(hour=16, minute=7, second=37)
|
||||||
|
)
|
||||||
|
aware_date = pytz.utc.localize(publication_date)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
post.remote_identifier,
|
||||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
||||||
)
|
)
|
||||||
|
|
||||||
d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19))
|
def test_entries_without_remote_identifier(self):
|
||||||
aware_date = pytz.utc.localize(d)
|
|
||||||
|
|
||||||
self.assertEquals(second_post.publication_date, aware_date)
|
|
||||||
|
|
||||||
self.assertEquals(
|
|
||||||
second_post.remote_identifier,
|
|
||||||
"https://www.bbc.co.uk/news/technology-48334739",
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEquals(
|
|
||||||
second_post.url, "https://www.bbc.co.uk/news/technology-48334739"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEquals(
|
|
||||||
second_post.title, "Huawei's Android loss: How it affects you"
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_entry_without_remote_identifier(self):
|
|
||||||
builder = FeedBuilder
|
builder = FeedBuilder
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
mock_stream = MagicMock(rule=rule)
|
mock_stream = MagicMock(rule=rule)
|
||||||
|
|
@@ -107,27 +121,37 @@ class FeedBuilderTestCase(TestCase):
|
||||||
with builder((mock_without_identifier, mock_stream)) as builder:
|
with builder((mock_without_identifier, mock_stream)) as builder:
|
||||||
builder.save()
|
builder.save()
|
||||||
|
|
||||||
posts = Post.objects.order_by("id")
|
posts = Post.objects.order_by("-publication_date")
|
||||||
self.assertEquals(Post.objects.count(), 2)
|
self.assertEquals(Post.objects.count(), 2)
|
||||||
|
|
||||||
first_post = posts[0]
|
post = posts[0]
|
||||||
|
|
||||||
d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
|
publication_date = datetime.combine(
|
||||||
aware_date = pytz.utc.localize(d)
|
date(2019, 5, 20), time(hour=16, minute=7, second=37)
|
||||||
|
)
|
||||||
self.assertEquals(first_post.publication_date, aware_date)
|
aware_date = pytz.utc.localize(publication_date)
|
||||||
|
|
||||||
self.assertEquals(first_post.remote_identifier, None)
|
|
||||||
|
|
||||||
|
self.assertEquals(post.publication_date, aware_date)
|
||||||
|
self.assertEquals(post.remote_identifier, None)
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
||||||
|
)
|
||||||
|
self.assertEquals(
|
||||||
|
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEquals(
|
post = posts[1]
|
||||||
first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
|
||||||
)
|
publication_date = datetime.combine(
|
||||||
|
date(2019, 5, 20), time(hour=12, minute=19, second=19)
|
||||||
|
)
|
||||||
|
aware_date = pytz.utc.localize(publication_date)
|
||||||
|
|
||||||
|
self.assertEquals(post.publication_date, aware_date)
|
||||||
|
self.assertEquals(post.remote_identifier, None)
|
||||||
|
self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739")
|
||||||
|
self.assertEquals(post.title, "Huawei's Android loss: How it affects you")
|
||||||
|
|
||||||
@freeze_time("2019-10-30 12:30:00")
|
|
||||||
def test_entry_without_publication_date(self):
|
def test_entry_without_publication_date(self):
|
||||||
builder = FeedBuilder
|
builder = FeedBuilder
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
|
|
@@ -136,25 +160,30 @@ class FeedBuilderTestCase(TestCase):
|
||||||
with builder((mock_without_publish_date, mock_stream)) as builder:
|
with builder((mock_without_publish_date, mock_stream)) as builder:
|
||||||
builder.save()
|
builder.save()
|
||||||
|
|
||||||
posts = Post.objects.order_by("id")
|
posts = Post.objects.order_by("-publication_date")
|
||||||
self.assertEquals(Post.objects.count(), 2)
|
self.assertEquals(Post.objects.count(), 2)
|
||||||
|
|
||||||
first_post = posts[0]
|
post = posts[0]
|
||||||
second_post = posts[1]
|
|
||||||
|
|
||||||
self.assertEquals(first_post.created, timezone.now())
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.remote_identifier,
|
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
|
||||||
|
)
|
||||||
|
self.assertEquals(post.created, timezone.now())
|
||||||
|
self.assertEquals(
|
||||||
|
post.remote_identifier,
|
||||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEquals(second_post.created, timezone.now())
|
post = posts[1]
|
||||||
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
second_post.remote_identifier,
|
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
|
||||||
"https://www.bbc.co.uk/news/technology-48334739",
|
)
|
||||||
|
self.assertEquals(post.created, timezone.now())
|
||||||
|
self.assertEquals(
|
||||||
|
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
|
||||||
)
|
)
|
||||||
|
|
||||||
@freeze_time("2019-10-30 12:30:00")
|
|
||||||
def test_entry_without_url(self):
|
def test_entry_without_url(self):
|
||||||
builder = FeedBuilder
|
builder = FeedBuilder
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
|
|
@@ -163,25 +192,24 @@ class FeedBuilderTestCase(TestCase):
|
||||||
with builder((mock_without_url, mock_stream)) as builder:
|
with builder((mock_without_url, mock_stream)) as builder:
|
||||||
builder.save()
|
builder.save()
|
||||||
|
|
||||||
posts = Post.objects.order_by("id")
|
posts = Post.objects.order_by("-publication_date")
|
||||||
self.assertEquals(Post.objects.count(), 2)
|
self.assertEquals(Post.objects.count(), 2)
|
||||||
|
|
||||||
first_post = posts[0]
|
post = posts[0]
|
||||||
second_post = posts[1]
|
|
||||||
|
|
||||||
self.assertEquals(first_post.created, timezone.now())
|
self.assertEquals(post.created, timezone.now())
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.remote_identifier,
|
post.remote_identifier,
|
||||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEquals(second_post.created, timezone.now())
|
post = posts[1]
|
||||||
|
|
||||||
|
self.assertEquals(post.created, timezone.now())
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
second_post.remote_identifier,
|
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
|
||||||
"https://www.bbc.co.uk/news/technology-48334739",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
@freeze_time("2019-10-30 12:30:00")
|
|
||||||
def test_entry_without_body(self):
|
def test_entry_without_body(self):
|
||||||
builder = FeedBuilder
|
builder = FeedBuilder
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
|
|
@@ -190,25 +218,32 @@ class FeedBuilderTestCase(TestCase):
|
||||||
with builder((mock_without_body, mock_stream)) as builder:
|
with builder((mock_without_body, mock_stream)) as builder:
|
||||||
builder.save()
|
builder.save()
|
||||||
|
|
||||||
posts = Post.objects.order_by("id")
|
posts = Post.objects.order_by("-publication_date")
|
||||||
|
|
||||||
self.assertEquals(Post.objects.count(), 2)
|
self.assertEquals(Post.objects.count(), 2)
|
||||||
|
|
||||||
first_post = posts[0]
|
post = posts[0]
|
||||||
second_post = posts[1]
|
|
||||||
|
|
||||||
self.assertEquals(first_post.created, timezone.now())
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.remote_identifier,
|
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
|
||||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEquals(second_post.created, timezone.now())
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
second_post.remote_identifier,
|
post.remote_identifier,
|
||||||
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
)
|
)
|
||||||
|
self.assertEquals(post.body, "")
|
||||||
|
|
||||||
|
post = posts[1]
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
|
||||||
|
)
|
||||||
|
self.assertEquals(
|
||||||
|
post.remote_identifier,
|
||||||
|
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||||
|
)
|
||||||
|
self.assertEquals(post.body, "")
|
||||||
|
|
||||||
@freeze_time("2019-10-30 12:30:00")
|
|
||||||
def test_entry_without_author(self):
|
def test_entry_without_author(self):
|
||||||
builder = FeedBuilder
|
builder = FeedBuilder
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
|
|
@@ -217,23 +252,25 @@ class FeedBuilderTestCase(TestCase):
|
||||||
with builder((mock_without_author, mock_stream)) as builder:
|
with builder((mock_without_author, mock_stream)) as builder:
|
||||||
builder.save()
|
builder.save()
|
||||||
|
|
||||||
posts = Post.objects.order_by("id")
|
posts = Post.objects.order_by("-publication_date")
|
||||||
self.assertEquals(Post.objects.count(), 2)
|
self.assertEquals(Post.objects.count(), 2)
|
||||||
|
|
||||||
first_post = posts[0]
|
post = posts[0]
|
||||||
second_post = posts[1]
|
|
||||||
|
|
||||||
self.assertEquals(first_post.created, timezone.now())
|
self.assertEquals(post.created, timezone.now())
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
first_post.remote_identifier,
|
post.remote_identifier,
|
||||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||||
)
|
)
|
||||||
|
self.assertEquals(post.author, None)
|
||||||
|
|
||||||
self.assertEquals(second_post.created, timezone.now())
|
post = posts[1]
|
||||||
|
|
||||||
|
self.assertEquals(post.created, timezone.now())
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
second_post.remote_identifier,
|
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
|
||||||
"https://www.bbc.co.uk/news/technology-48334739",
|
|
||||||
)
|
)
|
||||||
|
self.assertEquals(post.author, None)
|
||||||
|
|
||||||
def test_empty_entries(self):
|
def test_empty_entries(self):
|
||||||
builder = FeedBuilder
|
builder = FeedBuilder
|
||||||
|
|
|
||||||
|
|
@@ -11,110 +11,137 @@ from newsreader.news.core.models import Post
|
||||||
from newsreader.news.core.tests.factories import PostFactory
|
from newsreader.news.core.tests.factories import PostFactory
|
||||||
|
|
||||||
|
|
||||||
|
@freeze_time("2019-10-30 12:30:00")
|
||||||
class FeedDuplicateHandlerTestCase(TestCase):
|
class FeedDuplicateHandlerTestCase(TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.maxDiff = None
|
self.maxDiff = None
|
||||||
|
|
||||||
def test_duplicate_entries_with_remote_identifiers(self):
|
def test_duplicate_entries_with_remote_identifiers(self):
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
|
|
||||||
existing_post = PostFactory.create(
|
existing_post = PostFactory.create(
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule
|
||||||
)
|
)
|
||||||
new_post = PostFactory.build(
|
|
||||||
|
new_posts = PostFactory.build_batch(
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
title="title got updated",
|
publication_date=timezone.now() - timedelta(days=7),
|
||||||
|
rule=rule,
|
||||||
|
size=5,
|
||||||
|
)
|
||||||
|
last_post = PostFactory.build(
|
||||||
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
|
publication_date=timezone.now(),
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
|
|
||||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||||
posts_gen = duplicate_handler.check([new_post])
|
posts = duplicate_handler.check((*new_posts, last_post))
|
||||||
posts = list(posts_gen)
|
|
||||||
|
|
||||||
self.assertEquals(len(posts), 1)
|
self.assertEquals(len(posts), 1)
|
||||||
|
|
||||||
post = posts[0]
|
post = posts[0]
|
||||||
existing_post.refresh_from_db()
|
|
||||||
|
|
||||||
self.assertEquals(existing_post.pk, post.pk)
|
self.assertEquals(
|
||||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.title, new_post.title)
|
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.body, new_post.body)
|
)
|
||||||
self.assertEquals(post.rule, new_post.rule)
|
self.assertEquals(post.title, last_post.title)
|
||||||
|
self.assertEquals(post.body, last_post.body)
|
||||||
|
self.assertEquals(post.rule, last_post.rule)
|
||||||
self.assertEquals(post.read, False)
|
self.assertEquals(post.read, False)
|
||||||
|
|
||||||
@freeze_time("2019-10-30 12:30:00")
|
|
||||||
def test_duplicate_entries_with_different_remote_identifiers(self):
|
def test_duplicate_entries_with_different_remote_identifiers(self):
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
publication_date = timezone.now()
|
|
||||||
|
|
||||||
existing_post = PostFactory.create(
|
existing_post = PostFactory(
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
url="https://bbc.com",
|
url="https://bbc.com",
|
||||||
title="New post",
|
title="New post",
|
||||||
body="Body",
|
body="Body",
|
||||||
publication_date=publication_date,
|
publication_date=timezone.now() - timedelta(minutes=10),
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
new_post = PostFactory.build(
|
|
||||||
|
new_posts = PostFactory.build_batch(
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
|
||||||
url="https://bbc.com",
|
url="https://bbc.com",
|
||||||
title="New post",
|
title="New post",
|
||||||
body="Body",
|
body="Body",
|
||||||
publication_date=publication_date,
|
publication_date=timezone.now() - timedelta(minutes=5),
|
||||||
|
rule=rule,
|
||||||
|
size=5,
|
||||||
|
)
|
||||||
|
last_post = PostFactory.build(
|
||||||
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
|
||||||
|
url="https://bbc.com",
|
||||||
|
title="New post",
|
||||||
|
body="Body",
|
||||||
|
publication_date=timezone.now(),
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
|
|
||||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||||
posts_gen = duplicate_handler.check([new_post])
|
posts = duplicate_handler.check((*new_posts, last_post))
|
||||||
posts = list(posts_gen)
|
|
||||||
|
|
||||||
self.assertEquals(len(posts), 1)
|
self.assertEquals(len(posts), 1)
|
||||||
|
|
||||||
existing_post.refresh_from_db()
|
|
||||||
post = posts[0]
|
post = posts[0]
|
||||||
|
|
||||||
self.assertEquals(existing_post.pk, post.pk)
|
self.assertEquals(
|
||||||
self.assertEquals(post.title, new_post.title)
|
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.body, new_post.body)
|
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.rule, new_post.rule)
|
)
|
||||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
self.assertEquals(post.title, last_post.title)
|
||||||
|
self.assertEquals(post.body, last_post.body)
|
||||||
|
self.assertEquals(post.rule, last_post.rule)
|
||||||
self.assertEquals(post.read, False)
|
self.assertEquals(post.read, False)
|
||||||
|
|
||||||
def test_duplicate_entries_in_recent_database(self):
|
def test_duplicate_entries_in_recent_database(self):
|
||||||
publication_date = timezone.now()
|
|
||||||
|
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
existing_post = PostFactory.create(
|
|
||||||
|
existing_post = PostFactory(
|
||||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
title="Birmingham head teacher threatened over LGBT lessons",
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
body="Google's move to end business ties with Huawei will affect current devices",
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
publication_date=publication_date,
|
publication_date=timezone.now() - timedelta(minutes=10),
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
remote_identifier=None,
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
new_post = PostFactory.build(
|
|
||||||
|
new_posts = PostFactory.build_batch(
|
||||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
title="Birmingham head teacher threatened over LGBT lessons",
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
body="Google's move to end business ties with Huawei will affect current devices",
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
publication_date=publication_date,
|
publication_date=timezone.now() - timedelta(minutes=5),
|
||||||
|
remote_identifier=None,
|
||||||
|
rule=rule,
|
||||||
|
size=5,
|
||||||
|
)
|
||||||
|
|
||||||
|
last_post = PostFactory.build(
|
||||||
|
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
|
publication_date=timezone.now(),
|
||||||
remote_identifier=None,
|
remote_identifier=None,
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
|
|
||||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||||
posts_gen = duplicate_handler.check([new_post])
|
posts = duplicate_handler.check((*new_posts, last_post))
|
||||||
posts = list(posts_gen)
|
|
||||||
|
|
||||||
self.assertEquals(len(posts), 1)
|
self.assertEquals(len(posts), 1)
|
||||||
|
|
||||||
existing_post.refresh_from_db()
|
|
||||||
post = posts[0]
|
post = posts[0]
|
||||||
|
|
||||||
self.assertEquals(existing_post.pk, post.pk)
|
self.assertEquals(
|
||||||
self.assertEquals(post.title, new_post.title)
|
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.body, new_post.body)
|
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.rule, new_post.rule)
|
)
|
||||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
self.assertEquals(post.title, last_post.title)
|
||||||
|
self.assertEquals(post.body, last_post.body)
|
||||||
|
self.assertEquals(post.rule, last_post.rule)
|
||||||
self.assertEquals(post.read, False)
|
self.assertEquals(post.read, False)
|
||||||
|
|
||||||
def test_multiple_existing_entries_with_identifier(self):
|
def test_multiple_existing_entries_with_identifier(self):
|
||||||
|
|
@@ -124,15 +151,20 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, size=5
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, size=5
|
||||||
)
|
)
|
||||||
|
|
||||||
new_post = PostFactory.build(
|
new_posts = PostFactory.build_batch(
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
title="This is a new one",
|
publication_date=timezone.now() - timedelta(hours=5),
|
||||||
|
rule=rule,
|
||||||
|
size=5,
|
||||||
|
)
|
||||||
|
last_post = PostFactory.build(
|
||||||
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
|
publication_date=timezone.now() - timedelta(minutes=5),
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
|
|
||||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||||
posts_gen = duplicate_handler.check([new_post])
|
posts = duplicate_handler.check((*new_posts, last_post))
|
||||||
posts = list(posts_gen)
|
|
||||||
|
|
||||||
self.assertEquals(len(posts), 1)
|
self.assertEquals(len(posts), 1)
|
||||||
|
|
||||||
|
|
@ -145,77 +177,101 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
||||||
|
|
||||||
post = posts[0]
|
post = posts[0]
|
||||||
|
|
||||||
self.assertEquals(post.title, new_post.title)
|
self.assertEquals(
|
||||||
self.assertEquals(post.body, new_post.body)
|
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.rule, new_post.rule)
|
)
|
||||||
|
self.assertEquals(post.title, last_post.title)
|
||||||
|
self.assertEquals(post.body, last_post.body)
|
||||||
|
self.assertEquals(post.rule, last_post.rule)
|
||||||
self.assertEquals(post.read, False)
|
self.assertEquals(post.read, False)
|
||||||
|
|
||||||
@freeze_time("2019-10-30 12:30:00")
|
|
||||||
def test_duplicate_entries_outside_time_slot(self):
|
def test_duplicate_entries_outside_time_slot(self):
|
||||||
publication_date = timezone.now()
|
|
||||||
|
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
existing_post = PostFactory.create(
|
|
||||||
|
existing_post = PostFactory(
|
||||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
title="Birmingham head teacher threatened over LGBT lessons",
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
body="Google's move to end business ties with Huawei will affect current devices",
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
publication_date=publication_date,
|
publication_date=timezone.now(),
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
remote_identifier=None,
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
new_post = PostFactory.build(
|
|
||||||
|
new_posts = PostFactory.build_batch(
|
||||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
title="Birmingham head teacher threatened over LGBT lessons",
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
body="Google's move to end business ties with Huawei will affect current devices",
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
publication_date=publication_date + timedelta(minutes=12),
|
publication_date=timezone.now() + timedelta(minutes=12),
|
||||||
|
remote_identifier=None,
|
||||||
|
rule=rule,
|
||||||
|
size=5,
|
||||||
|
)
|
||||||
|
last_post = PostFactory.build(
|
||||||
|
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||||
|
title="Birmingham head teacher threatened over LGBT lessons",
|
||||||
|
body="Google's move to end business ties with Huawei will affect current devices",
|
||||||
|
publication_date=timezone.now() + timedelta(minutes=13),
|
||||||
remote_identifier=None,
|
remote_identifier=None,
|
||||||
rule=rule,
|
rule=rule,
|
||||||
)
|
)
|
||||||
|
|
||||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||||
posts_gen = duplicate_handler.check([new_post])
|
posts = duplicate_handler.check((*new_posts, last_post))
|
||||||
posts = list(posts_gen)
|
|
||||||
|
|
||||||
self.assertEquals(len(posts), 1)
|
self.assertEquals(len(posts), 1)
|
||||||
|
|
||||||
existing_post.refresh_from_db()
|
|
||||||
post = posts[0]
|
post = posts[0]
|
||||||
|
|
||||||
self.assertEquals(post.pk, None)
|
self.assertEquals(post.pk, None)
|
||||||
self.assertEquals(post.title, new_post.title)
|
self.assertEquals(
|
||||||
self.assertEquals(post.body, new_post.body)
|
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.rule, new_post.rule)
|
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
)
|
||||||
|
self.assertEquals(post.title, last_post.title)
|
||||||
|
self.assertEquals(post.body, last_post.body)
|
||||||
|
self.assertEquals(post.rule, last_post.rule)
|
||||||
self.assertEquals(post.read, False)
|
self.assertEquals(post.read, False)
|
||||||
|
|
||||||
def test_duplicate_entries_in_collected_entries(self):
|
def test_duplicate_entries_in_collected_entries(self):
|
||||||
rule = CollectionRuleFactory()
|
rule = CollectionRuleFactory()
|
||||||
post_1 = PostFactory.build(
|
post_1 = PostFactory.build(
|
||||||
title="title got updated", body="body", url="https://bbc.com", rule=rule
|
title="title got updated",
|
||||||
|
body="body",
|
||||||
|
url="https://bbc.com",
|
||||||
|
publication_date=timezone.now(),
|
||||||
|
rule=rule,
|
||||||
)
|
)
|
||||||
duplicate_post_1 = PostFactory.build(
|
duplicate_post_1 = PostFactory.build(
|
||||||
title="title got updated", body="body", url="https://bbc.com", rule=rule
|
title="title got updated",
|
||||||
|
body="body",
|
||||||
|
url="https://bbc.com",
|
||||||
|
publication_date=timezone.now() - timedelta(minutes=5),
|
||||||
|
rule=rule,
|
||||||
)
|
)
|
||||||
|
|
||||||
post_2 = PostFactory.build(
|
post_2 = PostFactory.build(
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
|
publication_date=timezone.now(),
|
||||||
)
|
)
|
||||||
duplicate_post_2 = PostFactory.build(
|
duplicate_post_2 = PostFactory.build(
|
||||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
|
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||||
|
publication_date=timezone.now() - timedelta(minutes=5),
|
||||||
)
|
)
|
||||||
|
|
||||||
collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2)
|
collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2)
|
||||||
|
|
||||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||||
posts_gen = duplicate_handler.check(collected_posts)
|
posts = duplicate_handler.check(collected_posts)
|
||||||
posts = list(posts_gen)
|
|
||||||
|
|
||||||
self.assertEquals(len(posts), 2)
|
self.assertEquals(len(posts), 2)
|
||||||
|
|
||||||
post = posts[0]
|
post = posts[0]
|
||||||
|
|
||||||
self.assertEquals(post_1.publication_date, post.publication_date)
|
self.assertEquals(
|
||||||
|
post_1.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
|
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
|
)
|
||||||
self.assertEquals(post_1.title, post.title)
|
self.assertEquals(post_1.title, post.title)
|
||||||
self.assertEquals(post_1.body, post.body)
|
self.assertEquals(post_1.body, post.body)
|
||||||
self.assertEquals(post_1.rule, post.rule)
|
self.assertEquals(post_1.rule, post.rule)
|
||||||
|
|
@ -223,7 +279,10 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
||||||
|
|
||||||
post = posts[1]
|
post = posts[1]
|
||||||
|
|
||||||
self.assertEquals(post_2.publication_date, post.publication_date)
|
self.assertEquals(
|
||||||
|
post_2.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
|
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||||
|
)
|
||||||
self.assertEquals(post_2.title, post.title)
|
self.assertEquals(post_2.title, post.title)
|
||||||
self.assertEquals(post_2.body, post.body)
|
self.assertEquals(post_2.body, post.body)
|
||||||
self.assertEquals(post_2.rule, post.rule)
|
self.assertEquals(post_2.rule, post.rule)
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ from newsreader.news.collection.models import CollectionRule
|
||||||
|
|
||||||
class Post(TimeStampedModel):
|
class Post(TimeStampedModel):
|
||||||
title = models.CharField(max_length=200, blank=True, null=True)
|
title = models.CharField(max_length=200, blank=True, null=True)
|
||||||
body = models.TextField(blank=True, null=True)
|
body = models.TextField(blank=True)
|
||||||
author = models.CharField(max_length=40, blank=True, null=True)
|
author = models.CharField(max_length=40, blank=True, null=True)
|
||||||
publication_date = models.DateTimeField(default=timezone.now)
|
publication_date = models.DateTimeField(default=timezone.now)
|
||||||
url = models.URLField(max_length=1024, blank=True, null=True)
|
url = models.URLField(max_length=1024, blank=True, null=True)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue