0.2.3.6
- Update logging
- Update FeedDuplicateHandler
This commit is contained in:
parent
00f6427c57
commit
2be35bce53
7 changed files with 280 additions and 171 deletions
|
|
@ -3,10 +3,10 @@ python-linting:
|
|||
allow_failure: true
|
||||
image: python:3.7.4-slim-stretch
|
||||
before_script:
|
||||
- pip install poetry
|
||||
- pip install poetry --quiet
|
||||
- poetry config cache-dir ~/.cache/poetry
|
||||
- poetry config virtualenvs.in-project true
|
||||
- poetry install --no-interaction
|
||||
- poetry install --no-interaction --quiet
|
||||
script:
|
||||
- poetry run isort src/ --check-only --recursive
|
||||
- poetry run black src/ --line-length 88 --check
|
||||
|
|
|
|||
|
|
@ -6,10 +6,10 @@ python-tests:
|
|||
- memcached:1.5.22
|
||||
image: python:3.7.4-slim-stretch
|
||||
before_script:
|
||||
- pip install poetry
|
||||
- pip install poetry --quiet
|
||||
- poetry config cache-dir .cache/poetry
|
||||
- poetry config virtualenvs.in-project true
|
||||
- poetry install --no-interaction
|
||||
- poetry install --no-interaction --quiet
|
||||
script:
|
||||
- poetry run coverage run src/manage.py test newsreader
|
||||
- poetry run coverage report
|
||||
|
|
|
|||
|
|
@ -103,7 +103,7 @@ CACHES = {
|
|||
# https://docs.djangoproject.com/en/2.2/topics/logging/#configuring-logging
|
||||
LOGGING = {
|
||||
"version": 1,
|
||||
"disable_existing_loggers": False,
|
||||
"disable_existing_loggers": True,
|
||||
"filters": {
|
||||
"require_debug_false": {"()": "django.utils.log.RequireDebugFalse"},
|
||||
"require_debug_true": {"()": "django.utils.log.RequireDebugTrue"},
|
||||
|
|
@ -114,7 +114,11 @@ LOGGING = {
|
|||
"format": "[{server_time}] {message}",
|
||||
"style": "{",
|
||||
},
|
||||
"syslog": {"class": "logging.Formatter", "format": "{message}", "style": "{"},
|
||||
"syslog": {
|
||||
"class": "logging.Formatter",
|
||||
"format": "[newsreader] {message}",
|
||||
"style": "{",
|
||||
},
|
||||
},
|
||||
"handlers": {
|
||||
"console": {
|
||||
|
|
@ -124,6 +128,7 @@ LOGGING = {
|
|||
},
|
||||
"django.server": {
|
||||
"level": "INFO",
|
||||
"filters": ["require_debug_true"],
|
||||
"class": "logging.StreamHandler",
|
||||
"formatter": "django.server",
|
||||
},
|
||||
|
|
@ -157,7 +162,6 @@ LOGGING = {
|
|||
"level": "INFO",
|
||||
"propagate": False,
|
||||
},
|
||||
"celery": {"handlers": ["syslog", "console"], "level": "INFO"},
|
||||
"celery.task": {"handlers": ["syslog", "console"], "level": "INFO"},
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,9 +52,7 @@ class FeedBuilder(Builder):
|
|||
entries = data.get("entries", [])
|
||||
|
||||
instances = self.build(entries, stream.rule)
|
||||
posts = duplicate_handler.check(instances)
|
||||
|
||||
self.instances = [post for post in posts]
|
||||
self.instances = duplicate_handler.check(instances)
|
||||
|
||||
def build(self, entries, rule):
|
||||
field_mapping = {
|
||||
|
|
@ -196,22 +194,27 @@ class FeedDuplicateHandler:
|
|||
|
||||
def check(self, instances):
|
||||
deduplicated_instances = self.deduplicate_instances(instances)
|
||||
checked_instances = []
|
||||
|
||||
for instance in deduplicated_instances:
|
||||
if instance.remote_identifier in self.existing_identifiers:
|
||||
existing_post = self.handle_duplicate_identifier(instance)
|
||||
|
||||
yield existing_post
|
||||
checked_instances.append(existing_post)
|
||||
|
||||
continue
|
||||
elif self.in_database(instance):
|
||||
existing_post = self.get_duplicate_in_database(instance)
|
||||
|
||||
if self.in_time_slot(instance, existing_post):
|
||||
yield self.update_existing_post(instance, existing_post)
|
||||
checked_instances.append(
|
||||
self.update_existing_post(instance, existing_post)
|
||||
)
|
||||
continue
|
||||
|
||||
yield instance
|
||||
checked_instances.append(instance)
|
||||
|
||||
return checked_instances
|
||||
|
||||
def in_database(self, post):
|
||||
values = {field: getattr(post, field, None) for field in self.duplicate_fields}
|
||||
|
|
@ -229,23 +232,29 @@ class FeedDuplicateHandler:
|
|||
return True
|
||||
|
||||
def deduplicate_instances(self, instances):
|
||||
sorted_instances = sorted(
|
||||
instances, key=lambda instance: instance.publication_date, reverse=True
|
||||
)
|
||||
deduplicated_instances = []
|
||||
|
||||
for instance in instances:
|
||||
for instance in sorted_instances:
|
||||
instance_identifier = instance.remote_identifier
|
||||
duplicate = False
|
||||
|
||||
values = {
|
||||
field: getattr(instance, field, None) for field in self.duplicate_fields
|
||||
}
|
||||
duplicate = False
|
||||
|
||||
for deduplicated_instance in deduplicated_instances:
|
||||
deduplicated_identifier = deduplicated_instance.remote_identifier
|
||||
instance_identifier = instance.remote_identifier
|
||||
has_identifiers = deduplicated_identifier and instance_identifier
|
||||
|
||||
if self.is_duplicate(deduplicated_instance, values):
|
||||
duplicate = True
|
||||
break
|
||||
elif has_identifiers and deduplicated_identifier == instance_identifier:
|
||||
is_same_identifier = (
|
||||
has_identifiers and deduplicated_identifier == instance_identifier
|
||||
)
|
||||
is_duplicate = self.is_duplicate(deduplicated_instance, values)
|
||||
|
||||
if is_duplicate or is_same_identifier:
|
||||
duplicate = True
|
||||
break
|
||||
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ from newsreader.news.core.tests.factories import PostFactory
|
|||
from .mocks import *
|
||||
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
class FeedBuilderTestCase(TestCase):
|
||||
def setUp(self):
|
||||
self.maxDiff = None
|
||||
|
|
@ -30,8 +31,10 @@ class FeedBuilderTestCase(TestCase):
|
|||
|
||||
post = Post.objects.get()
|
||||
|
||||
d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
|
||||
aware_date = pytz.utc.localize(d)
|
||||
publication_date = datetime.combine(
|
||||
date(2019, 5, 20), time(hour=16, minute=7, second=37)
|
||||
)
|
||||
aware_date = pytz.utc.localize(publication_date)
|
||||
|
||||
self.assertEquals(post.publication_date, aware_date)
|
||||
self.assertEquals(Post.objects.count(), 1)
|
||||
|
|
@ -57,49 +60,60 @@ class FeedBuilderTestCase(TestCase):
|
|||
with builder((multiple_mock, mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
posts = Post.objects.order_by("id")
|
||||
posts = Post.objects.order_by("-publication_date")
|
||||
self.assertEquals(Post.objects.count(), 3)
|
||||
|
||||
first_post = posts[0]
|
||||
second_post = posts[1]
|
||||
post = posts[0]
|
||||
|
||||
d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
|
||||
aware_date = pytz.utc.localize(d)
|
||||
|
||||
self.assertEquals(first_post.publication_date, aware_date)
|
||||
publication_date = datetime.combine(
|
||||
date(2019, 5, 20), time(hour=16, minute=32, second=38)
|
||||
)
|
||||
aware_date = pytz.utc.localize(publication_date)
|
||||
|
||||
self.assertEquals(
|
||||
first_post.remote_identifier,
|
||||
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
post.url, "https://www.bbc.co.uk/news/uk-england-birmingham-48339080"
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
post.title, "Birmingham head teacher threatened over LGBT lessons"
|
||||
)
|
||||
|
||||
post = posts[1]
|
||||
|
||||
publication_date = datetime.combine(
|
||||
date(2019, 5, 20), time(hour=16, minute=7, second=37)
|
||||
)
|
||||
aware_date = pytz.utc.localize(publication_date)
|
||||
|
||||
self.assertEquals(
|
||||
post.publication_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
aware_date.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
||||
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
||||
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
||||
)
|
||||
|
||||
d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19))
|
||||
aware_date = pytz.utc.localize(d)
|
||||
|
||||
self.assertEquals(second_post.publication_date, aware_date)
|
||||
|
||||
self.assertEquals(
|
||||
second_post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/technology-48334739",
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
second_post.url, "https://www.bbc.co.uk/news/technology-48334739"
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
second_post.title, "Huawei's Android loss: How it affects you"
|
||||
)
|
||||
|
||||
def test_entry_without_remote_identifier(self):
|
||||
def test_entries_without_remote_identifier(self):
|
||||
builder = FeedBuilder
|
||||
rule = CollectionRuleFactory()
|
||||
mock_stream = MagicMock(rule=rule)
|
||||
|
|
@ -107,27 +121,37 @@ class FeedBuilderTestCase(TestCase):
|
|||
with builder((mock_without_identifier, mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
posts = Post.objects.order_by("id")
|
||||
posts = Post.objects.order_by("-publication_date")
|
||||
self.assertEquals(Post.objects.count(), 2)
|
||||
|
||||
first_post = posts[0]
|
||||
post = posts[0]
|
||||
|
||||
d = datetime.combine(date(2019, 5, 20), time(hour=16, minute=7, second=37))
|
||||
aware_date = pytz.utc.localize(d)
|
||||
|
||||
self.assertEquals(first_post.publication_date, aware_date)
|
||||
|
||||
self.assertEquals(first_post.remote_identifier, None)
|
||||
publication_date = datetime.combine(
|
||||
date(2019, 5, 20), time(hour=16, minute=7, second=37)
|
||||
)
|
||||
aware_date = pytz.utc.localize(publication_date)
|
||||
|
||||
self.assertEquals(post.publication_date, aware_date)
|
||||
self.assertEquals(post.remote_identifier, None)
|
||||
self.assertEquals(
|
||||
first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
||||
post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168"
|
||||
)
|
||||
self.assertEquals(
|
||||
post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
||||
)
|
||||
|
||||
self.assertEquals(
|
||||
first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
|
||||
)
|
||||
post = posts[1]
|
||||
|
||||
publication_date = datetime.combine(
|
||||
date(2019, 5, 20), time(hour=12, minute=19, second=19)
|
||||
)
|
||||
aware_date = pytz.utc.localize(publication_date)
|
||||
|
||||
self.assertEquals(post.publication_date, aware_date)
|
||||
self.assertEquals(post.remote_identifier, None)
|
||||
self.assertEquals(post.url, "https://www.bbc.co.uk/news/technology-48334739")
|
||||
self.assertEquals(post.title, "Huawei's Android loss: How it affects you")
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
def test_entry_without_publication_date(self):
|
||||
builder = FeedBuilder
|
||||
rule = CollectionRuleFactory()
|
||||
|
|
@ -136,25 +160,30 @@ class FeedBuilderTestCase(TestCase):
|
|||
with builder((mock_without_publish_date, mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
posts = Post.objects.order_by("id")
|
||||
posts = Post.objects.order_by("-publication_date")
|
||||
self.assertEquals(Post.objects.count(), 2)
|
||||
|
||||
first_post = posts[0]
|
||||
second_post = posts[1]
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(first_post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
first_post.remote_identifier,
|
||||
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
|
||||
)
|
||||
self.assertEquals(post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||
)
|
||||
|
||||
self.assertEquals(second_post.created, timezone.now())
|
||||
post = posts[1]
|
||||
|
||||
self.assertEquals(
|
||||
second_post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/technology-48334739",
|
||||
post.publication_date.strftime("%Y-%m-%d %H:%M"), "2019-10-30 12:30"
|
||||
)
|
||||
self.assertEquals(post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
|
||||
)
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
def test_entry_without_url(self):
|
||||
builder = FeedBuilder
|
||||
rule = CollectionRuleFactory()
|
||||
|
|
@ -163,25 +192,24 @@ class FeedBuilderTestCase(TestCase):
|
|||
with builder((mock_without_url, mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
posts = Post.objects.order_by("id")
|
||||
posts = Post.objects.order_by("-publication_date")
|
||||
self.assertEquals(Post.objects.count(), 2)
|
||||
|
||||
first_post = posts[0]
|
||||
second_post = posts[1]
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(first_post.created, timezone.now())
|
||||
self.assertEquals(post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
first_post.remote_identifier,
|
||||
post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||
)
|
||||
|
||||
self.assertEquals(second_post.created, timezone.now())
|
||||
post = posts[1]
|
||||
|
||||
self.assertEquals(post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
second_post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/technology-48334739",
|
||||
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
|
||||
)
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
def test_entry_without_body(self):
|
||||
builder = FeedBuilder
|
||||
rule = CollectionRuleFactory()
|
||||
|
|
@ -190,25 +218,32 @@ class FeedBuilderTestCase(TestCase):
|
|||
with builder((mock_without_body, mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
posts = Post.objects.order_by("id")
|
||||
posts = Post.objects.order_by("-publication_date")
|
||||
|
||||
self.assertEquals(Post.objects.count(), 2)
|
||||
|
||||
first_post = posts[0]
|
||||
second_post = posts[1]
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(first_post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
first_post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
|
||||
)
|
||||
|
||||
self.assertEquals(second_post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
second_post.remote_identifier,
|
||||
post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
)
|
||||
self.assertEquals(post.body, "")
|
||||
|
||||
post = posts[1]
|
||||
|
||||
self.assertEquals(
|
||||
post.created.strftime("%Y-%m-%d %H:%M:%S"), "2019-10-30 12:30:00"
|
||||
)
|
||||
self.assertEquals(
|
||||
post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||
)
|
||||
self.assertEquals(post.body, "")
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
def test_entry_without_author(self):
|
||||
builder = FeedBuilder
|
||||
rule = CollectionRuleFactory()
|
||||
|
|
@ -217,23 +252,25 @@ class FeedBuilderTestCase(TestCase):
|
|||
with builder((mock_without_author, mock_stream)) as builder:
|
||||
builder.save()
|
||||
|
||||
posts = Post.objects.order_by("id")
|
||||
posts = Post.objects.order_by("-publication_date")
|
||||
self.assertEquals(Post.objects.count(), 2)
|
||||
|
||||
first_post = posts[0]
|
||||
second_post = posts[1]
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(first_post.created, timezone.now())
|
||||
self.assertEquals(post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
first_post.remote_identifier,
|
||||
post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/world-us-canada-48338168",
|
||||
)
|
||||
self.assertEquals(post.author, None)
|
||||
|
||||
self.assertEquals(second_post.created, timezone.now())
|
||||
post = posts[1]
|
||||
|
||||
self.assertEquals(post.created, timezone.now())
|
||||
self.assertEquals(
|
||||
second_post.remote_identifier,
|
||||
"https://www.bbc.co.uk/news/technology-48334739",
|
||||
post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
|
||||
)
|
||||
self.assertEquals(post.author, None)
|
||||
|
||||
def test_empty_entries(self):
|
||||
builder = FeedBuilder
|
||||
|
|
|
|||
|
|
@ -11,110 +11,137 @@ from newsreader.news.core.models import Post
|
|||
from newsreader.news.core.tests.factories import PostFactory
|
||||
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
class FeedDuplicateHandlerTestCase(TestCase):
|
||||
def setUp(self):
|
||||
self.maxDiff = None
|
||||
|
||||
def test_duplicate_entries_with_remote_identifiers(self):
|
||||
rule = CollectionRuleFactory()
|
||||
|
||||
existing_post = PostFactory.create(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule
|
||||
)
|
||||
new_post = PostFactory.build(
|
||||
|
||||
new_posts = PostFactory.build_batch(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
title="title got updated",
|
||||
publication_date=timezone.now() - timedelta(days=7),
|
||||
rule=rule,
|
||||
size=5,
|
||||
)
|
||||
last_post = PostFactory.build(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
publication_date=timezone.now(),
|
||||
rule=rule,
|
||||
)
|
||||
|
||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||
posts_gen = duplicate_handler.check([new_post])
|
||||
posts = list(posts_gen)
|
||||
posts = duplicate_handler.check((*new_posts, last_post))
|
||||
|
||||
self.assertEquals(len(posts), 1)
|
||||
|
||||
post = posts[0]
|
||||
existing_post.refresh_from_db()
|
||||
|
||||
self.assertEquals(existing_post.pk, post.pk)
|
||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
||||
self.assertEquals(post.title, new_post.title)
|
||||
self.assertEquals(post.body, new_post.body)
|
||||
self.assertEquals(post.rule, new_post.rule)
|
||||
self.assertEquals(
|
||||
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
)
|
||||
self.assertEquals(post.title, last_post.title)
|
||||
self.assertEquals(post.body, last_post.body)
|
||||
self.assertEquals(post.rule, last_post.rule)
|
||||
self.assertEquals(post.read, False)
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
def test_duplicate_entries_with_different_remote_identifiers(self):
|
||||
rule = CollectionRuleFactory()
|
||||
publication_date = timezone.now()
|
||||
|
||||
existing_post = PostFactory.create(
|
||||
existing_post = PostFactory(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
url="https://bbc.com",
|
||||
title="New post",
|
||||
body="Body",
|
||||
publication_date=publication_date,
|
||||
publication_date=timezone.now() - timedelta(minutes=10),
|
||||
rule=rule,
|
||||
)
|
||||
new_post = PostFactory.build(
|
||||
|
||||
new_posts = PostFactory.build_batch(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
|
||||
url="https://bbc.com",
|
||||
title="New post",
|
||||
body="Body",
|
||||
publication_date=publication_date,
|
||||
publication_date=timezone.now() - timedelta(minutes=5),
|
||||
rule=rule,
|
||||
size=5,
|
||||
)
|
||||
last_post = PostFactory.build(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7Q",
|
||||
url="https://bbc.com",
|
||||
title="New post",
|
||||
body="Body",
|
||||
publication_date=timezone.now(),
|
||||
rule=rule,
|
||||
)
|
||||
|
||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||
posts_gen = duplicate_handler.check([new_post])
|
||||
posts = list(posts_gen)
|
||||
posts = duplicate_handler.check((*new_posts, last_post))
|
||||
|
||||
self.assertEquals(len(posts), 1)
|
||||
|
||||
existing_post.refresh_from_db()
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(existing_post.pk, post.pk)
|
||||
self.assertEquals(post.title, new_post.title)
|
||||
self.assertEquals(post.body, new_post.body)
|
||||
self.assertEquals(post.rule, new_post.rule)
|
||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
||||
self.assertEquals(
|
||||
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
)
|
||||
self.assertEquals(post.title, last_post.title)
|
||||
self.assertEquals(post.body, last_post.body)
|
||||
self.assertEquals(post.rule, last_post.rule)
|
||||
self.assertEquals(post.read, False)
|
||||
|
||||
def test_duplicate_entries_in_recent_database(self):
|
||||
publication_date = timezone.now()
|
||||
|
||||
rule = CollectionRuleFactory()
|
||||
existing_post = PostFactory.create(
|
||||
|
||||
existing_post = PostFactory(
|
||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
title="Birmingham head teacher threatened over LGBT lessons",
|
||||
body="Google's move to end business ties with Huawei will affect current devices",
|
||||
publication_date=publication_date,
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
publication_date=timezone.now() - timedelta(minutes=10),
|
||||
remote_identifier=None,
|
||||
rule=rule,
|
||||
)
|
||||
new_post = PostFactory.build(
|
||||
|
||||
new_posts = PostFactory.build_batch(
|
||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
title="Birmingham head teacher threatened over LGBT lessons",
|
||||
body="Google's move to end business ties with Huawei will affect current devices",
|
||||
publication_date=publication_date,
|
||||
publication_date=timezone.now() - timedelta(minutes=5),
|
||||
remote_identifier=None,
|
||||
rule=rule,
|
||||
size=5,
|
||||
)
|
||||
|
||||
last_post = PostFactory.build(
|
||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
title="Birmingham head teacher threatened over LGBT lessons",
|
||||
body="Google's move to end business ties with Huawei will affect current devices",
|
||||
publication_date=timezone.now(),
|
||||
remote_identifier=None,
|
||||
rule=rule,
|
||||
)
|
||||
|
||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||
posts_gen = duplicate_handler.check([new_post])
|
||||
posts = list(posts_gen)
|
||||
posts = duplicate_handler.check((*new_posts, last_post))
|
||||
|
||||
self.assertEquals(len(posts), 1)
|
||||
|
||||
existing_post.refresh_from_db()
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(existing_post.pk, post.pk)
|
||||
self.assertEquals(post.title, new_post.title)
|
||||
self.assertEquals(post.body, new_post.body)
|
||||
self.assertEquals(post.rule, new_post.rule)
|
||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
||||
self.assertEquals(
|
||||
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
)
|
||||
self.assertEquals(post.title, last_post.title)
|
||||
self.assertEquals(post.body, last_post.body)
|
||||
self.assertEquals(post.rule, last_post.rule)
|
||||
self.assertEquals(post.read, False)
|
||||
|
||||
def test_multiple_existing_entries_with_identifier(self):
|
||||
|
|
@ -124,15 +151,20 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
|||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, size=5
|
||||
)
|
||||
|
||||
new_post = PostFactory.build(
|
||||
new_posts = PostFactory.build_batch(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
title="This is a new one",
|
||||
publication_date=timezone.now() - timedelta(hours=5),
|
||||
rule=rule,
|
||||
size=5,
|
||||
)
|
||||
last_post = PostFactory.build(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
publication_date=timezone.now() - timedelta(minutes=5),
|
||||
rule=rule,
|
||||
)
|
||||
|
||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||
posts_gen = duplicate_handler.check([new_post])
|
||||
posts = list(posts_gen)
|
||||
posts = duplicate_handler.check((*new_posts, last_post))
|
||||
|
||||
self.assertEquals(len(posts), 1)
|
||||
|
||||
|
|
@ -145,77 +177,101 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
|||
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(post.title, new_post.title)
|
||||
self.assertEquals(post.body, new_post.body)
|
||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
||||
self.assertEquals(post.rule, new_post.rule)
|
||||
self.assertEquals(
|
||||
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
)
|
||||
self.assertEquals(post.title, last_post.title)
|
||||
self.assertEquals(post.body, last_post.body)
|
||||
self.assertEquals(post.rule, last_post.rule)
|
||||
self.assertEquals(post.read, False)
|
||||
|
||||
@freeze_time("2019-10-30 12:30:00")
|
||||
def test_duplicate_entries_outside_time_slot(self):
|
||||
publication_date = timezone.now()
|
||||
|
||||
rule = CollectionRuleFactory()
|
||||
existing_post = PostFactory.create(
|
||||
|
||||
existing_post = PostFactory(
|
||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
title="Birmingham head teacher threatened over LGBT lessons",
|
||||
body="Google's move to end business ties with Huawei will affect current devices",
|
||||
publication_date=publication_date,
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
publication_date=timezone.now(),
|
||||
remote_identifier=None,
|
||||
rule=rule,
|
||||
)
|
||||
new_post = PostFactory.build(
|
||||
|
||||
new_posts = PostFactory.build_batch(
|
||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
title="Birmingham head teacher threatened over LGBT lessons",
|
||||
body="Google's move to end business ties with Huawei will affect current devices",
|
||||
publication_date=publication_date + timedelta(minutes=12),
|
||||
publication_date=timezone.now() + timedelta(minutes=12),
|
||||
remote_identifier=None,
|
||||
rule=rule,
|
||||
size=5,
|
||||
)
|
||||
last_post = PostFactory.build(
|
||||
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
|
||||
title="Birmingham head teacher threatened over LGBT lessons",
|
||||
body="Google's move to end business ties with Huawei will affect current devices",
|
||||
publication_date=timezone.now() + timedelta(minutes=13),
|
||||
remote_identifier=None,
|
||||
rule=rule,
|
||||
)
|
||||
|
||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||
posts_gen = duplicate_handler.check([new_post])
|
||||
posts = list(posts_gen)
|
||||
posts = duplicate_handler.check((*new_posts, last_post))
|
||||
|
||||
self.assertEquals(len(posts), 1)
|
||||
|
||||
existing_post.refresh_from_db()
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(post.pk, None)
|
||||
self.assertEquals(post.title, new_post.title)
|
||||
self.assertEquals(post.body, new_post.body)
|
||||
self.assertEquals(post.rule, new_post.rule)
|
||||
self.assertEquals(post.publication_date, new_post.publication_date)
|
||||
self.assertEquals(
|
||||
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
last_post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
)
|
||||
self.assertEquals(post.title, last_post.title)
|
||||
self.assertEquals(post.body, last_post.body)
|
||||
self.assertEquals(post.rule, last_post.rule)
|
||||
self.assertEquals(post.read, False)
|
||||
|
||||
def test_duplicate_entries_in_collected_entries(self):
|
||||
rule = CollectionRuleFactory()
|
||||
post_1 = PostFactory.build(
|
||||
title="title got updated", body="body", url="https://bbc.com", rule=rule
|
||||
title="title got updated",
|
||||
body="body",
|
||||
url="https://bbc.com",
|
||||
publication_date=timezone.now(),
|
||||
rule=rule,
|
||||
)
|
||||
duplicate_post_1 = PostFactory.build(
|
||||
title="title got updated", body="body", url="https://bbc.com", rule=rule
|
||||
title="title got updated",
|
||||
body="body",
|
||||
url="https://bbc.com",
|
||||
publication_date=timezone.now() - timedelta(minutes=5),
|
||||
rule=rule,
|
||||
)
|
||||
|
||||
post_2 = PostFactory.build(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
publication_date=timezone.now(),
|
||||
)
|
||||
duplicate_post_2 = PostFactory.build(
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7"
|
||||
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
|
||||
publication_date=timezone.now() - timedelta(minutes=5),
|
||||
)
|
||||
|
||||
collected_posts = (post_1, post_2, duplicate_post_1, duplicate_post_2)
|
||||
|
||||
with FeedDuplicateHandler(rule) as duplicate_handler:
|
||||
posts_gen = duplicate_handler.check(collected_posts)
|
||||
posts = list(posts_gen)
|
||||
posts = duplicate_handler.check(collected_posts)
|
||||
|
||||
self.assertEquals(len(posts), 2)
|
||||
|
||||
post = posts[0]
|
||||
|
||||
self.assertEquals(post_1.publication_date, post.publication_date)
|
||||
self.assertEquals(
|
||||
post_1.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
)
|
||||
self.assertEquals(post_1.title, post.title)
|
||||
self.assertEquals(post_1.body, post.body)
|
||||
self.assertEquals(post_1.rule, post.rule)
|
||||
|
|
@ -223,7 +279,10 @@ class FeedDuplicateHandlerTestCase(TestCase):
|
|||
|
||||
post = posts[1]
|
||||
|
||||
self.assertEquals(post_2.publication_date, post.publication_date)
|
||||
self.assertEquals(
|
||||
post_2.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
post.publication_date.strftime("%Y-%m-%d %H-%M-%S"),
|
||||
)
|
||||
self.assertEquals(post_2.title, post.title)
|
||||
self.assertEquals(post_2.body, post.body)
|
||||
self.assertEquals(post_2.rule, post.rule)
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from newsreader.news.collection.models import CollectionRule
|
|||
|
||||
class Post(TimeStampedModel):
|
||||
title = models.CharField(max_length=200, blank=True, null=True)
|
||||
body = models.TextField(blank=True, null=True)
|
||||
body = models.TextField(blank=True)
|
||||
author = models.CharField(max_length=40, blank=True, null=True)
|
||||
publication_date = models.DateTimeField(default=timezone.now)
|
||||
url = models.URLField(max_length=1024, blank=True, null=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue