Fix data errors

This commit is contained in:
sonny 2020-03-05 00:02:40 +01:00
parent afc3c11775
commit 533561ba1e
27 changed files with 101 additions and 39 deletions

View file

@ -83,4 +83,4 @@ python linting:
script: script:
- isort -rc src/ --check-only - isort -rc src/ --check-only
- black -l 88 --check src/ - black -l 88 --check src/
- autoflake -rc src/ - autoflake --check --remove-all-unused-imports --ignore-init-module-imports --recursive src/

View file

@ -1,16 +1,13 @@
import datetime import datetime
from django.conf import settings from django.conf import settings
from django.core import mail
from django.test import TestCase from django.test import TestCase
from django.test.utils import override_settings
from django.urls import reverse from django.urls import reverse
from django.utils.translation import gettext as _ from django.utils.translation import gettext as _
from registration.models import RegistrationProfile from registration.models import RegistrationProfile
from newsreader.accounts.models import User from newsreader.accounts.models import User
from newsreader.accounts.tests.factories import UserFactory
class ActivationTestCase(TestCase): class ActivationTestCase(TestCase):

View file

@ -1,14 +1,10 @@
from typing import Dict from typing import Dict
from django.contrib.auth.tokens import default_token_generator as token_generator
from django.core import mail from django.core import mail
from django.test import TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
from django.utils.encoding import force_bytes
from django.utils.http import urlsafe_base64_encode
from django.utils.translation import gettext as _ from django.utils.translation import gettext as _
from newsreader.accounts.models import User
from newsreader.accounts.tests.factories import UserFactory from newsreader.accounts.tests.factories import UserFactory

View file

@ -1,13 +1,10 @@
from django.conf import settings
from django.core import mail from django.core import mail
from django.test import TransactionTestCase as TestCase from django.test import TransactionTestCase as TestCase
from django.test.utils import override_settings
from django.urls import reverse from django.urls import reverse
from django.utils.translation import gettext as _ from django.utils.translation import gettext as _
from registration.models import RegistrationProfile from registration.models import RegistrationProfile
from newsreader.accounts.models import User
from newsreader.accounts.tests.factories import RegistrationProfileFactory, UserFactory from newsreader.accounts.tests.factories import RegistrationProfileFactory, UserFactory

View file

@ -1,6 +1,6 @@
from django.test import TestCase from django.test import TestCase
from django_celery_beat.models import IntervalSchedule, PeriodicTask from django_celery_beat.models import PeriodicTask
from newsreader.accounts.models import User from newsreader.accounts.models import User

View file

@ -1,4 +1,4 @@
from django.urls import include, path from django.urls import path
from newsreader.accounts.views import ( from newsreader.accounts.views import (
ActivationCompleteView, ActivationCompleteView,

View file

@ -1,4 +1,4 @@
from .base import * from .base import * # isort:skip
DEBUG = True DEBUG = True

View file

@ -1,5 +1,4 @@
from .dev import * from .dev import * # isort:skip
# Celery # Celery
# https://docs.celeryproject.org/en/latest/userguide/configuration.html # https://docs.celeryproject.org/en/latest/userguide/configuration.html

View file

@ -1,4 +1,4 @@
from .base import * # noqa from .base import * # isort:skip
DEBUG = True DEBUG = True

View file

@ -1,5 +1,3 @@
from django.db.models.query import QuerySet
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from newsreader.news.collection.exceptions import StreamParseException from newsreader.news.collection.exceptions import StreamParseException

View file

@ -6,7 +6,6 @@ from rest_framework.generics import (
get_object_or_404, get_object_or_404,
) )
from rest_framework.response import Response from rest_framework.response import Response
from rest_framework.serializers import Serializer
from newsreader.core.pagination import LargeResultSetPagination, ResultSetPagination from newsreader.core.pagination import LargeResultSetPagination, ResultSetPagination
from newsreader.news.collection.models import CollectionRule from newsreader.news.collection.models import CollectionRule

View file

@ -1,5 +1,8 @@
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
from django.db.models.fields import CharField, TextField from django.db.models.fields import CharField, TextField
from django.template.defaultfilters import truncatechars from django.template.defaultfilters import truncatechars
from django.utils import timezone from django.utils import timezone
@ -21,11 +24,13 @@ from newsreader.news.collection.exceptions import (
StreamParseException, StreamParseException,
StreamTimeOutException, StreamTimeOutException,
) )
from newsreader.news.collection.models import CollectionRule
from newsreader.news.collection.utils import build_publication_date, fetch from newsreader.news.collection.utils import build_publication_date, fetch
from newsreader.news.core.models import Post from newsreader.news.core.models import Post
logger = logging.getLogger(__name__)
class FeedBuilder(Builder): class FeedBuilder(Builder):
instances = [] instances = []
@ -164,7 +169,8 @@ class FeedClient(Client):
yield response_data yield response_data
except StreamException as e: except StreamException as e:
stream.rule.error = e.message length = stream.rule._meta.get_field("error").max_length
stream.rule.error = e.message[-length:]
stream.rule.succeeded = False stream.rule.succeeded = False
yield ({"entries": []}, stream) yield ({"entries": []}, stream)
@ -195,8 +201,8 @@ class FeedDuplicateHandler:
if instance.remote_identifier in self.existing_identifiers: if instance.remote_identifier in self.existing_identifiers:
existing_post = self.handle_duplicate(instance) existing_post = self.handle_duplicate(instance)
if existing_post: yield existing_post
yield existing_post
continue continue
elif not instance.remote_identifier and self.in_database(instance): elif not instance.remote_identifier and self.in_database(instance):
continue continue
@ -229,7 +235,16 @@ class FeedDuplicateHandler:
remote_identifier=instance.remote_identifier remote_identifier=instance.remote_identifier
) )
except ObjectDoesNotExist: except ObjectDoesNotExist:
return logger.error(
f"Duplicate handler tried retrieving post {instance.remote_identifier} but failed doing so."
)
return instance
except MultipleObjectsReturned:
existing_instances = self.queryset.filter(
remote_identifier=instance.remote_identifier
).order_by("-publication_date")
existing_instance = existing_instances.last()
existing_instances.exclude(pk=existing_instance.pk).delete()
for field in instance._meta.get_fields(): for field in instance._meta.get_fields():
getattr(existing_instance, field.name, object()) getattr(existing_instance, field.name, object())

View file

@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2020-03-03 19:32
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("collection", "0004_auto_20190714_1422")]
operations = [
migrations.AlterField(
model_name="collectionrule",
name="error",
field=models.CharField(blank=True, max_length=1024, null=True),
)
]

View file

@ -33,7 +33,7 @@ class CollectionRule(TimeStampedModel):
last_suceeded = models.DateTimeField(blank=True, null=True) last_suceeded = models.DateTimeField(blank=True, null=True)
succeeded = models.BooleanField(default=False) succeeded = models.BooleanField(default=False)
error = models.CharField(max_length=255, blank=True, null=True) error = models.CharField(max_length=1024, blank=True, null=True)
user = models.ForeignKey("accounts.User", _("Owner"), related_name="rules") user = models.ForeignKey("accounts.User", _("Owner"), related_name="rules")

View file

@ -1,6 +1,6 @@
import json import json
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory from newsreader.accounts.tests.factories import UserFactory

View file

@ -2,7 +2,7 @@ import json
from datetime import date, datetime, time from datetime import date, datetime, time
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
import pytz import pytz

View file

@ -1,6 +1,7 @@
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from django.test import TestCase from django.test import TestCase
from django.utils.lorem_ipsum import words
from newsreader.news.collection.exceptions import ( from newsreader.news.collection.exceptions import (
StreamDeniedException, StreamDeniedException,
@ -101,3 +102,29 @@ class FeedClientTestCase(TestCase):
self.assertEquals(stream.rule.succeeded, False) self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with() self.mocked_read.assert_called_once_with()
def test_client_catches_stream_parse_exception(self):
rule = CollectionRuleFactory.create()
mock_stream = MagicMock(rule=rule)
self.mocked_read.side_effect = StreamParseException("Stream has wrong contents")
with FeedClient([rule]) as client:
for data, stream in client:
self.assertEquals(data, {"entries": []})
self.assertEquals(stream.rule.error, "Stream has wrong contents")
self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with()
def test_client_catches_long_exception_text(self):
rule = CollectionRuleFactory.create()
mock_stream = MagicMock(rule=rule)
self.mocked_read.side_effect = StreamParseException(words(1000))
with FeedClient([rule]) as client:
for data, stream in client:
self.assertEquals(data, {"entries": []})
self.assertEquals(len(stream.rule.error), 1024)
self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with()

View file

@ -33,7 +33,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
self.assertTrue(post.title != existing_post.title) self.assertTrue(post.title != existing_post.title)
def test_duplicate_entries_in_recent_database(self): def test_duplicate_entries_in_recent_database(self):
PostFactory.create_batch(size=20) PostFactory.create_batch(size=10)
publication_date = timezone.now() publication_date = timezone.now()
@ -60,3 +60,24 @@ class FeedDuplicateHandlerTestCase(TestCase):
posts = list(posts_gen) posts = list(posts_gen)
self.assertEquals(len(posts), 0) self.assertEquals(len(posts), 0)
def test_multiple_existing_entries_with_identifier(self):
timezone.now()
rule = CollectionRuleFactory()
PostFactory.create_batch(
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, size=5
)
new_post = PostFactory.build(
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
title="This is a new one",
rule=rule,
)
with FeedDuplicateHandler(rule) as duplicate_handler:
posts_gen = duplicate_handler.check([new_post])
posts = list(posts_gen)
self.assertEquals(len(posts), 1)
self.assertEquals(posts[0].title, new_post.title)

View file

@ -1,7 +1,7 @@
import os import os
from django.conf import settings from django.conf import settings
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _

View file

@ -5,7 +5,6 @@ from django.utils import timezone
import requests import requests
from requests.exceptions import RequestException from requests.exceptions import RequestException
from requests.models import Response
from newsreader.news.collection.response_handler import ResponseHandler from newsreader.news.collection.response_handler import ResponseHandler

View file

@ -13,8 +13,7 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response from rest_framework.response import Response
from newsreader.accounts.permissions import IsPostOwner from newsreader.accounts.permissions import IsPostOwner
from newsreader.core.pagination import LargeResultSetPagination, ResultSetPagination from newsreader.core.pagination import LargeResultSetPagination
from newsreader.news.collection.models import CollectionRule
from newsreader.news.collection.serializers import RuleSerializer from newsreader.news.collection.serializers import RuleSerializer
from newsreader.news.core.filters import ReadFilter from newsreader.news.core.filters import ReadFilter
from newsreader.news.core.models import Category, Post from newsreader.news.core.models import Category, Post

View file

@ -1,6 +1,5 @@
from rest_framework import serializers from rest_framework import serializers
from newsreader.news import collection
from newsreader.news.core.models import Category, Post from newsreader.news.core.models import Category, Post

View file

@ -1,6 +1,6 @@
import json import json
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory from newsreader.accounts.tests.factories import UserFactory

View file

@ -2,7 +2,7 @@ import json
from datetime import date, datetime, time from datetime import date, datetime, time
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
import pytz import pytz

View file

@ -1,6 +1,6 @@
import json import json
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory from newsreader.accounts.tests.factories import UserFactory

View file

@ -1,6 +1,6 @@
from datetime import date, datetime, time from datetime import date, datetime, time
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
import pytz import pytz

View file

@ -1,4 +1,4 @@
from django.test import Client, TestCase from django.test import TestCase
from django.urls import reverse from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory from newsreader.accounts.tests.factories import UserFactory