Fix data errors

This commit is contained in:
sonny 2020-03-05 00:02:40 +01:00
parent afc3c11775
commit 533561ba1e
27 changed files with 101 additions and 39 deletions

View file

@ -83,4 +83,4 @@ python linting:
script:
- isort -rc src/ --check-only
- black -l 88 --check src/
- autoflake -rc src/
- autoflake --check --remove-all-unused-imports --ignore-init-module-imports --recursive src/

View file

@ -1,16 +1,13 @@
import datetime
from django.conf import settings
from django.core import mail
from django.test import TestCase
from django.test.utils import override_settings
from django.urls import reverse
from django.utils.translation import gettext as _
from registration.models import RegistrationProfile
from newsreader.accounts.models import User
from newsreader.accounts.tests.factories import UserFactory
class ActivationTestCase(TestCase):

View file

@ -1,14 +1,10 @@
from typing import Dict
from django.contrib.auth.tokens import default_token_generator as token_generator
from django.core import mail
from django.test import TestCase
from django.urls import reverse
from django.utils.encoding import force_bytes
from django.utils.http import urlsafe_base64_encode
from django.utils.translation import gettext as _
from newsreader.accounts.models import User
from newsreader.accounts.tests.factories import UserFactory

View file

@ -1,13 +1,10 @@
from django.conf import settings
from django.core import mail
from django.test import TransactionTestCase as TestCase
from django.test.utils import override_settings
from django.urls import reverse
from django.utils.translation import gettext as _
from registration.models import RegistrationProfile
from newsreader.accounts.models import User
from newsreader.accounts.tests.factories import RegistrationProfileFactory, UserFactory

View file

@ -1,6 +1,6 @@
from django.test import TestCase
from django_celery_beat.models import IntervalSchedule, PeriodicTask
from django_celery_beat.models import PeriodicTask
from newsreader.accounts.models import User

View file

@ -1,4 +1,4 @@
from django.urls import include, path
from django.urls import path
from newsreader.accounts.views import (
ActivationCompleteView,

View file

@ -1,4 +1,4 @@
from .base import *
from .base import * # isort:skip
DEBUG = True

View file

@ -1,5 +1,4 @@
from .dev import *
from .dev import * # isort:skip
# Celery
# https://docs.celeryproject.org/en/latest/userguide/configuration.html

View file

@ -1,4 +1,4 @@
from .base import * # noqa
from .base import * # isort:skip
DEBUG = True

View file

@ -1,5 +1,3 @@
from django.db.models.query import QuerySet
from bs4 import BeautifulSoup
from newsreader.news.collection.exceptions import StreamParseException

View file

@ -6,7 +6,6 @@ from rest_framework.generics import (
get_object_or_404,
)
from rest_framework.response import Response
from rest_framework.serializers import Serializer
from newsreader.core.pagination import LargeResultSetPagination, ResultSetPagination
from newsreader.news.collection.models import CollectionRule

View file

@ -1,5 +1,8 @@
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist
from django.db.models.fields import CharField, TextField
from django.template.defaultfilters import truncatechars
from django.utils import timezone
@ -21,11 +24,13 @@ from newsreader.news.collection.exceptions import (
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.models import CollectionRule
from newsreader.news.collection.utils import build_publication_date, fetch
from newsreader.news.core.models import Post
logger = logging.getLogger(__name__)
class FeedBuilder(Builder):
instances = []
@ -164,7 +169,8 @@ class FeedClient(Client):
yield response_data
except StreamException as e:
stream.rule.error = e.message
length = stream.rule._meta.get_field("error").max_length
stream.rule.error = e.message[-length:]
stream.rule.succeeded = False
yield ({"entries": []}, stream)
@ -195,8 +201,8 @@ class FeedDuplicateHandler:
if instance.remote_identifier in self.existing_identifiers:
existing_post = self.handle_duplicate(instance)
if existing_post:
yield existing_post
yield existing_post
continue
elif not instance.remote_identifier and self.in_database(instance):
continue
@ -229,7 +235,16 @@ class FeedDuplicateHandler:
remote_identifier=instance.remote_identifier
)
except ObjectDoesNotExist:
return
logger.error(
f"Duplicate handler tried retrieving post {instance.remote_identifier} but failed doing so."
)
return instance
except MultipleObjectsReturned:
existing_instances = self.queryset.filter(
remote_identifier=instance.remote_identifier
).order_by("-publication_date")
existing_instance = existing_instances.last()
existing_instances.exclude(pk=existing_instance.pk).delete()
for field in instance._meta.get_fields():
getattr(existing_instance, field.name, object())

View file

@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2020-03-03 19:32
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("collection", "0004_auto_20190714_1422")]
operations = [
migrations.AlterField(
model_name="collectionrule",
name="error",
field=models.CharField(blank=True, max_length=1024, null=True),
)
]

View file

@ -33,7 +33,7 @@ class CollectionRule(TimeStampedModel):
last_suceeded = models.DateTimeField(blank=True, null=True)
succeeded = models.BooleanField(default=False)
error = models.CharField(max_length=255, blank=True, null=True)
error = models.CharField(max_length=1024, blank=True, null=True)
user = models.ForeignKey("accounts.User", _("Owner"), related_name="rules")

View file

@ -1,6 +1,6 @@
import json
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory

View file

@ -2,7 +2,7 @@ import json
from datetime import date, datetime, time
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
import pytz

View file

@ -1,6 +1,7 @@
from unittest.mock import MagicMock, patch
from django.test import TestCase
from django.utils.lorem_ipsum import words
from newsreader.news.collection.exceptions import (
StreamDeniedException,
@ -101,3 +102,29 @@ class FeedClientTestCase(TestCase):
self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with()
def test_client_catches_stream_parse_exception(self):
rule = CollectionRuleFactory.create()
mock_stream = MagicMock(rule=rule)
self.mocked_read.side_effect = StreamParseException("Stream has wrong contents")
with FeedClient([rule]) as client:
for data, stream in client:
self.assertEquals(data, {"entries": []})
self.assertEquals(stream.rule.error, "Stream has wrong contents")
self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with()
def test_client_catches_long_exception_text(self):
rule = CollectionRuleFactory.create()
mock_stream = MagicMock(rule=rule)
self.mocked_read.side_effect = StreamParseException(words(1000))
with FeedClient([rule]) as client:
for data, stream in client:
self.assertEquals(data, {"entries": []})
self.assertEquals(len(stream.rule.error), 1024)
self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with()

View file

@ -33,7 +33,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
self.assertTrue(post.title != existing_post.title)
def test_duplicate_entries_in_recent_database(self):
PostFactory.create_batch(size=20)
PostFactory.create_batch(size=10)
publication_date = timezone.now()
@ -60,3 +60,24 @@ class FeedDuplicateHandlerTestCase(TestCase):
posts = list(posts_gen)
self.assertEquals(len(posts), 0)
def test_multiple_existing_entries_with_identifier(self):
timezone.now()
rule = CollectionRuleFactory()
PostFactory.create_batch(
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", rule=rule, size=5
)
new_post = PostFactory.build(
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
title="This is a new one",
rule=rule,
)
with FeedDuplicateHandler(rule) as duplicate_handler:
posts_gen = duplicate_handler.check([new_post])
posts = list(posts_gen)
self.assertEquals(len(posts), 1)
self.assertEquals(posts[0].title, new_post.title)

View file

@ -1,7 +1,7 @@
import os
from django.conf import settings
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
from django.utils.translation import gettext_lazy as _

View file

@ -5,7 +5,6 @@ from django.utils import timezone
import requests
from requests.exceptions import RequestException
from requests.models import Response
from newsreader.news.collection.response_handler import ResponseHandler

View file

@ -13,8 +13,7 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from newsreader.accounts.permissions import IsPostOwner
from newsreader.core.pagination import LargeResultSetPagination, ResultSetPagination
from newsreader.news.collection.models import CollectionRule
from newsreader.core.pagination import LargeResultSetPagination
from newsreader.news.collection.serializers import RuleSerializer
from newsreader.news.core.filters import ReadFilter
from newsreader.news.core.models import Category, Post

View file

@ -1,6 +1,5 @@
from rest_framework import serializers
from newsreader.news import collection
from newsreader.news.core.models import Category, Post

View file

@ -1,6 +1,6 @@
import json
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory

View file

@ -2,7 +2,7 @@ import json
from datetime import date, datetime, time
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
import pytz

View file

@ -1,6 +1,6 @@
import json
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory

View file

@ -1,6 +1,6 @@
from datetime import date, datetime, time
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
import pytz

View file

@ -1,4 +1,4 @@
from django.test import Client, TestCase
from django.test import TestCase
from django.urls import reverse
from newsreader.accounts.tests.factories import UserFactory