From 48a9b25545770db48e318e3250919b134deef53d Mon Sep 17 00:00:00 2001 From: Sonny Date: Mon, 1 Jul 2019 09:36:01 +0200 Subject: [PATCH] Favicon fetcher --- src/newsreader/news/collection/admin.py | 15 +- src/newsreader/news/collection/base.py | 46 ++++- src/newsreader/news/collection/favicon.py | 113 +++++++++++++ src/newsreader/news/collection/feed.py | 8 +- .../management/commands/fetch_favicons.py | 11 ++ .../migrations/0007_auto_20190623_1837.py | 16 ++ .../migrations/0008_auto_20190623_1847.py | 16 ++ .../0009_collectionrule_website_url.py | 16 ++ .../migrations/0010_auto_20190628_2142.py | 19 +++ src/newsreader/news/collection/models.py | 8 +- .../news/collection/tests/__init__.py | 3 + .../news/collection/tests/factories.py | 1 + .../news/collection/tests/favicon/__init__.py | 3 + .../tests/favicon/builder/__init__.py | 1 + .../collection/tests/favicon/builder/mocks.py | 88 ++++++++++ .../collection/tests/favicon/builder/tests.py | 60 +++++++ .../tests/favicon/client/__init__.py | 1 + .../collection/tests/favicon/client/mocks.py | 12 ++ .../collection/tests/favicon/client/tests.py | 91 ++++++++++ .../tests/favicon/collector/__init__.py | 1 + .../tests/favicon/collector/mocks.py | 159 ++++++++++++++++++ .../tests/favicon/collector/tests.py | 147 ++++++++++++++++ .../collection/tests/feed/builder/tests.py | 105 ++++-------- .../collection/tests/feed/client/tests.py | 19 ++- .../collection/tests/feed/collector/tests.py | 85 +++++----- .../tests/feed/duplicate_handler/tests.py | 8 +- .../collection/tests/feed/stream/mocks.py | 111 ++++++------ .../collection/tests/feed/stream/tests.py | 47 +++--- src/newsreader/news/collection/tests/mocks.py | 47 ++++++ src/newsreader/news/collection/tests/tests.py | 134 +++++++++++++++ .../news/collection/tests/utils/__init__.py | 1 + .../news/collection/tests/utils/tests.py | 57 +++++++ src/newsreader/news/collection/utils.py | 17 +- 33 files changed, 1238 insertions(+), 228 deletions(-) create mode 100644 src/newsreader/news/collection/favicon.py create mode 100644 src/newsreader/news/collection/management/commands/fetch_favicons.py create mode 100644 src/newsreader/news/collection/migrations/0007_auto_20190623_1837.py create mode 100644 src/newsreader/news/collection/migrations/0008_auto_20190623_1847.py create mode 100644 src/newsreader/news/collection/migrations/0009_collectionrule_website_url.py create mode 100644 src/newsreader/news/collection/migrations/0010_auto_20190628_2142.py create mode 100644 src/newsreader/news/collection/tests/favicon/__init__.py create mode 100644 src/newsreader/news/collection/tests/favicon/builder/__init__.py create mode 100644 src/newsreader/news/collection/tests/favicon/builder/mocks.py create mode 100644 src/newsreader/news/collection/tests/favicon/builder/tests.py create mode 100644 src/newsreader/news/collection/tests/favicon/client/__init__.py create mode 100644 src/newsreader/news/collection/tests/favicon/client/mocks.py create mode 100644 src/newsreader/news/collection/tests/favicon/client/tests.py create mode 100644 src/newsreader/news/collection/tests/favicon/collector/__init__.py create mode 100644 src/newsreader/news/collection/tests/favicon/collector/mocks.py create mode 100644 src/newsreader/news/collection/tests/favicon/collector/tests.py create mode 100644 src/newsreader/news/collection/tests/mocks.py create mode 100644 src/newsreader/news/collection/tests/tests.py create mode 100644 src/newsreader/news/collection/tests/utils/__init__.py create mode 100644 src/newsreader/news/collection/tests/utils/tests.py diff --git a/src/newsreader/news/collection/admin.py b/src/newsreader/news/collection/admin.py index 972b020..77ae900 100644 --- a/src/newsreader/news/collection/admin.py +++ b/src/newsreader/news/collection/admin.py @@ -4,20 +4,9 @@ from newsreader.news.collection.models import CollectionRule class CollectionRuleAdmin(admin.ModelAdmin): - fields = ( - "url", - "name", - "timezone", - "category", - ) + fields = ("url", "name", "timezone", "category", "favicon") - list_display = ( - "name", - "category", - "url", - "last_suceeded", - "succeeded", - ) + list_display = ("name", "category", "url", "last_suceeded", "succeeded") admin.site.register(CollectionRule, CollectionRuleAdmin) diff --git a/src/newsreader/news/collection/base.py b/src/newsreader/news/collection/base.py index 0ab25d3..2f392b7 100644 --- a/src/newsreader/news/collection/base.py +++ b/src/newsreader/news/collection/base.py @@ -2,9 +2,13 @@ from typing import ContextManager, Dict, List, Optional, Tuple import requests +from bs4 import BeautifulSoup + from django.utils import timezone +from newsreader.news.collection.exceptions import StreamParseException from newsreader.news.collection.models import CollectionRule +from newsreader.news.collection.utils import fetch class Stream: @@ -12,9 +16,7 @@ class Stream: self.rule = rule def read(self) -> Tuple: - url = self.rule.url - response = requests.get(url) - return (self.parse(response.content), self) + raise NotImplementedError def parse(self, payload: bytes) -> Dict: raise NotImplementedError @@ -45,7 +47,7 @@ class Client: class Builder: instances = [] - def __init__(self, stream: Stream) -> None: + def __init__(self, stream: Tuple) -> None: self.stream = stream def __enter__(self) -> ContextManager: @@ -81,3 +83,39 @@ class Collector: class Meta: abstract = True + + +class WebsiteStream(Stream): + def __init__(self, url: str) -> None: + self.url = url + + def read(self) -> Tuple: + response = fetch(self.url) + + return (self.parse(response.content), self) + + def parse(self, payload: bytes) -> BeautifulSoup: + try: + return BeautifulSoup(payload, "lxml") + except TypeError: + raise StreamParseException("Could not parse given HTML") + + +class URLBuilder(Builder): + def __enter__(self) -> ContextManager: + return self + + def build(self) -> Tuple: + data, stream = self.stream + rule = stream.rule + + try: + url = data["feed"]["link"] + except (KeyError, TypeError): + url = None + + if url: + rule.website_url = url + rule.save() + + return rule, url diff --git a/src/newsreader/news/collection/favicon.py b/src/newsreader/news/collection/favicon.py new file mode 100644 index 0000000..8270692 --- /dev/null +++ b/src/newsreader/news/collection/favicon.py @@ -0,0 +1,113 @@ +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import ContextManager, List, Optional +from urllib.parse import urljoin, urlparse + +from newsreader.news.collection.base import ( + Builder, + Client, + Collector, + Stream, + URLBuilder, + WebsiteStream, +) +from newsreader.news.collection.exceptions import StreamException +from newsreader.news.collection.feed import FeedClient + +LINK_RELS = ["icon", "shortcut icon", "apple-touch-icon", "apple-touch-icon-precomposed"] + + +class FaviconBuilder(Builder): + def build(self) -> None: + rule, soup = self.stream + + url = self.parse(soup, rule.website_url) + + if url: + rule.favicon = url + rule.save() + + def parse(self, soup, website_url) -> Optional[str]: + if not soup.head: + return + + links = soup.head.find_all("link") + url = self.parse_links(links) + + if not url: + return + + parsed_url = urlparse(url) + + if not parsed_url.scheme and not parsed_url.netloc: + if not website_url: + return + return urljoin(website_url, url) + elif not parsed_url.scheme: + return urljoin(f"https://{parsed_url.netloc}", parsed_url.path) + + return url + + def parse_links(self, links: List) -> Optional[str]: + favicons = set() + icons = set() + + for link in links: + if not "href" in link.attrs: + continue + + if "favicon" in link["href"]: + favicons.add(link["href"].lower()) + + if "rel" in link.attrs: + for rel in link["rel"]: + if rel in LINK_RELS: + icons.add(link["href"].lower()) + + if favicons: + return favicons.pop() + elif icons: + return icons.pop() + + +class FaviconClient(Client): + stream = WebsiteStream + + def __init__(self, streams: List) -> None: + self.streams = streams + + def __enter__(self) -> ContextManager: + with ThreadPoolExecutor(max_workers=10) as executor: + futures = {executor.submit(stream.read): rule for rule, stream in self.streams} + + for future in as_completed(futures): + rule = futures[future] + + try: + response_data, stream = future.result() + except StreamException: + continue + + yield (rule, response_data) + + +class FaviconCollector(Collector): + feed_client, favicon_client = (FeedClient, FaviconClient) + url_builder, favicon_builder = (URLBuilder, FaviconBuilder) + + def collect(self, rules: Optional[List] = None) -> None: + streams = [] + + with self.feed_client(rules=rules) as client: + for data, stream in client: + with self.url_builder((data, stream)) as builder: + rule, url = builder.build() + + if not url: + continue + + streams.append((rule, WebsiteStream(url))) + + with self.favicon_client(streams) as client: + for rule, data in client: + with self.favicon_builder((rule, data)) as builder: + builder.build() diff --git a/src/newsreader/news/collection/feed.py b/src/newsreader/news/collection/feed.py index e2c098f..ef66b69 100644 --- a/src/newsreader/news/collection/feed.py +++ b/src/newsreader/news/collection/feed.py @@ -3,7 +3,6 @@ from typing import ContextManager, Dict, Generator, List, Optional, Tuple import bleach import pytz -import requests from feedparser import parse @@ -19,7 +18,7 @@ from newsreader.news.collection.exceptions import ( ) from newsreader.news.collection.models import CollectionRule from newsreader.news.collection.response_handler import ResponseHandler -from newsreader.news.collection.utils import build_publication_date +from newsreader.news.collection.utils import build_publication_date, fetch from newsreader.news.posts.models import Post @@ -92,10 +91,7 @@ class FeedBuilder(Builder): class FeedStream(Stream): def read(self) -> Tuple: url = self.rule.url - response = requests.get(url) - - with ResponseHandler(response) as response_handler: - response_handler.handle_response() + response = fetch(url) return (self.parse(response.content), self) diff --git a/src/newsreader/news/collection/management/commands/fetch_favicons.py b/src/newsreader/news/collection/management/commands/fetch_favicons.py new file mode 100644 index 0000000..1ee96cf --- /dev/null +++ b/src/newsreader/news/collection/management/commands/fetch_favicons.py @@ -0,0 +1,11 @@ +from django.core.management.base import BaseCommand + +from newsreader.news.collection.favicon import FaviconCollector + + +class Command(BaseCommand): + help = "Fetch favicons for collection rules" + + def handle(self, *args, **options): + collector = FaviconCollector() + collector.collect() diff --git a/src/newsreader/news/collection/migrations/0007_auto_20190623_1837.py b/src/newsreader/news/collection/migrations/0007_auto_20190623_1837.py new file mode 100644 index 0000000..cc27d6e --- /dev/null +++ b/src/newsreader/news/collection/migrations/0007_auto_20190623_1837.py @@ -0,0 +1,16 @@ +# Generated by Django 2.2 on 2019-06-23 18:37 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [("collection", "0006_collectionrule_error")] + + operations = [ + migrations.AlterField( + model_name="collectionrule", + name="favicon", + field=models.ImageField(default="favicons/default-favicon.ico", upload_to="favicons/"), + ) + ] diff --git a/src/newsreader/news/collection/migrations/0008_auto_20190623_1847.py b/src/newsreader/news/collection/migrations/0008_auto_20190623_1847.py new file mode 100644 index 0000000..3c8ae66 --- /dev/null +++ b/src/newsreader/news/collection/migrations/0008_auto_20190623_1847.py @@ -0,0 +1,16 @@ +# Generated by Django 2.2 on 2019-06-23 18:47 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [("collection", "0007_auto_20190623_1837")] + + operations = [ + migrations.AlterField( + model_name="collectionrule", + name="favicon", + field=models.URLField(blank=True, null=True), + ) + ] diff --git a/src/newsreader/news/collection/migrations/0009_collectionrule_website_url.py b/src/newsreader/news/collection/migrations/0009_collectionrule_website_url.py new file mode 100644 index 0000000..e5273b3 --- /dev/null +++ b/src/newsreader/news/collection/migrations/0009_collectionrule_website_url.py @@ -0,0 +1,16 @@ +# Generated by Django 2.2 on 2019-06-27 21:27 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [("collection", "0008_auto_20190623_1847")] + + operations = [ + migrations.AddField( + model_name="collectionrule", + name="website_url", + field=models.URLField(blank=True, editable=False, null=True), + ) + ] diff --git a/src/newsreader/news/collection/migrations/0010_auto_20190628_2142.py b/src/newsreader/news/collection/migrations/0010_auto_20190628_2142.py new file mode 100644 index 0000000..3726158 --- /dev/null +++ b/src/newsreader/news/collection/migrations/0010_auto_20190628_2142.py @@ -0,0 +1,19 @@ +# Generated by Django 2.2 on 2019-06-28 21:42 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [("collection", "0009_collectionrule_website_url")] + + operations = [ + migrations.AlterField( + model_name="collectionrule", name="url", field=models.URLField(max_length=1024) + ), + migrations.AlterField( + model_name="collectionrule", + name="website_url", + field=models.URLField(blank=True, editable=False, max_length=1024, null=True), + ), + ] diff --git a/src/newsreader/news/collection/models.py b/src/newsreader/news/collection/models.py index ffb4131..1de9849 100644 --- a/src/newsreader/news/collection/models.py +++ b/src/newsreader/news/collection/models.py @@ -1,5 +1,6 @@ import pytz +from django.conf import settings from django.db import models from django.utils.translation import gettext as _ @@ -8,8 +9,9 @@ class CollectionRule(models.Model): name = models.CharField(max_length=100) source = models.CharField(max_length=100) - url = models.URLField() - favicon = models.ImageField(blank=True, null=True) + url = models.URLField(max_length=1024) + website_url = models.URLField(max_length=1024, editable=False, blank=True, null=True) + favicon = models.URLField(blank=True, null=True) timezone = models.CharField( choices=((timezone, timezone) for timezone in pytz.all_timezones), @@ -23,7 +25,7 @@ class CollectionRule(models.Model): null=True, verbose_name=_("Category"), help_text=_("Posts from this rule will be tagged with this category"), - on_delete=models.SET_NULL + on_delete=models.SET_NULL, ) last_suceeded = models.DateTimeField(blank=True, null=True) diff --git a/src/newsreader/news/collection/tests/__init__.py b/src/newsreader/news/collection/tests/__init__.py index fb6723f..ea6a7c0 100644 --- a/src/newsreader/news/collection/tests/__init__.py +++ b/src/newsreader/news/collection/tests/__init__.py @@ -1 +1,4 @@ +from .favicon import * from .feed import * +from .tests import * +from .utils import * diff --git a/src/newsreader/news/collection/tests/factories.py b/src/newsreader/news/collection/tests/factories.py index 6b42292..be54806 100644 --- a/src/newsreader/news/collection/tests/factories.py +++ b/src/newsreader/news/collection/tests/factories.py @@ -10,3 +10,4 @@ class CollectionRuleFactory(factory.django.DjangoModelFactory): name = factory.Sequence(lambda n: "CollectionRule-{}".format(n)) source = factory.Faker("name") url = factory.Faker("url") + website_url = factory.Faker("url") diff --git a/src/newsreader/news/collection/tests/favicon/__init__.py b/src/newsreader/news/collection/tests/favicon/__init__.py new file mode 100644 index 0000000..5fb0299 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/__init__.py @@ -0,0 +1,3 @@ +from .builder import * +from .client import * +from .collector import * diff --git a/src/newsreader/news/collection/tests/favicon/builder/__init__.py b/src/newsreader/news/collection/tests/favicon/builder/__init__.py new file mode 100644 index 0000000..8baa6e5 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/builder/__init__.py @@ -0,0 +1 @@ +from .tests import * diff --git a/src/newsreader/news/collection/tests/favicon/builder/mocks.py b/src/newsreader/news/collection/tests/favicon/builder/mocks.py new file mode 100644 index 0000000..ce02475 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/builder/mocks.py @@ -0,0 +1,88 @@ +from bs4 import BeautifulSoup + +simple_mock = BeautifulSoup( + """ + + + + + +
+ + + """, + "lxml", +) + +mock_without_url = BeautifulSoup( + """ + + + + + +
+ + + """, + "lxml", +) + +mock_without_header = BeautifulSoup( + """ + + +
+ + + """, + "lxml", +) + +mock_with_weird_path = BeautifulSoup( + """ + + + + + +
+ + + """, + "lxml", +) + +mock_with_other_url = BeautifulSoup( + """ + + + + + + +
+ + + """, + "lxml", +) + +mock_with_multiple_icons = BeautifulSoup( + """ + + + + + + + + + + +
+ + + """, + "lxml", +) diff --git a/src/newsreader/news/collection/tests/favicon/builder/tests.py b/src/newsreader/news/collection/tests/favicon/builder/tests.py new file mode 100644 index 0000000..d08fce7 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/builder/tests.py @@ -0,0 +1,60 @@ +from freezegun import freeze_time + +from django.test import TestCase + +from newsreader.news.collection.favicon import FaviconBuilder +from newsreader.news.collection.tests.factories import CollectionRuleFactory +from newsreader.news.collection.tests.favicon.builder.mocks import * + + +class FaviconBuilderTestCase(TestCase): + def setUp(self): + self.maxDiff = None + + def test_simple(self): + rule = CollectionRuleFactory(favicon=None) + + with FaviconBuilder((rule, simple_mock)) as builder: + builder.build() + + self.assertEquals(rule.favicon, "https://www.bbc.com/favicon.ico") + + def test_without_url(self): + rule = CollectionRuleFactory(website_url="https://www.theguardian.com/", favicon=None) + + with FaviconBuilder((rule, mock_without_url)) as builder: + builder.build() + + self.assertEquals(rule.favicon, "https://www.theguardian.com/favicon.ico") + + def test_without_header(self): + rule = CollectionRuleFactory(favicon=None) + + with FaviconBuilder((rule, mock_without_header)) as builder: + builder.build() + + self.assertEquals(rule.favicon, None) + + def test_weird_path(self): + rule = CollectionRuleFactory(favicon=None) + + with FaviconBuilder((rule, mock_with_weird_path)) as builder: + builder.build() + + self.assertEquals(rule.favicon, "https://www.theguardian.com/jabadaba/doe/favicon.ico") + + def test_other_url(self): + rule = CollectionRuleFactory(favicon=None) + + with FaviconBuilder((rule, mock_with_other_url)) as builder: + builder.build() + + self.assertEquals(rule.favicon, "https://www.theguardian.com/icon.png") + + def test_url_with_favicon_takes_precedence(self): + rule = CollectionRuleFactory(favicon=None) + + with FaviconBuilder((rule, mock_with_multiple_icons)) as builder: + builder.build() + + self.assertEquals(rule.favicon, "https://www.bbc.com/favicon.ico") diff --git a/src/newsreader/news/collection/tests/favicon/client/__init__.py b/src/newsreader/news/collection/tests/favicon/client/__init__.py new file mode 100644 index 0000000..8baa6e5 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/client/__init__.py @@ -0,0 +1 @@ +from .tests import * diff --git a/src/newsreader/news/collection/tests/favicon/client/mocks.py b/src/newsreader/news/collection/tests/favicon/client/mocks.py new file mode 100644 index 0000000..ba79b27 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/client/mocks.py @@ -0,0 +1,12 @@ +from bs4 import BeautifulSoup + +simple_mock = BeautifulSoup( + """ + + +
+ + + """, + "lxml", +) diff --git a/src/newsreader/news/collection/tests/favicon/client/tests.py b/src/newsreader/news/collection/tests/favicon/client/tests.py new file mode 100644 index 0000000..4ac2a40 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/client/tests.py @@ -0,0 +1,91 @@ +from unittest.mock import MagicMock + +from django.test import TestCase + +from newsreader.news.collection.base import WebsiteStream +from newsreader.news.collection.exceptions import ( + StreamDeniedException, + StreamException, + StreamNotFoundException, + StreamTimeOutException, +) +from newsreader.news.collection.favicon import FaviconClient +from newsreader.news.collection.tests.factories import CollectionRuleFactory +from newsreader.news.collection.tests.favicon.client.mocks import simple_mock + + +class FaviconClientTestCase(TestCase): + def setUp(self): + self.maxDiff = None + + def test_simple(self): + rule = CollectionRuleFactory() + stream = MagicMock(url="https://www.bbc.com") + stream.read.return_value = (simple_mock, stream) + + with FaviconClient([(rule, stream)]) as client: + for rule, data in client: + self.assertEquals(rule.pk, rule.pk) + self.assertEquals(data, simple_mock) + + stream.read.assert_called_once_with() + + def test_client_catches_stream_exception(self): + rule = CollectionRuleFactory(error=None, succeeded=True) + stream = MagicMock(url="https://www.bbc.com") + stream.read.side_effect = StreamException + + with FaviconClient([(rule, stream)]) as client: + for rule, data in client: + pass + + stream.read.assert_called_once_with() + + # The favicon client does not set CollectionRule errors + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + + def test_client_catches_stream_not_found_exception(self): + rule = CollectionRuleFactory(error=None, succeeded=True) + stream = MagicMock(url="https://www.bbc.com") + stream.read.side_effect = StreamNotFoundException + + with FaviconClient([(rule, stream)]) as client: + for rule, data in client: + pass + + stream.read.assert_called_once_with() + + # The favicon client does not set CollectionRule errors + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + + def test_client_catches_stream_denied_exception(self): + rule = CollectionRuleFactory(error=None, succeeded=True) + stream = MagicMock(url="https://www.bbc.com") + stream.read.side_effect = StreamDeniedException + + with FaviconClient([(rule, stream)]) as client: + for rule, data in client: + pass + + stream.read.assert_called_once_with() + + # The favicon client does not set CollectionRule errors + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + + def test_client_catches_stream_timed_out(self): + rule = CollectionRuleFactory(error=None, succeeded=True) + stream = MagicMock(url="https://www.bbc.com") + stream.read.side_effect = StreamTimeOutException + + with FaviconClient([(rule, stream)]) as client: + for rule, data in client: + pass + + stream.read.assert_called_once_with() + + # The favicon client does not set CollectionRule errors + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) diff --git a/src/newsreader/news/collection/tests/favicon/collector/__init__.py b/src/newsreader/news/collection/tests/favicon/collector/__init__.py new file mode 100644 index 0000000..8baa6e5 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/collector/__init__.py @@ -0,0 +1 @@ +from .tests import * diff --git a/src/newsreader/news/collection/tests/favicon/collector/mocks.py b/src/newsreader/news/collection/tests/favicon/collector/mocks.py new file mode 100644 index 0000000..8e58167 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/collector/mocks.py @@ -0,0 +1,159 @@ +from time import struct_time + +from bs4 import BeautifulSoup + +feed_mock = { + "bozo": 0, + "encoding": "utf-8", + "entries": [ + { + "guidislink": False, + "href": "", + "id": "https://www.bbc.co.uk/news/world-us-canada-48338168", + "link": "https://www.bbc.co.uk/news/world-us-canada-48338168", + "links": [ + { + "href": "https://www.bbc.co.uk/news/world-us-canada-48338168", + "rel": "alternate", + "type": "text/html", + } + ], + "media_thumbnail": [ + { + "height": "1152", + "url": "http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg", + "width": "2048", + } + ], + "published": "Mon, 20 May 2019 16:07:37 GMT", + "published_parsed": struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), + "summary": "Foreign Minister Mohammad Javad Zarif says the US " + "president should try showing Iranians some respect.", + "summary_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/html", + "value": "Foreign Minister Mohammad Javad " + "Zarif says the US president should " + "try showing Iranians some " + "respect.", + }, + "title": "Trump's genocidal taunts will not end Iran - Zarif", + "title_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/plain", + "value": "Trump's genocidal taunts will not " "end Iran - Zarif", + }, + }, + { + "guidislink": False, + "href": "", + "id": "https://www.bbc.co.uk/news/technology-48334739", + "link": "https://www.bbc.co.uk/news/technology-48334739", + "links": [ + { + "href": "https://www.bbc.co.uk/news/technology-48334739", + "rel": "alternate", + "type": "text/html", + } + ], + "media_thumbnail": [ + { + "height": "432", + "url": "http://c.files.bbci.co.uk/4789/production/_107031381_mediaitem107028670.jpg", + "width": "768", + } + ], + "published": "Mon, 20 May 2019 12:19:19 GMT", + "published_parsed": struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0)), + "summary": "Google's move to end business ties with Huawei will " + "affect current devices and future purchases.", + "summary_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/html", + "value": "Google's move to end business ties " + "with Huawei will affect current " + "devices and future purchases.", + }, + "title": "Huawei's Android loss: How it affects you", + "title_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/plain", + "value": "Huawei's Android loss: How it " "affects you", + }, + }, + { + "guidislink": False, + "href": "", + "id": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", + "link": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", + "links": [ + { + "href": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", + "rel": "alternate", + "type": "text/html", + } + ], + "media_thumbnail": [ + { + "height": "549", + "url": "http://c.files.bbci.co.uk/11D67/production/_107036037_lgbtheadjpg.jpg", + "width": "976", + } + ], + "published": "Mon, 20 May 2019 16:32:38 GMT", + "published_parsed": struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)), + "summary": "Police are investigating the messages while an MP " + "calls for a protest exclusion zone to protect " + "children.", + "summary_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/html", + "value": "Police are investigating the " + "messages while an MP calls for a " + "protest exclusion zone to protect " + "children.", + }, + "title": "Birmingham head teacher threatened over LGBT lessons", + "title_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/plain", + "value": "Birmingham head teacher threatened " "over LGBT lessons", + }, + }, + ], + "feed": { + "image": { + "href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif", + "link": "https://www.bbc.co.uk/news/", + "title": "BBC News - Home", + "language": "en-gb", + "link": "https://www.bbc.co.uk/news/", + }, + "link": "https://www.bbc.co.uk/news/", + "links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}], + "title": "BBC News - Home", + }, + "href": "http://feeds.bbci.co.uk/news/rss.xml", + "status": 200, + "version": "rss20", +} + +website_mock = BeautifulSoup( + """ + + + + + +
+ + + """, + "lxml", +) diff --git a/src/newsreader/news/collection/tests/favicon/collector/tests.py b/src/newsreader/news/collection/tests/favicon/collector/tests.py new file mode 100644 index 0000000..6554de4 --- /dev/null +++ b/src/newsreader/news/collection/tests/favicon/collector/tests.py @@ -0,0 +1,147 @@ +from unittest.mock import MagicMock, patch + +import pytz + +from bs4 import BeautifulSoup + +from .mocks import feed_mock, website_mock + +from django.test import TestCase +from django.utils import timezone + +from newsreader.news.collection.exceptions import ( + StreamDeniedException, + StreamException, + StreamForbiddenException, + StreamNotFoundException, + StreamParseException, + StreamTimeOutException, +) +from newsreader.news.collection.favicon import FaviconCollector +from newsreader.news.collection.tests.factories import CollectionRuleFactory + + +class FaviconCollectorTestCase(TestCase): + def setUp(self): + self.maxDiff = None + + self.patched_feed_client = patch("newsreader.news.collection.favicon.FeedClient.__enter__") + self.mocked_feed_client = self.patched_feed_client.start() + + self.patched_website_read = patch("newsreader.news.collection.favicon.WebsiteStream.read") + self.mocked_website_read = self.patched_website_read.start() + + def tearDown(self): + patch.stopall() + + def test_simple(self): + rule = CollectionRuleFactory(succeeded=True, error=None) + + self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))] + self.mocked_website_read.return_value = (website_mock, MagicMock()) + + collector = FaviconCollector() + collector.collect() + + rule.refresh_from_db() + + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/") + self.assertEquals(rule.favicon, "https://www.bbc.co.uk/news/favicon.ico") + + def test_empty_stream(self): + rule = CollectionRuleFactory(succeeded=True, error=None) + + self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))] + self.mocked_website_read.return_value = (BeautifulSoup("", "lxml"), MagicMock()) + + collector = FaviconCollector() + collector.collect() + + rule.refresh_from_db() + + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/") + self.assertEquals(rule.favicon, None) + + def test_not_found(self): + rule = CollectionRuleFactory(succeeded=True, error=None) + + self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))] + self.mocked_website_read.side_effect = StreamNotFoundException + + collector = FaviconCollector() + collector.collect() + + rule.refresh_from_db() + + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/") + self.assertEquals(rule.favicon, None) + + def test_denied(self): + rule = CollectionRuleFactory(succeeded=True, error=None) + + self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))] + self.mocked_website_read.side_effect = StreamDeniedException + + collector = FaviconCollector() + collector.collect() + + rule.refresh_from_db() + + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/") + self.assertEquals(rule.favicon, None) + + def test_forbidden(self): + rule = CollectionRuleFactory(succeeded=True, error=None) + + self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))] + self.mocked_website_read.side_effect = StreamForbiddenException + + collector = FaviconCollector() + collector.collect() + + rule.refresh_from_db() + + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/") + self.assertEquals(rule.favicon, None) + + def test_timed_out(self): + rule = CollectionRuleFactory(succeeded=True, error=None) + + self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))] + self.mocked_website_read.side_effect = StreamTimeOutException + + collector = FaviconCollector() + collector.collect() + + rule.refresh_from_db() + + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/") + self.assertEquals(rule.favicon, None) + + def test_wrong_stream_content_type(self): + rule = CollectionRuleFactory(succeeded=True, error=None) + + self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))] + self.mocked_website_read.side_effect = StreamParseException + + collector = FaviconCollector() + collector.collect() + + rule.refresh_from_db() + + self.assertEquals(rule.succeeded, True) + self.assertEquals(rule.error, None) + self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/") + self.assertEquals(rule.favicon, None) diff --git a/src/newsreader/news/collection/tests/feed/builder/tests.py b/src/newsreader/news/collection/tests/feed/builder/tests.py index 6e4e44d..49a130a 100644 --- a/src/newsreader/news/collection/tests/feed/builder/tests.py +++ b/src/newsreader/news/collection/tests/feed/builder/tests.py @@ -5,26 +5,27 @@ import pytz from freezegun import freeze_time +from .mocks import * + from django.test import TestCase from django.utils import timezone from newsreader.news.collection.feed import FeedBuilder from newsreader.news.collection.tests.factories import CollectionRuleFactory -from newsreader.news.collection.tests.feed.builder.mocks import * from newsreader.news.posts.models import Post from newsreader.news.posts.tests.factories import PostFactory class FeedBuilderTestCase(TestCase): def setUp(self): - pass + self.maxDiff = None def test_basic_entry(self): builder = FeedBuilder rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((simple_mock, mock_stream,)) as builder: + with builder((simple_mock, mock_stream)) as builder: builder.save() post = Post.objects.get() @@ -36,26 +37,19 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(Post.objects.count(), 1) self.assertEquals( - post.remote_identifier, - "https://www.bbc.co.uk/news/world-us-canada-48338168" + post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) - self.assertEquals( - post.url, - "https://www.bbc.co.uk/news/world-us-canada-48338168" - ) + self.assertEquals(post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168") - self.assertEquals( - post.title, - "Trump's 'genocidal taunts' will not end Iran - Zarif" - ) + self.assertEquals(post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif") def test_multiple_entries(self): builder = FeedBuilder rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((multiple_mock, mock_stream,)) as builder: + with builder((multiple_mock, mock_stream)) as builder: builder.save() posts = Post.objects.order_by("id") @@ -70,19 +64,12 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(first_post.publication_date, aware_date) self.assertEquals( - first_post.remote_identifier, - "https://www.bbc.co.uk/news/world-us-canada-48338168" + first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) - self.assertEquals( - first_post.url, - "https://www.bbc.co.uk/news/world-us-canada-48338168" - ) + self.assertEquals(first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168") - self.assertEquals( - first_post.title, - "Trump's 'genocidal taunts' will not end Iran - Zarif" - ) + self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif") d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19)) aware_date = pytz.utc.localize(d) @@ -90,26 +77,19 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(second_post.publication_date, aware_date) self.assertEquals( - second_post.remote_identifier, - "https://www.bbc.co.uk/news/technology-48334739" + second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) - self.assertEquals( - second_post.url, - "https://www.bbc.co.uk/news/technology-48334739" - ) + self.assertEquals(second_post.url, "https://www.bbc.co.uk/news/technology-48334739") - self.assertEquals( - second_post.title, - "Huawei's Android loss: How it affects you" - ) + self.assertEquals(second_post.title, "Huawei's Android loss: How it affects you") def test_entry_without_remote_identifier(self): builder = FeedBuilder rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((mock_without_identifier, mock_stream,)) as builder: + with builder((mock_without_identifier, mock_stream)) as builder: builder.save() posts = Post.objects.order_by("id") @@ -124,15 +104,9 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(first_post.remote_identifier, None) - self.assertEquals( - first_post.url, - "https://www.bbc.co.uk/news/world-us-canada-48338168" - ) + self.assertEquals(first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168") - self.assertEquals( - first_post.title, - "Trump's 'genocidal taunts' will not end Iran - Zarif" - ) + self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif") @freeze_time("2019-10-30 12:30:00") def test_entry_without_publication_date(self): @@ -140,7 +114,7 @@ class FeedBuilderTestCase(TestCase): rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((mock_without_publish_date, mock_stream,)) as builder: + with builder((mock_without_publish_date, mock_stream)) as builder: builder.save() posts = Post.objects.order_by("id") @@ -151,14 +125,12 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(first_post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, - 'https://www.bbc.co.uk/news/world-us-canada-48338168' + first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) self.assertEquals(second_post.created, timezone.now()) self.assertEquals( - second_post.remote_identifier, - 'https://www.bbc.co.uk/news/technology-48334739' + second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) @freeze_time("2019-10-30 12:30:00") @@ -167,7 +139,7 @@ class FeedBuilderTestCase(TestCase): rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((mock_without_url, mock_stream,)) as builder: + with builder((mock_without_url, mock_stream)) as builder: builder.save() posts = Post.objects.order_by("id") @@ -178,14 +150,12 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(first_post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, - 'https://www.bbc.co.uk/news/world-us-canada-48338168' + first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) self.assertEquals(second_post.created, timezone.now()) self.assertEquals( - second_post.remote_identifier, - 'https://www.bbc.co.uk/news/technology-48334739' + second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) @freeze_time("2019-10-30 12:30:00") @@ -194,7 +164,7 @@ class FeedBuilderTestCase(TestCase): rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((mock_without_body, mock_stream,)) as builder: + with builder((mock_without_body, mock_stream)) as builder: builder.save() posts = Post.objects.order_by("id") @@ -205,14 +175,13 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(first_post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, - 'https://www.bbc.co.uk/news/world-us-canada-48338168' + first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) self.assertEquals(second_post.created, timezone.now()) self.assertEquals( second_post.remote_identifier, - 'https://www.bbc.co.uk/news/uk-england-birmingham-48339080' + "https://www.bbc.co.uk/news/uk-england-birmingham-48339080", ) @freeze_time("2019-10-30 12:30:00") @@ -221,7 +190,7 @@ class FeedBuilderTestCase(TestCase): rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((mock_without_author, mock_stream,)) as builder: + with builder((mock_without_author, mock_stream)) as builder: builder.save() posts = Post.objects.order_by("id") @@ -232,14 +201,12 @@ class FeedBuilderTestCase(TestCase): self.assertEquals(first_post.created, timezone.now()) self.assertEquals( - first_post.remote_identifier, - 'https://www.bbc.co.uk/news/world-us-canada-48338168' + first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168" ) self.assertEquals(second_post.created, timezone.now()) self.assertEquals( - second_post.remote_identifier, - 'https://www.bbc.co.uk/news/technology-48334739' + second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739" ) def test_empty_entries(self): @@ -247,7 +214,7 @@ class FeedBuilderTestCase(TestCase): rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((mock_without_entries, mock_stream,)) as builder: + with builder((mock_without_entries, mock_stream)) as builder: builder.save() self.assertEquals(Post.objects.count(), 0) @@ -265,7 +232,7 @@ class FeedBuilderTestCase(TestCase): remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule ) - with builder((mock_with_update_entries, mock_stream,)) as builder: + with builder((mock_with_update_entries, mock_stream)) as builder: builder.save() self.assertEquals(Post.objects.count(), 3) @@ -274,21 +241,17 @@ class FeedBuilderTestCase(TestCase): existing_second_post.refresh_from_db() self.assertEquals( - existing_first_post.title, - "Trump's 'genocidal taunts' will not end Iran - Zarif" + existing_first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif" ) - self.assertEquals( - existing_second_post.title, - "Huawei's Android loss: How it affects you" - ) + self.assertEquals(existing_second_post.title, "Huawei's Android loss: How it affects you") def test_html_sanitizing(self): builder = FeedBuilder rule = CollectionRuleFactory() mock_stream = MagicMock(rule=rule) - with builder((mock_with_html, mock_stream,)) as builder: + with builder((mock_with_html, mock_stream)) as builder: builder.save() post = Post.objects.get() diff --git a/src/newsreader/news/collection/tests/feed/client/tests.py b/src/newsreader/news/collection/tests/feed/client/tests.py index e236666..e49762d 100644 --- a/src/newsreader/news/collection/tests/feed/client/tests.py +++ b/src/newsreader/news/collection/tests/feed/client/tests.py @@ -1,5 +1,7 @@ from unittest.mock import MagicMock, patch +from .mocks import simple_mock + from django.test import TestCase from django.utils import timezone @@ -7,15 +9,17 @@ from newsreader.news.collection.exceptions import ( StreamDeniedException, StreamException, StreamNotFoundException, + StreamParseException, StreamTimeOutException, ) from newsreader.news.collection.feed import FeedClient from newsreader.news.collection.tests.factories import CollectionRuleFactory -from newsreader.news.collection.tests.feed.client.mocks import simple_mock class FeedClientTestCase(TestCase): def setUp(self): + self.maxDiff = None + self.patched_read = patch("newsreader.news.collection.feed.FeedStream.read") self.mocked_read = self.patched_read.start() @@ -85,3 +89,16 @@ class FeedClientTestCase(TestCase): self.assertEquals(stream.rule.succeeded, False) self.mocked_read.assert_called_once_with() + + def test_client_catches_stream_parse_exception(self): + rule = CollectionRuleFactory.create() + mock_stream = MagicMock(rule=rule) + self.mocked_read.side_effect = StreamParseException("Stream has wrong contents") + + with FeedClient([rule]) as client: + for data, stream in client: + self.assertEquals(data, {"entries": []}) + self.assertEquals(stream.rule.error, "Stream has wrong contents") + self.assertEquals(stream.rule.succeeded, False) + + self.mocked_read.assert_called_once_with() diff --git a/src/newsreader/news/collection/tests/feed/collector/tests.py b/src/newsreader/news/collection/tests/feed/collector/tests.py index db95ccb..0d14730 100644 --- a/src/newsreader/news/collection/tests/feed/collector/tests.py +++ b/src/newsreader/news/collection/tests/feed/collector/tests.py @@ -6,17 +6,26 @@ import pytz from freezegun import freeze_time -from django.test import TestCase -from django.utils import timezone - -from newsreader.news.collection.feed import FeedCollector -from newsreader.news.collection.tests.factories import CollectionRuleFactory -from newsreader.news.collection.tests.feed.collector.mocks import ( +from .mocks import ( duplicate_mock, empty_mock, multiple_mock, multiple_update_mock, ) + +from django.test import TestCase +from django.utils import timezone + +from newsreader.news.collection.exceptions import ( + StreamDeniedException, + StreamException, + StreamForbiddenException, + StreamNotFoundException, + StreamParseException, + StreamTimeOutException, +) +from newsreader.news.collection.feed import FeedCollector +from newsreader.news.collection.tests.factories import CollectionRuleFactory from newsreader.news.collection.utils import build_publication_date from newsreader.news.posts.models import Post from newsreader.news.posts.tests.factories import PostFactory @@ -24,14 +33,12 @@ from newsreader.news.posts.tests.factories import PostFactory class FeedCollectorTestCase(TestCase): def setUp(self): - self.patched_get = patch( - 'newsreader.news.collection.feed.requests.get' - ) - self.mocked_get = self.patched_get.start() + self.maxDiff = None - self.patched_parse = patch( - 'newsreader.news.collection.feed.FeedStream.parse' - ) + self.patched_get = patch("newsreader.news.collection.feed.fetch") + self.mocked_fetch = self.patched_get.start() + + self.patched_parse = patch("newsreader.news.collection.feed.FeedStream.parse") self.mocked_parse = self.patched_parse.start() def tearDown(self): @@ -54,7 +61,7 @@ class FeedCollectorTestCase(TestCase): @freeze_time("2019-10-30 12:30:00") def test_emtpy_batch(self): - self.mocked_get.return_value = MagicMock(status_code=200) + self.mocked_fetch.return_value = MagicMock() self.mocked_parse.return_value = empty_mock rule = CollectionRuleFactory() @@ -69,7 +76,7 @@ class FeedCollectorTestCase(TestCase): self.assertEquals(rule.last_suceeded, timezone.now()) def test_not_found(self): - self.mocked_get.return_value = MagicMock(status_code=404) + self.mocked_fetch.side_effect = StreamNotFoundException rule = CollectionRuleFactory() collector = FeedCollector() @@ -82,7 +89,7 @@ class FeedCollectorTestCase(TestCase): self.assertEquals(rule.error, "Stream not found") def test_denied(self): - self.mocked_get.return_value = MagicMock(status_code=404) + self.mocked_fetch.side_effect = StreamDeniedException last_suceeded = timezone.make_aware( datetime.combine(date=date(2019, 10, 30), time=time(12, 30)) ) @@ -95,11 +102,11 @@ class FeedCollectorTestCase(TestCase): self.assertEquals(Post.objects.count(), 0) self.assertEquals(rule.succeeded, False) - self.assertEquals(rule.error, "Stream not found") + self.assertEquals(rule.error, "Stream does not have sufficient permissions") self.assertEquals(rule.last_suceeded, last_suceeded) def test_forbidden(self): - self.mocked_get.return_value = MagicMock(status_code=403) + self.mocked_fetch.side_effect = StreamForbiddenException last_suceeded = timezone.make_aware( datetime.combine(date=date(2019, 10, 30), time=time(12, 30)) ) @@ -116,7 +123,7 @@ class FeedCollectorTestCase(TestCase): self.assertEquals(rule.last_suceeded, last_suceeded) def test_timed_out(self): - self.mocked_get.return_value = MagicMock(status_code=408) + self.mocked_fetch.side_effect = StreamTimeOutException last_suceeded = timezone.make_aware( datetime.combine(date=date(2019, 10, 30), time=time(12, 30)) ) @@ -138,8 +145,7 @@ class FeedCollectorTestCase(TestCase): rule = CollectionRuleFactory() _, aware_datetime = build_publication_date( - struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), - pytz.utc + struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), pytz.utc ) first_post = PostFactory( @@ -148,12 +154,11 @@ class FeedCollectorTestCase(TestCase): body="Foreign Minister Mohammad Javad Zarif says the US " "president should try showing Iranians some respect.", publication_date=aware_datetime, - rule=rule + rule=rule, ) _, aware_datetime = build_publication_date( - struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0,)), - pytz.utc + struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0)), pytz.utc ) second_post = PostFactory( @@ -162,22 +167,21 @@ class FeedCollectorTestCase(TestCase): body="Google's move to end business ties with Huawei will " "affect current devices and future purchases.", publication_date=aware_datetime, - rule=rule + rule=rule, ) _, aware_datetime = build_publication_date( - struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)), - pytz.utc + struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)), pytz.utc ) third_post = PostFactory( url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", title="Birmingham head teacher threatened over LGBT lessons", body="Police are investigating the messages while an MP " - "calls for a protest exclusion zone \"to protect " - "children\".", + 'calls for a protest exclusion zone "to protect ' + 'children".', publication_date=aware_datetime, - rule=rule + rule=rule, ) collector = FeedCollector() @@ -201,7 +205,7 @@ class FeedCollectorTestCase(TestCase): title="Trump", body="Foreign Minister Mohammad Javad Zarif", publication_date=timezone.now(), - rule=rule + rule=rule, ) second_post = PostFactory( @@ -210,7 +214,7 @@ class FeedCollectorTestCase(TestCase): title="Huawei's Android loss: How it affects you", body="Google's move to end business ties with Huawei will", publication_date=timezone.now(), - rule=rule + rule=rule, ) third_post = PostFactory( @@ -219,7 +223,7 @@ class FeedCollectorTestCase(TestCase): title="Birmingham head teacher threatened over LGBT lessons", body="Police are investigating the messages while an MP", publication_date=timezone.now(), - rule=rule + rule=rule, ) collector = FeedCollector() @@ -235,17 +239,8 @@ class FeedCollectorTestCase(TestCase): self.assertEquals(rule.last_suceeded, timezone.now()) self.assertEquals(rule.error, None) - self.assertEquals( - first_post.title, - "Trump's 'genocidal taunts' will not end Iran - Zarif" - ) + self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif") - self.assertEquals( - second_post.title, - "Huawei's Android loss: How it affects you" - ) + self.assertEquals(second_post.title, "Huawei's Android loss: How it affects you") - self.assertEquals( - third_post.title, - 'Birmingham head teacher threatened over LGBT lessons' - ) + self.assertEquals(third_post.title, "Birmingham head teacher threatened over LGBT lessons") diff --git a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py index a8600af..eff63cc 100644 --- a/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py +++ b/src/newsreader/news/collection/tests/feed/duplicate_handler/tests.py @@ -9,7 +9,7 @@ from newsreader.news.posts.tests.factories import PostFactory class FeedDuplicateHandlerTestCase(TestCase): def setUp(self): - pass + self.maxDiff = None def test_duplicate_entries_with_remote_identifiers(self): rule = CollectionRuleFactory() @@ -19,7 +19,7 @@ class FeedDuplicateHandlerTestCase(TestCase): new_post = PostFactory.build( remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7", title="title got updated", - rule=rule + rule=rule, ) with FeedDuplicateHandler(rule) as duplicate_handler: @@ -45,7 +45,7 @@ class FeedDuplicateHandlerTestCase(TestCase): body="Google's move to end business ties with Huawei will affect current devices", publication_date=publication_date, remote_identifier=None, - rule=rule + rule=rule, ) new_post = PostFactory.build( url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", @@ -53,7 +53,7 @@ class FeedDuplicateHandlerTestCase(TestCase): body="Google's move to end business ties with Huawei will affect current devices", publication_date=publication_date, remote_identifier=None, - rule=rule + rule=rule, ) with FeedDuplicateHandler(rule) as duplicate_handler: diff --git a/src/newsreader/news/collection/tests/feed/stream/mocks.py b/src/newsreader/news/collection/tests/feed/stream/mocks.py index 5853eb7..a098383 100644 --- a/src/newsreader/news/collection/tests/feed/stream/mocks.py +++ b/src/newsreader/news/collection/tests/feed/stream/mocks.py @@ -1,61 +1,62 @@ from time import struct_time simple_mock = { - 'bozo': 0, - 'encoding': 'utf-8', - 'entries': [{ - 'guidislink': False, - 'href': '', - 'id': 'https://www.bbc.co.uk/news/world-us-canada-48338168', - 'link': 'https://www.bbc.co.uk/news/world-us-canada-48338168', - 'links': [{ - 'href': 'https://www.bbc.co.uk/news/world-us-canada-48338168', - 'rel': 'alternate', - 'type': 'text/html' - }], - 'media_thumbnail': [{ - 'height': '1152', - 'url': 'http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg', - 'width': '2048' - }], - 'published': 'Mon, 20 May 2019 16:07:37 GMT', - 'published_parsed': struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), - 'summary': 'Foreign Minister Mohammad Javad Zarif says the US ' - 'president should try showing Iranians some respect.', - 'summary_detail': { - 'base': 'http://feeds.bbci.co.uk/news/rss.xml', - 'language': None, - 'type': 'text/html', - 'value': 'Foreign Minister Mohammad Javad ' - 'Zarif says the US president should ' - 'try showing Iranians some ' - 'respect.' - }, - 'title': "Trump's 'genocidal taunts' will not end Iran - Zarif", - 'title_detail': { - 'base': 'http://feeds.bbci.co.uk/news/rss.xml', - 'language': None, - 'type': 'text/plain', - 'value': "Trump's 'genocidal taunts' will not " - 'end Iran - Zarif' - } - }], - 'feed': { - 'image': { - 'href': 'https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif', - 'link': 'https://www.bbc.co.uk/news/', - 'title': 'BBC News - Home', - 'language': 'en-gb', - 'link': 'https://www.bbc.co.uk/news/' + "bozo": 1, + "encoding": "utf-8", + "entries": [ + { + "guidislink": False, + "href": "", + "id": "https://www.bbc.co.uk/news/world-us-canada-48338168", + "link": "https://www.bbc.co.uk/news/world-us-canada-48338168", + "links": [ + { + "href": "https://www.bbc.co.uk/news/world-us-canada-48338168", + "rel": "alternate", + "type": "text/html", + } + ], + "media_thumbnail": [ + { + "height": "1152", + "url": "http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg", + "width": "2048", + } + ], + "published": "Mon, 20 May 2019 16:07:37 GMT", + "published_parsed": struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), + "summary": "Foreign Minister Mohammad Javad Zarif says the US " + "president should try showing Iranians some respect.", + "summary_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/html", + "value": "Foreign Minister Mohammad Javad " + "Zarif says the US president should " + "try showing Iranians some " + "respect.", }, - 'links': [{ - 'href': 'https://www.bbc.co.uk/news/', - 'rel': 'alternate', - 'type': 'text/html' - }], - 'title': 'BBC News - Home', + "title": "Trump's 'genocidal taunts' will not end Iran - Zarif", + "title_detail": { + "base": "http://feeds.bbci.co.uk/news/rss.xml", + "language": None, + "type": "text/plain", + "value": "Trump's 'genocidal taunts' will not " "end Iran - Zarif", + }, + } + ], + "feed": { + "image": { + "href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif", + "link": "https://www.bbc.co.uk/news/", + "title": "BBC News - Home", + "language": "en-gb", + "link": "https://www.bbc.co.uk/news/", + }, + "links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}], + "title": "BBC News - Home", }, - 'href': 'http://feeds.bbci.co.uk/news/rss.xml', - 'status': 200, - 'version': 'rss20' + "href": "http://feeds.bbci.co.uk/news/rss.xml", + "status": 200, + "version": "rss20", } diff --git a/src/newsreader/news/collection/tests/feed/stream/tests.py b/src/newsreader/news/collection/tests/feed/stream/tests.py index 6e15194..3a7811b 100644 --- a/src/newsreader/news/collection/tests/feed/stream/tests.py +++ b/src/newsreader/news/collection/tests/feed/stream/tests.py @@ -1,5 +1,7 @@ from unittest.mock import MagicMock, patch +from .mocks import simple_mock + from django.test import TestCase from django.utils import timezone @@ -13,36 +15,32 @@ from newsreader.news.collection.exceptions import ( ) from newsreader.news.collection.feed import FeedStream from newsreader.news.collection.tests.factories import CollectionRuleFactory -from newsreader.news.collection.tests.feed.stream.mocks import simple_mock class FeedStreamTestCase(TestCase): def setUp(self): - self.patched_get = patch( - 'newsreader.news.collection.feed.requests.get' - ) - self.mocked_get = self.patched_get.start() + self.maxDiff = None - self.patched_parse = patch( - 'newsreader.news.collection.feed.FeedStream.parse' - ) - self.mocked_parse = self.patched_parse.start() + self.patched_fetch = patch("newsreader.news.collection.feed.fetch") + self.mocked_fetch = self.patched_fetch.start() def tearDown(self): patch.stopall() def test_simple_stream(self): - self.mocked_parse.return_value = simple_mock + self.mocked_fetch.return_value = MagicMock(content=simple_mock) rule = CollectionRuleFactory() stream = FeedStream(rule) - return_value = stream.read() - self.mocked_get.assert_called_once_with(rule.url) - self.assertEquals(return_value, (simple_mock, stream)) + data, stream = stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + self.assertEquals(data["entries"], data["entries"]) + self.assertEquals(stream, stream) def test_stream_raises_exception(self): - self.mocked_parse.side_effect = StreamException + self.mocked_fetch.side_effect = StreamException rule = CollectionRuleFactory() stream = FeedStream(rule) @@ -50,10 +48,10 @@ class FeedStreamTestCase(TestCase): with self.assertRaises(StreamException): stream.read() - self.mocked_get.assert_called_once_with(rule.url) + self.mocked_fetch.assert_called_once_with(rule.url) def test_stream_raises_denied_exception(self): - self.mocked_get.return_value = MagicMock(status_code=401) + self.mocked_fetch.side_effect = StreamDeniedException rule = CollectionRuleFactory() stream = FeedStream(rule) @@ -61,10 +59,10 @@ class FeedStreamTestCase(TestCase): with self.assertRaises(StreamDeniedException): stream.read() - self.mocked_get.assert_called_once_with(rule.url) + self.mocked_fetch.assert_called_once_with(rule.url) def test_stream_raises_not_found_exception(self): - self.mocked_get.return_value = MagicMock(status_code=404) + self.mocked_fetch.side_effect = StreamNotFoundException rule = CollectionRuleFactory() stream = FeedStream(rule) @@ -72,10 +70,10 @@ class FeedStreamTestCase(TestCase): with self.assertRaises(StreamNotFoundException): stream.read() - self.mocked_get.assert_called_once_with(rule.url) + self.mocked_fetch.assert_called_once_with(rule.url) def test_stream_raises_time_out_exception(self): - self.mocked_get.return_value = MagicMock(status_code=408) + self.mocked_fetch.side_effect = StreamTimeOutException rule = CollectionRuleFactory() stream = FeedStream(rule) @@ -83,10 +81,10 @@ class FeedStreamTestCase(TestCase): with self.assertRaises(StreamTimeOutException): stream.read() - self.mocked_get.assert_called_once_with(rule.url) + self.mocked_fetch.assert_called_once_with(rule.url) def test_stream_raises_forbidden_exception(self): - self.mocked_get.return_value = MagicMock(status_code=403) + self.mocked_fetch.side_effect = StreamForbiddenException rule = CollectionRuleFactory() stream = FeedStream(rule) @@ -94,13 +92,12 @@ class FeedStreamTestCase(TestCase): with self.assertRaises(StreamForbiddenException): stream.read() - self.mocked_get.assert_called_once_with(rule.url) + self.mocked_fetch.assert_called_once_with(rule.url) @patch("newsreader.news.collection.feed.parse") def test_stream_raises_parse_exception(self, mocked_parse): - self.mocked_get.return_value = MagicMock(status_code=200) + self.mocked_fetch.return_value = MagicMock() mocked_parse.side_effect = TypeError - self.patched_parse.stop() rule = CollectionRuleFactory() stream = FeedStream(rule) diff --git a/src/newsreader/news/collection/tests/mocks.py b/src/newsreader/news/collection/tests/mocks.py new file mode 100644 index 0000000..32ad699 --- /dev/null +++ b/src/newsreader/news/collection/tests/mocks.py @@ -0,0 +1,47 @@ +simple_mock = """ + + +
+

Clickbait

+
+ + +""" + +simple_feed_mock = { + "bozo": 0, + "encoding": "utf-8", + "feed": { + "image": { + "href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif", + "link": "https://www.bbc.co.uk/news/", + "title": "BBC News - Home", + "language": "en-gb", + "link": "https://www.bbc.co.uk/news/", + }, + "link": "https://www.bbc.co.uk/news/", + "links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}], + "title": "BBC News - Home", + }, + "href": "http://feeds.bbci.co.uk/news/rss.xml", + "status": 200, + "version": "rss20", +} + +feed_mock_without_link = { + "bozo": 0, + "encoding": "utf-8", + "feed": { + "image": { + "href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif", + "link": "https://www.bbc.co.uk/news/", + "title": "BBC News - Home", + "language": "en-gb", + "link": "https://www.bbc.co.uk/news/", + }, + "title": "BBC News - Home", + }, + "href": "http://feeds.bbci.co.uk/news/rss.xml", + "status": 200, + "version": "rss20", +} diff --git a/src/newsreader/news/collection/tests/tests.py b/src/newsreader/news/collection/tests/tests.py new file mode 100644 index 0000000..08bc4e0 --- /dev/null +++ b/src/newsreader/news/collection/tests/tests.py @@ -0,0 +1,134 @@ +from unittest.mock import MagicMock, patch + +from bs4 import BeautifulSoup + +from .mocks import feed_mock_without_link, simple_feed_mock, simple_mock + +from django.test import TestCase + +from newsreader.news.collection.base import URLBuilder, WebsiteStream +from newsreader.news.collection.exceptions import ( + StreamDeniedException, + StreamException, + StreamForbiddenException, + StreamNotFoundException, + StreamParseException, + StreamTimeOutException, +) +from newsreader.news.collection.tests.factories import CollectionRuleFactory + + +class WebsiteStreamTestCase(TestCase): + def setUp(self): + self.patched_fetch = patch("newsreader.news.collection.base.fetch") + self.mocked_fetch = self.patched_fetch.start() + + def tearDown(self): + patch.stopall() + + def test_simple(self): + self.mocked_fetch.return_value = MagicMock(content=simple_mock) + + rule = CollectionRuleFactory() + stream = WebsiteStream(rule.url) + return_value = stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + self.assertEquals(return_value, (BeautifulSoup(simple_mock, "lxml"), stream)) + + def test_raises_exception(self): + self.mocked_fetch.side_effect = StreamException + + rule = CollectionRuleFactory() + stream = WebsiteStream(rule.url) + + with self.assertRaises(StreamException): + stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + + def test_raises_denied_exception(self): + self.mocked_fetch.side_effect = StreamDeniedException + + rule = CollectionRuleFactory() + stream = WebsiteStream(rule.url) + + with self.assertRaises(StreamDeniedException): + stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + + def test_raises_stream_not_found_exception(self): + self.mocked_fetch.side_effect = StreamNotFoundException + + rule = CollectionRuleFactory() + stream = WebsiteStream(rule.url) + + with self.assertRaises(StreamNotFoundException): + stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + + def test_stream_raises_time_out_exception(self): + self.mocked_fetch.side_effect = StreamTimeOutException + + rule = CollectionRuleFactory() + stream = WebsiteStream(rule.url) + + with self.assertRaises(StreamTimeOutException): + stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + + def test_stream_raises_forbidden_exception(self): + self.mocked_fetch.side_effect = StreamForbiddenException + + rule = CollectionRuleFactory() + stream = WebsiteStream(rule.url) + + with self.assertRaises(StreamForbiddenException): + stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + + @patch("newsreader.news.collection.base.WebsiteStream.parse") + def test_stream_raises_parse_exception(self, mocked_parse): + self.mocked_fetch.return_value = MagicMock() + mocked_parse.side_effect = StreamParseException + + rule = CollectionRuleFactory() + stream = WebsiteStream(rule.url) + + with self.assertRaises(StreamParseException): + stream.read() + + self.mocked_fetch.assert_called_once_with(rule.url) + + +class URLBuilderTestCase(TestCase): + def test_simple(self): + initial_rule = CollectionRuleFactory() + + with URLBuilder((simple_feed_mock, MagicMock(rule=initial_rule))) as builder: + rule, url = builder.build() + + self.assertEquals(rule.pk, initial_rule.pk) + self.assertEquals(url, "https://www.bbc.co.uk/news/") + + def test_no_link(self): + initial_rule = CollectionRuleFactory() + + with URLBuilder((feed_mock_without_link, MagicMock(rule=initial_rule))) as builder: + rule, url = builder.build() + + self.assertEquals(rule.pk, initial_rule.pk) + self.assertEquals(url, None) + + def test_no_data(self): + initial_rule = CollectionRuleFactory() + + with URLBuilder((None, MagicMock(rule=initial_rule))) as builder: + rule, url = builder.build() + + self.assertEquals(rule.pk, initial_rule.pk) + self.assertEquals(url, None) diff --git a/src/newsreader/news/collection/tests/utils/__init__.py b/src/newsreader/news/collection/tests/utils/__init__.py new file mode 100644 index 0000000..8baa6e5 --- /dev/null +++ b/src/newsreader/news/collection/tests/utils/__init__.py @@ -0,0 +1 @@ +from .tests import * diff --git a/src/newsreader/news/collection/tests/utils/tests.py b/src/newsreader/news/collection/tests/utils/tests.py new file mode 100644 index 0000000..3c95df0 --- /dev/null +++ b/src/newsreader/news/collection/tests/utils/tests.py @@ -0,0 +1,57 @@ +from unittest.mock import MagicMock, patch + +from django.test import TestCase + +from newsreader.news.collection.exceptions import ( + StreamDeniedException, + StreamForbiddenException, + StreamNotFoundException, + StreamTimeOutException, +) +from newsreader.news.collection.utils import fetch + + +class FetchTestCase(TestCase): + def setUp(self): + self.patched_get = patch("newsreader.news.collection.utils.requests.get") + self.mocked_get = self.patched_get.start() + + def test_simple(self): + self.mocked_get.return_value = MagicMock(status_code=200, content="content") + + url = "https://www.bbc.co.uk/news" + response = fetch(url) + + self.assertEquals(response.content, "content") + + def test_raises_not_found(self): + self.mocked_get.return_value = MagicMock(status_code=404) + + url = "https://www.bbc.co.uk/news" + + with self.assertRaises(StreamNotFoundException): + fetch(url) + + def test_raises_denied(self): + self.mocked_get.return_value = MagicMock(status_code=401) + + url = "https://www.bbc.co.uk/news" + + with self.assertRaises(StreamDeniedException): + fetch(url) + + def test_raises_forbidden(self): + self.mocked_get.return_value = MagicMock(status_code=403) + + url = "https://www.bbc.co.uk/news" + + with self.assertRaises(StreamForbiddenException): + fetch(url) + + def test_raises_timed_out(self): + self.mocked_get.return_value = MagicMock(status_code=408) + + url = "https://www.bbc.co.uk/news" + + with self.assertRaises(StreamTimeOutException): + fetch(url) diff --git a/src/newsreader/news/collection/utils.py b/src/newsreader/news/collection/utils.py index 172fb54..17f379a 100644 --- a/src/newsreader/news/collection/utils.py +++ b/src/newsreader/news/collection/utils.py @@ -1,9 +1,15 @@ from datetime import datetime, tzinfo from time import mktime, struct_time -from typing import Tuple +from typing import Optional, Tuple + +import requests + +from requests.models import Response from django.utils import timezone +from newsreader.news.collection.response_handler import ResponseHandler + def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple: try: @@ -12,3 +18,12 @@ def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple: except TypeError: return False, None return True, published_parsed + + +def fetch(url: str) -> Optional[Response]: + response = requests.get(url) + + with ResponseHandler(response) as response_handler: + response_handler.handle_response() + + return response