0.2.3 #99

Merged
sonny merged 112 commits from development into master 2020-05-23 16:58:42 +02:00
33 changed files with 1238 additions and 228 deletions
Showing only changes of commit 48a9b25545

View file

@@ -4,20 +4,9 @@ from newsreader.news.collection.models import CollectionRule
 class CollectionRuleAdmin(admin.ModelAdmin):
-    fields = (
-        "url",
-        "name",
-        "timezone",
-        "category",
-    )
+    fields = ("url", "name", "timezone", "category", "favicon")
-    list_display = (
-        "name",
-        "category",
-        "url",
-        "last_suceeded",
-        "succeeded",
-    )
+    list_display = ("name", "category", "url", "last_suceeded", "succeeded")
 admin.site.register(CollectionRule, CollectionRuleAdmin)

View file

@@ -2,9 +2,13 @@ from typing import ContextManager, Dict, List, Optional, Tuple
 import requests
+from bs4 import BeautifulSoup
 from django.utils import timezone
+from newsreader.news.collection.exceptions import StreamParseException
 from newsreader.news.collection.models import CollectionRule
+from newsreader.news.collection.utils import fetch
 class Stream:
@@ -12,9 +16,7 @@ class Stream:
         self.rule = rule
     def read(self) -> Tuple:
-        url = self.rule.url
-        response = requests.get(url)
-        return (self.parse(response.content), self)
+        raise NotImplementedError
     def parse(self, payload: bytes) -> Dict:
         raise NotImplementedError
@@ -45,7 +47,7 @@ class Client:
 class Builder:
     instances = []
-    def __init__(self, stream: Stream) -> None:
+    def __init__(self, stream: Tuple) -> None:
         self.stream = stream
     def __enter__(self) -> ContextManager:
@@ -81,3 +83,39 @@ class Collector:
     class Meta:
         abstract = True
+class WebsiteStream(Stream):
+    def __init__(self, url: str) -> None:
+        self.url = url
+    def read(self) -> Tuple:
+        response = fetch(self.url)
+        return (self.parse(response.content), self)
+    def parse(self, payload: bytes) -> BeautifulSoup:
+        try:
+            return BeautifulSoup(payload, "lxml")
+        except TypeError:
+            raise StreamParseException("Could not parse given HTML")
+class URLBuilder(Builder):
+    def __enter__(self) -> ContextManager:
+        return self
+    def build(self) -> Tuple:
+        data, stream = self.stream
+        rule = stream.rule
+        try:
+            url = data["feed"]["link"]
+        except (KeyError, TypeError):
+            url = None
+        if url:
+            rule.website_url = url
+            rule.save()
+        return rule, url

View file

@@ -0,0 +1,113 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import ContextManager, List, Optional
from urllib.parse import urljoin, urlparse

from newsreader.news.collection.base import (
    Builder,
    Client,
    Collector,
    Stream,
    URLBuilder,
    WebsiteStream,
)
from newsreader.news.collection.exceptions import StreamException
from newsreader.news.collection.feed import FeedClient

LINK_RELS = ["icon", "shortcut icon", "apple-touch-icon", "apple-touch-icon-precomposed"]


class FaviconBuilder(Builder):
    def build(self) -> None:
        rule, soup = self.stream
        url = self.parse(soup, rule.website_url)
        if url:
            rule.favicon = url
            rule.save()

    def parse(self, soup, website_url) -> Optional[str]:
        if not soup.head:
            return
        links = soup.head.find_all("link")
        url = self.parse_links(links)
        if not url:
            return
        parsed_url = urlparse(url)
        if not parsed_url.scheme and not parsed_url.netloc:
            if not website_url:
                return
            return urljoin(website_url, url)
        elif not parsed_url.scheme:
            return urljoin(f"https://{parsed_url.netloc}", parsed_url.path)
        return url

    def parse_links(self, links: List) -> Optional[str]:
        favicons = set()
        icons = set()
        for link in links:
            if not "href" in link.attrs:
                continue
            if "favicon" in link["href"]:
                favicons.add(link["href"].lower())
            if "rel" in link.attrs:
                for rel in link["rel"]:
                    if rel in LINK_RELS:
                        icons.add(link["href"].lower())
        if favicons:
            return favicons.pop()
        elif icons:
            return icons.pop()


class FaviconClient(Client):
    stream = WebsiteStream

    def __init__(self, streams: List) -> None:
        self.streams = streams

    def __enter__(self) -> ContextManager:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {executor.submit(stream.read): rule for rule, stream in self.streams}
            for future in as_completed(futures):
                rule = futures[future]
                try:
                    response_data, stream = future.result()
                except StreamException:
                    continue
                yield (rule, response_data)


class FaviconCollector(Collector):
    feed_client, favicon_client = (FeedClient, FaviconClient)
    url_builder, favicon_builder = (URLBuilder, FaviconBuilder)

    def collect(self, rules: Optional[List] = None) -> None:
        streams = []
        with self.feed_client(rules=rules) as client:
            for data, stream in client:
                with self.url_builder((data, stream)) as builder:
                    rule, url = builder.build()
                if not url:
                    continue
                streams.append((rule, WebsiteStream(url)))
        with self.favicon_client(streams) as client:
            for rule, data in client:
                with self.favicon_builder((rule, data)) as builder:
                    builder.build()

View file

@@ -3,7 +3,6 @@ from typing import ContextManager, Dict, Generator, List, Optional, Tuple
 import bleach
 import pytz
-import requests
 from feedparser import parse
@@ -19,7 +18,7 @@ from newsreader.news.collection.exceptions import (
 )
 from newsreader.news.collection.models import CollectionRule
 from newsreader.news.collection.response_handler import ResponseHandler
-from newsreader.news.collection.utils import build_publication_date
+from newsreader.news.collection.utils import build_publication_date, fetch
 from newsreader.news.posts.models import Post
@@ -92,10 +91,7 @@ class FeedBuilder(Builder):
 class FeedStream(Stream):
     def read(self) -> Tuple:
         url = self.rule.url
-        response = requests.get(url)
-        with ResponseHandler(response) as response_handler:
-            response_handler.handle_response()
+        response = fetch(url)
         return (self.parse(response.content), self)

View file

@@ -0,0 +1,11 @@
from django.core.management.base import BaseCommand

from newsreader.news.collection.favicon import FaviconCollector


class Command(BaseCommand):
    help = "Fetch favicons for collection rules"

    def handle(self, *args, **options):
        collector = FaviconCollector()
        collector.collect()

View file

@@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2019-06-23 18:37
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [("collection", "0006_collectionrule_error")]

    operations = [
        migrations.AlterField(
            model_name="collectionrule",
            name="favicon",
            field=models.ImageField(default="favicons/default-favicon.ico", upload_to="favicons/"),
        )
    ]

View file

@@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2019-06-23 18:47
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [("collection", "0007_auto_20190623_1837")]

    operations = [
        migrations.AlterField(
            model_name="collectionrule",
            name="favicon",
            field=models.URLField(blank=True, null=True),
        )
    ]

View file

@@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2019-06-27 21:27
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [("collection", "0008_auto_20190623_1847")]

    operations = [
        migrations.AddField(
            model_name="collectionrule",
            name="website_url",
            field=models.URLField(blank=True, editable=False, null=True),
        )
    ]

View file

@@ -0,0 +1,19 @@
# Generated by Django 2.2 on 2019-06-28 21:42
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [("collection", "0009_collectionrule_website_url")]

    operations = [
        migrations.AlterField(
            model_name="collectionrule", name="url", field=models.URLField(max_length=1024)
        ),
        migrations.AlterField(
            model_name="collectionrule",
            name="website_url",
            field=models.URLField(blank=True, editable=False, max_length=1024, null=True),
        ),
    ]

View file

@@ -1,5 +1,6 @@
 import pytz
+from django.conf import settings
 from django.db import models
 from django.utils.translation import gettext as _
@@ -8,8 +9,9 @@ class CollectionRule(models.Model):
     name = models.CharField(max_length=100)
     source = models.CharField(max_length=100)
-    url = models.URLField()
-    favicon = models.ImageField(blank=True, null=True)
+    url = models.URLField(max_length=1024)
+    website_url = models.URLField(max_length=1024, editable=False, blank=True, null=True)
+    favicon = models.URLField(blank=True, null=True)
     timezone = models.CharField(
         choices=((timezone, timezone) for timezone in pytz.all_timezones),
@@ -23,7 +25,7 @@ class CollectionRule(models.Model):
         null=True,
         verbose_name=_("Category"),
         help_text=_("Posts from this rule will be tagged with this category"),
-        on_delete=models.SET_NULL
+        on_delete=models.SET_NULL,
     )
     last_suceeded = models.DateTimeField(blank=True, null=True)

View file

@@ -1 +1,4 @@
+from .favicon import *
 from .feed import *
+from .tests import *
+from .utils import *

View file

@@ -10,3 +10,4 @@ class CollectionRuleFactory(factory.django.DjangoModelFactory):
     name = factory.Sequence(lambda n: "CollectionRule-{}".format(n))
     source = factory.Faker("name")
     url = factory.Faker("url")
+    website_url = factory.Faker("url")

View file

@ -0,0 +1,3 @@
from .builder import *
from .client import *
from .collector import *

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,88 @@
from bs4 import BeautifulSoup
simple_mock = BeautifulSoup(
"""
<html>
<head>
<link rel='shortcut icon' href='https://www.bbc.com/favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_without_url = BeautifulSoup(
"""
<html>
<head>
<link rel='shortcut icon' href='favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_without_header = BeautifulSoup(
"""
<html>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_with_weird_path = BeautifulSoup(
"""
<html>
<head>
<link rel='shortcut icon' href='//www.theguardian.com/jabadaba/doe/favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_with_other_url = BeautifulSoup(
"""
<html>
<head>
<link rel='stylesheet' href='https://www.theguardian.com/main.css' />
<link rel='icon' href='https://www.theguardian.com/icon.png' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_with_multiple_icons = BeautifulSoup(
"""
<html>
<head>
<link rel='stylesheet' href='https://www.theguardian.com/main.css' />
<link rel='icon' href='https://www.theguardian.com/icon.png' />
<link rel='shortcut icon' href='https://www.theguardian.com/icon.png' />
<link rel='apple-touch-icon' href='https://www.theguardian.com/icon.png' />
<link rel='apple-touch-icon-precomposed' href='https://www.theguardian.com/icon.png' />
<link rel='shortcut icon' href='https://www.bbc.com/favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)

View file

@@ -0,0 +1,60 @@
from freezegun import freeze_time

from django.test import TestCase

from newsreader.news.collection.favicon import FaviconBuilder
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.favicon.builder.mocks import *


class FaviconBuilderTestCase(TestCase):
    def setUp(self):
        self.maxDiff = None

    def test_simple(self):
        rule = CollectionRuleFactory(favicon=None)
        with FaviconBuilder((rule, simple_mock)) as builder:
            builder.build()
        self.assertEquals(rule.favicon, "https://www.bbc.com/favicon.ico")

    def test_without_url(self):
        rule = CollectionRuleFactory(website_url="https://www.theguardian.com/", favicon=None)
        with FaviconBuilder((rule, mock_without_url)) as builder:
            builder.build()
        self.assertEquals(rule.favicon, "https://www.theguardian.com/favicon.ico")

    def test_without_header(self):
        rule = CollectionRuleFactory(favicon=None)
        with FaviconBuilder((rule, mock_without_header)) as builder:
            builder.build()
        self.assertEquals(rule.favicon, None)

    def test_weird_path(self):
        rule = CollectionRuleFactory(favicon=None)
        with FaviconBuilder((rule, mock_with_weird_path)) as builder:
            builder.build()
        self.assertEquals(rule.favicon, "https://www.theguardian.com/jabadaba/doe/favicon.ico")

    def test_other_url(self):
        rule = CollectionRuleFactory(favicon=None)
        with FaviconBuilder((rule, mock_with_other_url)) as builder:
            builder.build()
        self.assertEquals(rule.favicon, "https://www.theguardian.com/icon.png")

    def test_url_with_favicon_takes_precedence(self):
        rule = CollectionRuleFactory(favicon=None)
        with FaviconBuilder((rule, mock_with_multiple_icons)) as builder:
            builder.build()
        self.assertEquals(rule.favicon, "https://www.bbc.com/favicon.ico")

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,12 @@
from bs4 import BeautifulSoup
simple_mock = BeautifulSoup(
"""
<html>
<body>
<article />
</body>
</htm>
""",
"lxml",
)

View file

@ -0,0 +1,91 @@
from unittest.mock import MagicMock
from django.test import TestCase
from newsreader.news.collection.base import WebsiteStream
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamNotFoundException,
StreamTimeOutException,
)
from newsreader.news.collection.favicon import FaviconClient
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.favicon.client.mocks import simple_mock
class FaviconClientTestCase(TestCase):
def setUp(self):
self.maxDiff = None
def test_simple(self):
rule = CollectionRuleFactory()
stream = MagicMock(url="https://www.bbc.com")
stream.read.return_value = (simple_mock, stream)
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
self.assertEquals(rule.pk, rule.pk)
self.assertEquals(data, simple_mock)
stream.read.assert_called_once_with()
def test_client_catches_stream_exception(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
def test_client_catches_stream_not_found_exception(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamNotFoundException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
def test_client_catches_stream_denied_exception(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamDeniedException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
def test_client_catches_stream_timed_out(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamTimeOutException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,159 @@
from time import struct_time
from bs4 import BeautifulSoup
feed_mock = {
"bozo": 0,
"encoding": "utf-8",
"entries": [
{
"guidislink": False,
"href": "",
"id": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"link": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"links": [
{
"href": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"rel": "alternate",
"type": "text/html",
}
],
"media_thumbnail": [
{
"height": "1152",
"url": "http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg",
"width": "2048",
}
],
"published": "Mon, 20 May 2019 16:07:37 GMT",
"published_parsed": struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)),
"summary": "Foreign Minister Mohammad Javad Zarif says the US "
"president should try showing Iranians some respect.",
"summary_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/html",
"value": "Foreign Minister Mohammad Javad "
"Zarif says the US president should "
"try showing Iranians some "
"respect.",
},
"title": "Trump's genocidal taunts will not end Iran - Zarif",
"title_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/plain",
"value": "Trump's genocidal taunts will not " "end Iran - Zarif",
},
},
{
"guidislink": False,
"href": "",
"id": "https://www.bbc.co.uk/news/technology-48334739",
"link": "https://www.bbc.co.uk/news/technology-48334739",
"links": [
{
"href": "https://www.bbc.co.uk/news/technology-48334739",
"rel": "alternate",
"type": "text/html",
}
],
"media_thumbnail": [
{
"height": "432",
"url": "http://c.files.bbci.co.uk/4789/production/_107031381_mediaitem107028670.jpg",
"width": "768",
}
],
"published": "Mon, 20 May 2019 12:19:19 GMT",
"published_parsed": struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0)),
"summary": "Google's move to end business ties with Huawei will "
"affect current devices and future purchases.",
"summary_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/html",
"value": "Google's move to end business ties "
"with Huawei will affect current "
"devices and future purchases.",
},
"title": "Huawei's Android loss: How it affects you",
"title_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/plain",
"value": "Huawei's Android loss: How it " "affects you",
},
},
{
"guidislink": False,
"href": "",
"id": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
"link": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
"links": [
{
"href": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
"rel": "alternate",
"type": "text/html",
}
],
"media_thumbnail": [
{
"height": "549",
"url": "http://c.files.bbci.co.uk/11D67/production/_107036037_lgbtheadjpg.jpg",
"width": "976",
}
],
"published": "Mon, 20 May 2019 16:32:38 GMT",
"published_parsed": struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)),
"summary": "Police are investigating the messages while an MP "
"calls for a protest exclusion zone to protect "
"children.",
"summary_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/html",
"value": "Police are investigating the "
"messages while an MP calls for a "
"protest exclusion zone to protect "
"children.",
},
"title": "Birmingham head teacher threatened over LGBT lessons",
"title_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/plain",
"value": "Birmingham head teacher threatened " "over LGBT lessons",
},
},
],
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"link": "https://www.bbc.co.uk/news/",
"links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}],
"title": "BBC News - Home",
},
"href": "http://feeds.bbci.co.uk/news/rss.xml",
"status": 200,
"version": "rss20",
}
website_mock = BeautifulSoup(
"""
<html>
<head>
<link rel="shortcut icon" href="https://www.bbc.co.uk/news/favicon.ico" />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)

View file

@ -0,0 +1,147 @@
from unittest.mock import MagicMock, patch
import pytz
from bs4 import BeautifulSoup
from .mocks import feed_mock, website_mock
from django.test import TestCase
from django.utils import timezone
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamForbiddenException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.favicon import FaviconCollector
from newsreader.news.collection.tests.factories import CollectionRuleFactory
class FaviconCollectorTestCase(TestCase):
def setUp(self):
self.maxDiff = None
self.patched_feed_client = patch("newsreader.news.collection.favicon.FeedClient.__enter__")
self.mocked_feed_client = self.patched_feed_client.start()
self.patched_website_read = patch("newsreader.news.collection.favicon.WebsiteStream.read")
self.mocked_website_read = self.patched_website_read.start()
def tearDown(self):
patch.stopall()
def test_simple(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.return_value = (website_mock, MagicMock())
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, "https://www.bbc.co.uk/news/favicon.ico")
def test_empty_stream(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.return_value = (BeautifulSoup("", "lxml"), MagicMock())
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_not_found(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamNotFoundException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_denied(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamDeniedException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_forbidden(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamForbiddenException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_timed_out(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamTimeOutException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_wrong_stream_content_type(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamParseException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)

View file

@ -5,26 +5,27 @@ import pytz
from freezegun import freeze_time from freezegun import freeze_time
from .mocks import *
from django.test import TestCase from django.test import TestCase
from django.utils import timezone from django.utils import timezone
from newsreader.news.collection.feed import FeedBuilder from newsreader.news.collection.feed import FeedBuilder
from newsreader.news.collection.tests.factories import CollectionRuleFactory from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.feed.builder.mocks import *
from newsreader.news.posts.models import Post from newsreader.news.posts.models import Post
from newsreader.news.posts.tests.factories import PostFactory from newsreader.news.posts.tests.factories import PostFactory
class FeedBuilderTestCase(TestCase): class FeedBuilderTestCase(TestCase):
def setUp(self): def setUp(self):
pass self.maxDiff = None
def test_basic_entry(self): def test_basic_entry(self):
builder = FeedBuilder builder = FeedBuilder
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((simple_mock, mock_stream,)) as builder: with builder((simple_mock, mock_stream)) as builder:
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()
@ -36,26 +37,19 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(Post.objects.count(), 1) self.assertEquals(Post.objects.count(), 1)
self.assertEquals( self.assertEquals(
post.remote_identifier, post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
"https://www.bbc.co.uk/news/world-us-canada-48338168"
) )
self.assertEquals( self.assertEquals(post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168")
post.url,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals( self.assertEquals(post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
def test_multiple_entries(self): def test_multiple_entries(self):
builder = FeedBuilder builder = FeedBuilder
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((multiple_mock, mock_stream,)) as builder: with builder((multiple_mock, mock_stream)) as builder:
builder.save() builder.save()
posts = Post.objects.order_by("id") posts = Post.objects.order_by("id")
@ -70,19 +64,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.publication_date, aware_date) self.assertEquals(first_post.publication_date, aware_date)
self.assertEquals( self.assertEquals(
first_post.remote_identifier, first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
"https://www.bbc.co.uk/news/world-us-canada-48338168"
) )
self.assertEquals( self.assertEquals(first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168")
first_post.url,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals( self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19)) d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19))
aware_date = pytz.utc.localize(d) aware_date = pytz.utc.localize(d)
@ -90,26 +77,19 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(second_post.publication_date, aware_date) self.assertEquals(second_post.publication_date, aware_date)
self.assertEquals( self.assertEquals(
second_post.remote_identifier, second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
"https://www.bbc.co.uk/news/technology-48334739"
) )
self.assertEquals( self.assertEquals(second_post.url, "https://www.bbc.co.uk/news/technology-48334739")
second_post.url,
"https://www.bbc.co.uk/news/technology-48334739"
)
self.assertEquals( self.assertEquals(second_post.title, "Huawei's Android loss: How it affects you")
second_post.title,
"Huawei's Android loss: How it affects you"
)
def test_entry_without_remote_identifier(self): def test_entry_without_remote_identifier(self):
builder = FeedBuilder builder = FeedBuilder
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((mock_without_identifier, mock_stream,)) as builder: with builder((mock_without_identifier, mock_stream)) as builder:
builder.save() builder.save()
posts = Post.objects.order_by("id") posts = Post.objects.order_by("id")
@ -124,15 +104,9 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.remote_identifier, None) self.assertEquals(first_post.remote_identifier, None)
self.assertEquals( self.assertEquals(first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168")
first_post.url,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals( self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
@freeze_time("2019-10-30 12:30:00") @freeze_time("2019-10-30 12:30:00")
def test_entry_without_publication_date(self): def test_entry_without_publication_date(self):
@ -140,7 +114,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((mock_without_publish_date, mock_stream,)) as builder: with builder((mock_without_publish_date, mock_stream)) as builder:
builder.save() builder.save()
posts = Post.objects.order_by("id") posts = Post.objects.order_by("id")
@ -151,14 +125,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now()) self.assertEquals(first_post.created, timezone.now())
self.assertEquals( self.assertEquals(
first_post.remote_identifier, first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
'https://www.bbc.co.uk/news/world-us-canada-48338168'
) )
self.assertEquals(second_post.created, timezone.now()) self.assertEquals(second_post.created, timezone.now())
self.assertEquals( self.assertEquals(
second_post.remote_identifier, second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
'https://www.bbc.co.uk/news/technology-48334739'
) )
@freeze_time("2019-10-30 12:30:00") @freeze_time("2019-10-30 12:30:00")
@ -167,7 +139,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((mock_without_url, mock_stream,)) as builder: with builder((mock_without_url, mock_stream)) as builder:
builder.save() builder.save()
posts = Post.objects.order_by("id") posts = Post.objects.order_by("id")
@ -178,14 +150,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now()) self.assertEquals(first_post.created, timezone.now())
self.assertEquals( self.assertEquals(
first_post.remote_identifier, first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
'https://www.bbc.co.uk/news/world-us-canada-48338168'
) )
self.assertEquals(second_post.created, timezone.now()) self.assertEquals(second_post.created, timezone.now())
self.assertEquals( self.assertEquals(
second_post.remote_identifier, second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
'https://www.bbc.co.uk/news/technology-48334739'
) )
@freeze_time("2019-10-30 12:30:00") @freeze_time("2019-10-30 12:30:00")
@ -194,7 +164,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((mock_without_body, mock_stream,)) as builder: with builder((mock_without_body, mock_stream)) as builder:
builder.save() builder.save()
posts = Post.objects.order_by("id") posts = Post.objects.order_by("id")
@ -205,14 +175,13 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now()) self.assertEquals(first_post.created, timezone.now())
self.assertEquals( self.assertEquals(
first_post.remote_identifier, first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
'https://www.bbc.co.uk/news/world-us-canada-48338168'
) )
self.assertEquals(second_post.created, timezone.now()) self.assertEquals(second_post.created, timezone.now())
self.assertEquals( self.assertEquals(
second_post.remote_identifier, second_post.remote_identifier,
'https://www.bbc.co.uk/news/uk-england-birmingham-48339080' "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
) )
@freeze_time("2019-10-30 12:30:00") @freeze_time("2019-10-30 12:30:00")
@ -221,7 +190,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((mock_without_author, mock_stream,)) as builder: with builder((mock_without_author, mock_stream)) as builder:
builder.save() builder.save()
posts = Post.objects.order_by("id") posts = Post.objects.order_by("id")
@ -232,14 +201,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now()) self.assertEquals(first_post.created, timezone.now())
self.assertEquals( self.assertEquals(
first_post.remote_identifier, first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
'https://www.bbc.co.uk/news/world-us-canada-48338168'
) )
self.assertEquals(second_post.created, timezone.now()) self.assertEquals(second_post.created, timezone.now())
self.assertEquals( self.assertEquals(
second_post.remote_identifier, second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
'https://www.bbc.co.uk/news/technology-48334739'
) )
def test_empty_entries(self): def test_empty_entries(self):
@ -247,7 +214,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((mock_without_entries, mock_stream,)) as builder: with builder((mock_without_entries, mock_stream)) as builder:
builder.save() builder.save()
self.assertEquals(Post.objects.count(), 0) self.assertEquals(Post.objects.count(), 0)
@ -265,7 +232,7 @@ class FeedBuilderTestCase(TestCase):
remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule
) )
with builder((mock_with_update_entries, mock_stream,)) as builder: with builder((mock_with_update_entries, mock_stream)) as builder:
builder.save() builder.save()
self.assertEquals(Post.objects.count(), 3) self.assertEquals(Post.objects.count(), 3)
@ -274,21 +241,17 @@ class FeedBuilderTestCase(TestCase):
existing_second_post.refresh_from_db() existing_second_post.refresh_from_db()
self.assertEquals( self.assertEquals(
existing_first_post.title, existing_first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
"Trump's 'genocidal taunts' will not end Iran - Zarif"
) )
self.assertEquals( self.assertEquals(existing_second_post.title, "Huawei's Android loss: How it affects you")
existing_second_post.title,
"Huawei's Android loss: How it affects you"
)
def test_html_sanitizing(self): def test_html_sanitizing(self):
builder = FeedBuilder builder = FeedBuilder
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule) mock_stream = MagicMock(rule=rule)
with builder((mock_with_html, mock_stream,)) as builder: with builder((mock_with_html, mock_stream)) as builder:
builder.save() builder.save()
post = Post.objects.get() post = Post.objects.get()

View file

@@ -1,5 +1,7 @@
 from unittest.mock import MagicMock, patch
+from .mocks import simple_mock
 from django.test import TestCase
 from django.utils import timezone
@@ -7,15 +9,17 @@ from newsreader.news.collection.exceptions import (
     StreamDeniedException,
     StreamException,
     StreamNotFoundException,
+    StreamParseException,
     StreamTimeOutException,
 )
 from newsreader.news.collection.feed import FeedClient
 from newsreader.news.collection.tests.factories import CollectionRuleFactory
-from newsreader.news.collection.tests.feed.client.mocks import simple_mock
 class FeedClientTestCase(TestCase):
     def setUp(self):
+        self.maxDiff = None
         self.patched_read = patch("newsreader.news.collection.feed.FeedStream.read")
         self.mocked_read = self.patched_read.start()
@@ -85,3 +89,16 @@ class FeedClientTestCase(TestCase):
         self.assertEquals(stream.rule.succeeded, False)
         self.mocked_read.assert_called_once_with()
+    def test_client_catches_stream_parse_exception(self):
+        rule = CollectionRuleFactory.create()
+        mock_stream = MagicMock(rule=rule)
+        self.mocked_read.side_effect = StreamParseException("Stream has wrong contents")
+        with FeedClient([rule]) as client:
+            for data, stream in client:
+                self.assertEquals(data, {"entries": []})
+                self.assertEquals(stream.rule.error, "Stream has wrong contents")
+                self.assertEquals(stream.rule.succeeded, False)
+        self.mocked_read.assert_called_once_with()

View file

@ -6,17 +6,26 @@ import pytz
from freezegun import freeze_time from freezegun import freeze_time
from django.test import TestCase from .mocks import (
from django.utils import timezone
from newsreader.news.collection.feed import FeedCollector
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.feed.collector.mocks import (
duplicate_mock, duplicate_mock,
empty_mock, empty_mock,
multiple_mock, multiple_mock,
multiple_update_mock, multiple_update_mock,
) )
from django.test import TestCase
from django.utils import timezone
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamForbiddenException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.feed import FeedCollector
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.utils import build_publication_date from newsreader.news.collection.utils import build_publication_date
from newsreader.news.posts.models import Post from newsreader.news.posts.models import Post
from newsreader.news.posts.tests.factories import PostFactory from newsreader.news.posts.tests.factories import PostFactory
@ -24,14 +33,12 @@ from newsreader.news.posts.tests.factories import PostFactory
class FeedCollectorTestCase(TestCase): class FeedCollectorTestCase(TestCase):
def setUp(self): def setUp(self):
self.patched_get = patch( self.maxDiff = None
'newsreader.news.collection.feed.requests.get'
)
self.mocked_get = self.patched_get.start()
self.patched_parse = patch( self.patched_get = patch("newsreader.news.collection.feed.fetch")
'newsreader.news.collection.feed.FeedStream.parse' self.mocked_fetch = self.patched_get.start()
)
self.patched_parse = patch("newsreader.news.collection.feed.FeedStream.parse")
self.mocked_parse = self.patched_parse.start() self.mocked_parse = self.patched_parse.start()
def tearDown(self): def tearDown(self):
@ -54,7 +61,7 @@ class FeedCollectorTestCase(TestCase):
@freeze_time("2019-10-30 12:30:00") @freeze_time("2019-10-30 12:30:00")
def test_emtpy_batch(self): def test_emtpy_batch(self):
self.mocked_get.return_value = MagicMock(status_code=200) self.mocked_fetch.return_value = MagicMock()
self.mocked_parse.return_value = empty_mock self.mocked_parse.return_value = empty_mock
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
@ -69,7 +76,7 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.last_suceeded, timezone.now()) self.assertEquals(rule.last_suceeded, timezone.now())
def test_not_found(self): def test_not_found(self):
self.mocked_get.return_value = MagicMock(status_code=404) self.mocked_fetch.side_effect = StreamNotFoundException
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
collector = FeedCollector() collector = FeedCollector()
@ -82,7 +89,7 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.error, "Stream not found") self.assertEquals(rule.error, "Stream not found")
def test_denied(self): def test_denied(self):
self.mocked_get.return_value = MagicMock(status_code=404) self.mocked_fetch.side_effect = StreamDeniedException
last_suceeded = timezone.make_aware( last_suceeded = timezone.make_aware(
datetime.combine(date=date(2019, 10, 30), time=time(12, 30)) datetime.combine(date=date(2019, 10, 30), time=time(12, 30))
) )
@ -95,11 +102,11 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(Post.objects.count(), 0) self.assertEquals(Post.objects.count(), 0)
self.assertEquals(rule.succeeded, False) self.assertEquals(rule.succeeded, False)
self.assertEquals(rule.error, "Stream not found") self.assertEquals(rule.error, "Stream does not have sufficient permissions")
self.assertEquals(rule.last_suceeded, last_suceeded) self.assertEquals(rule.last_suceeded, last_suceeded)
def test_forbidden(self): def test_forbidden(self):
self.mocked_get.return_value = MagicMock(status_code=403) self.mocked_fetch.side_effect = StreamForbiddenException
last_suceeded = timezone.make_aware( last_suceeded = timezone.make_aware(
datetime.combine(date=date(2019, 10, 30), time=time(12, 30)) datetime.combine(date=date(2019, 10, 30), time=time(12, 30))
) )
@ -116,7 +123,7 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.last_suceeded, last_suceeded) self.assertEquals(rule.last_suceeded, last_suceeded)
def test_timed_out(self): def test_timed_out(self):
self.mocked_get.return_value = MagicMock(status_code=408) self.mocked_fetch.side_effect = StreamTimeOutException
last_suceeded = timezone.make_aware( last_suceeded = timezone.make_aware(
datetime.combine(date=date(2019, 10, 30), time=time(12, 30)) datetime.combine(date=date(2019, 10, 30), time=time(12, 30))
) )
@ -138,8 +145,7 @@ class FeedCollectorTestCase(TestCase):
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
_, aware_datetime = build_publication_date( _, aware_datetime = build_publication_date(
struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), pytz.utc
pytz.utc
) )
first_post = PostFactory( first_post = PostFactory(
@ -148,12 +154,11 @@ class FeedCollectorTestCase(TestCase):
body="Foreign Minister Mohammad Javad Zarif says the US " body="Foreign Minister Mohammad Javad Zarif says the US "
"president should try showing Iranians some respect.", "president should try showing Iranians some respect.",
publication_date=aware_datetime, publication_date=aware_datetime,
rule=rule rule=rule,
) )
_, aware_datetime = build_publication_date( _, aware_datetime = build_publication_date(
struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0,)), struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0)), pytz.utc
pytz.utc
) )
second_post = PostFactory( second_post = PostFactory(
@ -162,22 +167,21 @@ class FeedCollectorTestCase(TestCase):
body="Google's move to end business ties with Huawei will " body="Google's move to end business ties with Huawei will "
"affect current devices and future purchases.", "affect current devices and future purchases.",
publication_date=aware_datetime, publication_date=aware_datetime,
rule=rule rule=rule,
) )
_, aware_datetime = build_publication_date( _, aware_datetime = build_publication_date(
struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)), struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)), pytz.utc
pytz.utc
) )
third_post = PostFactory( third_post = PostFactory(
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080", url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
title="Birmingham head teacher threatened over LGBT lessons", title="Birmingham head teacher threatened over LGBT lessons",
body="Police are investigating the messages while an MP " body="Police are investigating the messages while an MP "
"calls for a protest exclusion zone \"to protect " 'calls for a protest exclusion zone "to protect '
"children\".", 'children".',
publication_date=aware_datetime, publication_date=aware_datetime,
rule=rule rule=rule,
) )
collector = FeedCollector() collector = FeedCollector()
@ -201,7 +205,7 @@ class FeedCollectorTestCase(TestCase):
title="Trump", title="Trump",
body="Foreign Minister Mohammad Javad Zarif", body="Foreign Minister Mohammad Javad Zarif",
publication_date=timezone.now(), publication_date=timezone.now(),
rule=rule rule=rule,
) )
second_post = PostFactory( second_post = PostFactory(
@ -210,7 +214,7 @@ class FeedCollectorTestCase(TestCase):
title="Huawei's Android loss: How it affects you", title="Huawei's Android loss: How it affects you",
body="Google's move to end business ties with Huawei will", body="Google's move to end business ties with Huawei will",
publication_date=timezone.now(), publication_date=timezone.now(),
rule=rule rule=rule,
) )
third_post = PostFactory( third_post = PostFactory(
@ -219,7 +223,7 @@ class FeedCollectorTestCase(TestCase):
title="Birmingham head teacher threatened over LGBT lessons", title="Birmingham head teacher threatened over LGBT lessons",
body="Police are investigating the messages while an MP", body="Police are investigating the messages while an MP",
publication_date=timezone.now(), publication_date=timezone.now(),
rule=rule rule=rule,
) )
collector = FeedCollector() collector = FeedCollector()
@ -235,17 +239,8 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.last_suceeded, timezone.now()) self.assertEquals(rule.last_suceeded, timezone.now())
self.assertEquals(rule.error, None) self.assertEquals(rule.error, None)
self.assertEquals( self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
self.assertEquals( self.assertEquals(second_post.title, "Huawei's Android loss: How it affects you")
second_post.title,
"Huawei's Android loss: How it affects you"
)
self.assertEquals( self.assertEquals(third_post.title, "Birmingham head teacher threatened over LGBT lessons")
third_post.title,
'Birmingham head teacher threatened over LGBT lessons'
)

View file

@@ -9,7 +9,7 @@ from newsreader.news.posts.tests.factories import PostFactory
 class FeedDuplicateHandlerTestCase(TestCase):
     def setUp(self):
-        pass
+        self.maxDiff = None
     def test_duplicate_entries_with_remote_identifiers(self):
         rule = CollectionRuleFactory()
@@ -19,7 +19,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
         new_post = PostFactory.build(
             remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
             title="title got updated",
-            rule=rule
+            rule=rule,
         )
         with FeedDuplicateHandler(rule) as duplicate_handler:
@@ -45,7 +45,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
             remote_identifier=None,
-            rule=rule
+            rule=rule,
         )
         new_post = PostFactory.build(
             url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
@@ -53,7 +53,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
             body="Google's move to end business ties with Huawei will affect current devices",
             publication_date=publication_date,
             remote_identifier=None,
-            rule=rule
+            rule=rule,
         )
         with FeedDuplicateHandler(rule) as duplicate_handler:

View file

@ -1,61 +1,62 @@
from time import struct_time from time import struct_time
simple_mock = { simple_mock = {
'bozo': 0, "bozo": 1,
'encoding': 'utf-8', "encoding": "utf-8",
'entries': [{ "entries": [
'guidislink': False, {
'href': '', "guidislink": False,
'id': 'https://www.bbc.co.uk/news/world-us-canada-48338168', "href": "",
'link': 'https://www.bbc.co.uk/news/world-us-canada-48338168', "id": "https://www.bbc.co.uk/news/world-us-canada-48338168",
'links': [{ "link": "https://www.bbc.co.uk/news/world-us-canada-48338168",
'href': 'https://www.bbc.co.uk/news/world-us-canada-48338168', "links": [
'rel': 'alternate', {
'type': 'text/html' "href": "https://www.bbc.co.uk/news/world-us-canada-48338168",
}], "rel": "alternate",
'media_thumbnail': [{ "type": "text/html",
'height': '1152', }
'url': 'http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg', ],
'width': '2048' "media_thumbnail": [
}], {
'published': 'Mon, 20 May 2019 16:07:37 GMT', "height": "1152",
'published_parsed': struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), "url": "http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg",
'summary': 'Foreign Minister Mohammad Javad Zarif says the US ' "width": "2048",
'president should try showing Iranians some respect.', }
'summary_detail': { ],
'base': 'http://feeds.bbci.co.uk/news/rss.xml', "published": "Mon, 20 May 2019 16:07:37 GMT",
'language': None, "published_parsed": struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)),
'type': 'text/html', "summary": "Foreign Minister Mohammad Javad Zarif says the US "
'value': 'Foreign Minister Mohammad Javad ' "president should try showing Iranians some respect.",
'Zarif says the US president should ' "summary_detail": {
'try showing Iranians some ' "base": "http://feeds.bbci.co.uk/news/rss.xml",
'respect.' "language": None,
}, "type": "text/html",
'title': "Trump's 'genocidal taunts' will not end Iran - Zarif", "value": "Foreign Minister Mohammad Javad "
'title_detail': { "Zarif says the US president should "
'base': 'http://feeds.bbci.co.uk/news/rss.xml', "try showing Iranians some "
'language': None, "respect.",
'type': 'text/plain',
'value': "Trump's 'genocidal taunts' will not "
'end Iran - Zarif'
}
}],
'feed': {
'image': {
'href': 'https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif',
'link': 'https://www.bbc.co.uk/news/',
'title': 'BBC News - Home',
'language': 'en-gb',
'link': 'https://www.bbc.co.uk/news/'
}, },
'links': [{ "title": "Trump's 'genocidal taunts' will not end Iran - Zarif",
'href': 'https://www.bbc.co.uk/news/', "title_detail": {
'rel': 'alternate', "base": "http://feeds.bbci.co.uk/news/rss.xml",
'type': 'text/html' "language": None,
}], "type": "text/plain",
'title': 'BBC News - Home', "value": "Trump's 'genocidal taunts' will not " "end Iran - Zarif",
},
}
],
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}],
"title": "BBC News - Home",
}, },
'href': 'http://feeds.bbci.co.uk/news/rss.xml', "href": "http://feeds.bbci.co.uk/news/rss.xml",
'status': 200, "status": 200,
'version': 'rss20' "version": "rss20",
} }

View file

@ -1,5 +1,7 @@
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from .mocks import simple_mock
from django.test import TestCase from django.test import TestCase
from django.utils import timezone from django.utils import timezone
@ -13,36 +15,32 @@ from newsreader.news.collection.exceptions import (
) )
from newsreader.news.collection.feed import FeedStream from newsreader.news.collection.feed import FeedStream
from newsreader.news.collection.tests.factories import CollectionRuleFactory from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.feed.stream.mocks import simple_mock
class FeedStreamTestCase(TestCase): class FeedStreamTestCase(TestCase):
def setUp(self): def setUp(self):
self.patched_get = patch( self.maxDiff = None
'newsreader.news.collection.feed.requests.get'
)
self.mocked_get = self.patched_get.start()
self.patched_parse = patch( self.patched_fetch = patch("newsreader.news.collection.feed.fetch")
'newsreader.news.collection.feed.FeedStream.parse' self.mocked_fetch = self.patched_fetch.start()
)
self.mocked_parse = self.patched_parse.start()
def tearDown(self): def tearDown(self):
patch.stopall() patch.stopall()
def test_simple_stream(self): def test_simple_stream(self):
self.mocked_parse.return_value = simple_mock self.mocked_fetch.return_value = MagicMock(content=simple_mock)
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
stream = FeedStream(rule) stream = FeedStream(rule)
return_value = stream.read()
self.mocked_get.assert_called_once_with(rule.url) data, stream = stream.read()
self.assertEquals(return_value, (simple_mock, stream))
self.mocked_fetch.assert_called_once_with(rule.url)
self.assertEquals(data["entries"], data["entries"])
self.assertEquals(stream, stream)
def test_stream_raises_exception(self): def test_stream_raises_exception(self):
self.mocked_parse.side_effect = StreamException self.mocked_fetch.side_effect = StreamException
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
stream = FeedStream(rule) stream = FeedStream(rule)
@ -50,10 +48,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamException): with self.assertRaises(StreamException):
stream.read() stream.read()
self.mocked_get.assert_called_once_with(rule.url) self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_denied_exception(self): def test_stream_raises_denied_exception(self):
self.mocked_get.return_value = MagicMock(status_code=401) self.mocked_fetch.side_effect = StreamDeniedException
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
stream = FeedStream(rule) stream = FeedStream(rule)
@ -61,10 +59,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamDeniedException): with self.assertRaises(StreamDeniedException):
stream.read() stream.read()
self.mocked_get.assert_called_once_with(rule.url) self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_not_found_exception(self): def test_stream_raises_not_found_exception(self):
self.mocked_get.return_value = MagicMock(status_code=404) self.mocked_fetch.side_effect = StreamNotFoundException
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
stream = FeedStream(rule) stream = FeedStream(rule)
@ -72,10 +70,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamNotFoundException): with self.assertRaises(StreamNotFoundException):
stream.read() stream.read()
self.mocked_get.assert_called_once_with(rule.url) self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_time_out_exception(self): def test_stream_raises_time_out_exception(self):
self.mocked_get.return_value = MagicMock(status_code=408) self.mocked_fetch.side_effect = StreamTimeOutException
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
stream = FeedStream(rule) stream = FeedStream(rule)
@ -83,10 +81,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamTimeOutException): with self.assertRaises(StreamTimeOutException):
stream.read() stream.read()
self.mocked_get.assert_called_once_with(rule.url) self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_forbidden_exception(self): def test_stream_raises_forbidden_exception(self):
self.mocked_get.return_value = MagicMock(status_code=403) self.mocked_fetch.side_effect = StreamForbiddenException
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
stream = FeedStream(rule) stream = FeedStream(rule)
@ -94,13 +92,12 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamForbiddenException): with self.assertRaises(StreamForbiddenException):
stream.read() stream.read()
self.mocked_get.assert_called_once_with(rule.url) self.mocked_fetch.assert_called_once_with(rule.url)
@patch("newsreader.news.collection.feed.parse") @patch("newsreader.news.collection.feed.parse")
def test_stream_raises_parse_exception(self, mocked_parse): def test_stream_raises_parse_exception(self, mocked_parse):
self.mocked_get.return_value = MagicMock(status_code=200) self.mocked_fetch.return_value = MagicMock()
mocked_parse.side_effect = TypeError mocked_parse.side_effect = TypeError
self.patched_parse.stop()
rule = CollectionRuleFactory() rule = CollectionRuleFactory()
stream = FeedStream(rule) stream = FeedStream(rule)
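For orientation while reading these test changes: the updated FeedStream presumably delegates HTTP access to the shared fetch helper and keeps feed parsing in parse, roughly as sketched below. This is inferred from the patches and assertions above, not copied from the feed.py change in this PR; the feedparser import and the StreamParseException mapping are assumptions.

from typing import Dict, Tuple

from feedparser import parse

from newsreader.news.collection.base import Stream
from newsreader.news.collection.exceptions import StreamParseException
from newsreader.news.collection.utils import fetch


class FeedStream(Stream):
    def read(self) -> Tuple:
        # HTTP access and status-code handling live in fetch(); read() only
        # parses the payload and hands back (data, stream) for the builders.
        response = fetch(self.rule.url)
        return (self.parse(response.content), self)

    def parse(self, payload: bytes) -> Dict:
        try:
            # feedparser returns a dict-like result exposing "feed" and "entries".
            return parse(payload)
        except TypeError:
            raise StreamParseException("Could not parse given feed")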

View file

@ -0,0 +1,47 @@
simple_mock = """
<html>
<body>
<article>
<h1>Clickbait</h1>
</article>
</body>
</html>
"""
simple_feed_mock = {
"bozo": 0,
"encoding": "utf-8",
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"link": "https://www.bbc.co.uk/news/",
"links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}],
"title": "BBC News - Home",
},
"href": "http://feeds.bbci.co.uk/news/rss.xml",
"status": 200,
"version": "rss20",
}
feed_mock_without_link = {
"bozo": 0,
"encoding": "utf-8",
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"title": "BBC News - Home",
},
"href": "http://feeds.bbci.co.uk/news/rss.xml",
"status": 200,
"version": "rss20",
}

View file

@ -0,0 +1,134 @@
from unittest.mock import MagicMock, patch
from bs4 import BeautifulSoup
from django.test import TestCase
from newsreader.news.collection.base import URLBuilder, WebsiteStream
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamForbiddenException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from .mocks import feed_mock_without_link, simple_feed_mock, simple_mock
class WebsiteStreamTestCase(TestCase):
def setUp(self):
self.patched_fetch = patch("newsreader.news.collection.base.fetch")
self.mocked_fetch = self.patched_fetch.start()
def tearDown(self):
patch.stopall()
def test_simple(self):
self.mocked_fetch.return_value = MagicMock(content=simple_mock)
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
return_value = stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
self.assertEquals(return_value, (BeautifulSoup(simple_mock, "lxml"), stream))
def test_raises_exception(self):
self.mocked_fetch.side_effect = StreamException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_raises_denied_exception(self):
self.mocked_fetch.side_effect = StreamDeniedException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamDeniedException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_raises_stream_not_found_exception(self):
self.mocked_fetch.side_effect = StreamNotFoundException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamNotFoundException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_time_out_exception(self):
self.mocked_fetch.side_effect = StreamTimeOutException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamTimeOutException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_forbidden_exception(self):
self.mocked_fetch.side_effect = StreamForbiddenException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamForbiddenException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
@patch("newsreader.news.collection.base.WebsiteStream.parse")
def test_stream_raises_parse_exception(self, mocked_parse):
self.mocked_fetch.return_value = MagicMock()
mocked_parse.side_effect = StreamParseException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamParseException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
class URLBuilderTestCase(TestCase):
def test_simple(self):
initial_rule = CollectionRuleFactory()
with URLBuilder((simple_feed_mock, MagicMock(rule=initial_rule))) as builder:
rule, url = builder.build()
self.assertEquals(rule.pk, initial_rule.pk)
self.assertEquals(url, "https://www.bbc.co.uk/news/")
def test_no_link(self):
initial_rule = CollectionRuleFactory()
with URLBuilder((feed_mock_without_link, MagicMock(rule=initial_rule))) as builder:
rule, url = builder.build()
self.assertEquals(rule.pk, initial_rule.pk)
self.assertEquals(url, None)
def test_no_data(self):
initial_rule = CollectionRuleFactory()
with URLBuilder((None, MagicMock(rule=initial_rule))) as builder:
rule, url = builder.build()
self.assertEquals(rule.pk, initial_rule.pk)
self.assertEquals(url, None)
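The URLBuilder tests above drive the builder with mocked (data, stream) tuples; in the collection flow the same tuple presumably comes straight from a feed read. A usage sketch under that assumption follows; the rule lookup and its name are made up for the example.

from newsreader.news.collection.base import URLBuilder
from newsreader.news.collection.feed import FeedStream
from newsreader.news.collection.models import CollectionRule

# Illustrative wiring only: URLBuilder persists data["feed"]["link"] as
# rule.website_url when the feed provides one, and yields None otherwise.
rule = CollectionRule.objects.get(name="BBC News - Home")

data, stream = FeedStream(rule).read()

with URLBuilder((data, stream)) as builder:
    rule, website_url = builder.build()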

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,57 @@
from unittest.mock import MagicMock, patch
from django.test import TestCase
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamForbiddenException,
StreamNotFoundException,
StreamTimeOutException,
)
from newsreader.news.collection.utils import fetch
class FetchTestCase(TestCase):
def setUp(self):
self.patched_get = patch("newsreader.news.collection.utils.requests.get")
self.mocked_get = self.patched_get.start()
def tearDown(self):
patch.stopall()
def test_simple(self):
self.mocked_get.return_value = MagicMock(status_code=200, content="content")
url = "https://www.bbc.co.uk/news"
response = fetch(url)
self.assertEquals(response.content, "content")
def test_raises_not_found(self):
self.mocked_get.return_value = MagicMock(status_code=404)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamNotFoundException):
fetch(url)
def test_raises_denied(self):
self.mocked_get.return_value = MagicMock(status_code=401)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamDeniedException):
fetch(url)
def test_raises_forbidden(self):
self.mocked_get.return_value = MagicMock(status_code=403)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamForbiddenException):
fetch(url)
def test_raises_timed_out(self):
self.mocked_get.return_value = MagicMock(status_code=408)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamTimeOutException):
fetch(url)

View file

@ -1,9 +1,15 @@
from datetime import datetime, tzinfo from datetime import datetime, tzinfo
from time import mktime, struct_time from time import mktime, struct_time
from typing import Tuple from typing import Optional, Tuple
import requests
from requests.models import Response
from django.utils import timezone from django.utils import timezone
from newsreader.news.collection.response_handler import ResponseHandler
def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple: def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple:
try: try:
@ -12,3 +18,12 @@ def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple:
except TypeError: except TypeError:
return False, None return False, None
return True, published_parsed return True, published_parsed
def fetch(url: str) -> Optional[Response]:
response = requests.get(url)
with ResponseHandler(response) as response_handler:
response_handler.handle_response()
return response
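ResponseHandler itself is not part of this commit view; judging from FetchTestCase above, it presumably maps HTTP status codes onto the stream exceptions roughly as sketched here. This is an illustration consistent with those tests, not the response_handler.py added in this PR.

from newsreader.news.collection.exceptions import (
    StreamDeniedException,
    StreamForbiddenException,
    StreamNotFoundException,
    StreamTimeOutException,
)


class ResponseHandler:
    # Status codes exercised by FetchTestCase; anything else falls through
    # and the response is returned unchanged by fetch().
    STATUS_EXCEPTIONS = {
        401: StreamDeniedException,
        403: StreamForbiddenException,
        404: StreamNotFoundException,
        408: StreamTimeOutException,
    }

    def __init__(self, response):
        self.response = response

    def __enter__(self):
        return self

    def __exit__(self, *args):
        return False

    def handle_response(self):
        exception = self.STATUS_EXCEPTIONS.get(self.response.status_code)

        if exception is not None:
            raise exception()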