0.2.3 #99

Merged
sonny merged 112 commits from development into master 2020-05-23 16:58:42 +02:00
33 changed files with 1238 additions and 228 deletions
Showing only changes of commit 48a9b25545 - Show all commits

View file

@ -4,20 +4,9 @@ from newsreader.news.collection.models import CollectionRule
class CollectionRuleAdmin(admin.ModelAdmin):
fields = (
"url",
"name",
"timezone",
"category",
)
fields = ("url", "name", "timezone", "category", "favicon")
list_display = (
"name",
"category",
"url",
"last_suceeded",
"succeeded",
)
list_display = ("name", "category", "url", "last_suceeded", "succeeded")
admin.site.register(CollectionRule, CollectionRuleAdmin)

View file

@ -2,9 +2,13 @@ from typing import ContextManager, Dict, List, Optional, Tuple
import requests
from bs4 import BeautifulSoup
from django.utils import timezone
from newsreader.news.collection.exceptions import StreamParseException
from newsreader.news.collection.models import CollectionRule
from newsreader.news.collection.utils import fetch
class Stream:
@ -12,9 +16,7 @@ class Stream:
self.rule = rule
def read(self) -> Tuple:
url = self.rule.url
response = requests.get(url)
return (self.parse(response.content), self)
raise NotImplementedError
def parse(self, payload: bytes) -> Dict:
raise NotImplementedError
@ -45,7 +47,7 @@ class Client:
class Builder:
instances = []
def __init__(self, stream: Stream) -> None:
def __init__(self, stream: Tuple) -> None:
self.stream = stream
def __enter__(self) -> ContextManager:
@ -81,3 +83,39 @@ class Collector:
class Meta:
abstract = True
class WebsiteStream(Stream):
def __init__(self, url: str) -> None:
self.url = url
def read(self) -> Tuple:
response = fetch(self.url)
return (self.parse(response.content), self)
def parse(self, payload: bytes) -> BeautifulSoup:
try:
return BeautifulSoup(payload, "lxml")
except TypeError:
raise StreamParseException("Could not parse given HTML")
class URLBuilder(Builder):
def __enter__(self) -> ContextManager:
return self
def build(self) -> Tuple:
data, stream = self.stream
rule = stream.rule
try:
url = data["feed"]["link"]
except (KeyError, TypeError):
url = None
if url:
rule.website_url = url
rule.save()
return rule, url

View file

@ -0,0 +1,113 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import ContextManager, List, Optional
from urllib.parse import urljoin, urlparse
from newsreader.news.collection.base import (
Builder,
Client,
Collector,
Stream,
URLBuilder,
WebsiteStream,
)
from newsreader.news.collection.exceptions import StreamException
from newsreader.news.collection.feed import FeedClient
LINK_RELS = ["icon", "shortcut icon", "apple-touch-icon", "apple-touch-icon-precomposed"]
class FaviconBuilder(Builder):
def build(self) -> None:
rule, soup = self.stream
url = self.parse(soup, rule.website_url)
if url:
rule.favicon = url
rule.save()
def parse(self, soup, website_url) -> Optional[str]:
if not soup.head:
return
links = soup.head.find_all("link")
url = self.parse_links(links)
if not url:
return
parsed_url = urlparse(url)
if not parsed_url.scheme and not parsed_url.netloc:
if not website_url:
return
return urljoin(website_url, url)
elif not parsed_url.scheme:
return urljoin(f"https://{parsed_url.netloc}", parsed_url.path)
return url
def parse_links(self, links: List) -> Optional[str]:
favicons = set()
icons = set()
for link in links:
if not "href" in link.attrs:
continue
if "favicon" in link["href"]:
favicons.add(link["href"].lower())
if "rel" in link.attrs:
for rel in link["rel"]:
if rel in LINK_RELS:
icons.add(link["href"].lower())
if favicons:
return favicons.pop()
elif icons:
return icons.pop()
class FaviconClient(Client):
stream = WebsiteStream
def __init__(self, streams: List) -> None:
self.streams = streams
def __enter__(self) -> ContextManager:
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {executor.submit(stream.read): rule for rule, stream in self.streams}
for future in as_completed(futures):
rule = futures[future]
try:
response_data, stream = future.result()
except StreamException:
continue
yield (rule, response_data)
class FaviconCollector(Collector):
feed_client, favicon_client = (FeedClient, FaviconClient)
url_builder, favicon_builder = (URLBuilder, FaviconBuilder)
def collect(self, rules: Optional[List] = None) -> None:
streams = []
with self.feed_client(rules=rules) as client:
for data, stream in client:
with self.url_builder((data, stream)) as builder:
rule, url = builder.build()
if not url:
continue
streams.append((rule, WebsiteStream(url)))
with self.favicon_client(streams) as client:
for rule, data in client:
with self.favicon_builder((rule, data)) as builder:
builder.build()

View file

@ -3,7 +3,6 @@ from typing import ContextManager, Dict, Generator, List, Optional, Tuple
import bleach
import pytz
import requests
from feedparser import parse
@ -19,7 +18,7 @@ from newsreader.news.collection.exceptions import (
)
from newsreader.news.collection.models import CollectionRule
from newsreader.news.collection.response_handler import ResponseHandler
from newsreader.news.collection.utils import build_publication_date
from newsreader.news.collection.utils import build_publication_date, fetch
from newsreader.news.posts.models import Post
@ -92,10 +91,7 @@ class FeedBuilder(Builder):
class FeedStream(Stream):
def read(self) -> Tuple:
url = self.rule.url
response = requests.get(url)
with ResponseHandler(response) as response_handler:
response_handler.handle_response()
response = fetch(url)
return (self.parse(response.content), self)

View file

@ -0,0 +1,11 @@
from django.core.management.base import BaseCommand
from newsreader.news.collection.favicon import FaviconCollector
class Command(BaseCommand):
help = "Fetch favicons for collection rules"
def handle(self, *args, **options):
collector = FaviconCollector()
collector.collect()

View file

@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2019-06-23 18:37
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("collection", "0006_collectionrule_error")]
operations = [
migrations.AlterField(
model_name="collectionrule",
name="favicon",
field=models.ImageField(default="favicons/default-favicon.ico", upload_to="favicons/"),
)
]

View file

@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2019-06-23 18:47
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("collection", "0007_auto_20190623_1837")]
operations = [
migrations.AlterField(
model_name="collectionrule",
name="favicon",
field=models.URLField(blank=True, null=True),
)
]

View file

@ -0,0 +1,16 @@
# Generated by Django 2.2 on 2019-06-27 21:27
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("collection", "0008_auto_20190623_1847")]
operations = [
migrations.AddField(
model_name="collectionrule",
name="website_url",
field=models.URLField(blank=True, editable=False, null=True),
)
]

View file

@ -0,0 +1,19 @@
# Generated by Django 2.2 on 2019-06-28 21:42
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [("collection", "0009_collectionrule_website_url")]
operations = [
migrations.AlterField(
model_name="collectionrule", name="url", field=models.URLField(max_length=1024)
),
migrations.AlterField(
model_name="collectionrule",
name="website_url",
field=models.URLField(blank=True, editable=False, max_length=1024, null=True),
),
]

View file

@ -1,5 +1,6 @@
import pytz
from django.conf import settings
from django.db import models
from django.utils.translation import gettext as _
@ -8,8 +9,9 @@ class CollectionRule(models.Model):
name = models.CharField(max_length=100)
source = models.CharField(max_length=100)
url = models.URLField()
favicon = models.ImageField(blank=True, null=True)
url = models.URLField(max_length=1024)
website_url = models.URLField(max_length=1024, editable=False, blank=True, null=True)
favicon = models.URLField(blank=True, null=True)
timezone = models.CharField(
choices=((timezone, timezone) for timezone in pytz.all_timezones),
@ -23,7 +25,7 @@ class CollectionRule(models.Model):
null=True,
verbose_name=_("Category"),
help_text=_("Posts from this rule will be tagged with this category"),
on_delete=models.SET_NULL
on_delete=models.SET_NULL,
)
last_suceeded = models.DateTimeField(blank=True, null=True)

View file

@ -1 +1,4 @@
from .favicon import *
from .feed import *
from .tests import *
from .utils import *

View file

@ -10,3 +10,4 @@ class CollectionRuleFactory(factory.django.DjangoModelFactory):
name = factory.Sequence(lambda n: "CollectionRule-{}".format(n))
source = factory.Faker("name")
url = factory.Faker("url")
website_url = factory.Faker("url")

View file

@ -0,0 +1,3 @@
from .builder import *
from .client import *
from .collector import *

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,88 @@
from bs4 import BeautifulSoup
simple_mock = BeautifulSoup(
"""
<html>
<head>
<link rel='shortcut icon' href='https://www.bbc.com/favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_without_url = BeautifulSoup(
"""
<html>
<head>
<link rel='shortcut icon' href='favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_without_header = BeautifulSoup(
"""
<html>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_with_weird_path = BeautifulSoup(
"""
<html>
<head>
<link rel='shortcut icon' href='//www.theguardian.com/jabadaba/doe/favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_with_other_url = BeautifulSoup(
"""
<html>
<head>
<link rel='stylesheet' href='https://www.theguardian.com/main.css' />
<link rel='icon' href='https://www.theguardian.com/icon.png' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)
mock_with_multiple_icons = BeautifulSoup(
"""
<html>
<head>
<link rel='stylesheet' href='https://www.theguardian.com/main.css' />
<link rel='icon' href='https://www.theguardian.com/icon.png' />
<link rel='shortcut icon' href='https://www.theguardian.com/icon.png' />
<link rel='apple-touch-icon' href='https://www.theguardian.com/icon.png' />
<link rel='apple-touch-icon-precomposed' href='https://www.theguardian.com/icon.png' />
<link rel='shortcut icon' href='https://www.bbc.com/favicon.ico' />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)

View file

@ -0,0 +1,60 @@
from freezegun import freeze_time
from django.test import TestCase
from newsreader.news.collection.favicon import FaviconBuilder
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.favicon.builder.mocks import *
class FaviconBuilderTestCase(TestCase):
def setUp(self):
self.maxDiff = None
def test_simple(self):
rule = CollectionRuleFactory(favicon=None)
with FaviconBuilder((rule, simple_mock)) as builder:
builder.build()
self.assertEquals(rule.favicon, "https://www.bbc.com/favicon.ico")
def test_without_url(self):
rule = CollectionRuleFactory(website_url="https://www.theguardian.com/", favicon=None)
with FaviconBuilder((rule, mock_without_url)) as builder:
builder.build()
self.assertEquals(rule.favicon, "https://www.theguardian.com/favicon.ico")
def test_without_header(self):
rule = CollectionRuleFactory(favicon=None)
with FaviconBuilder((rule, mock_without_header)) as builder:
builder.build()
self.assertEquals(rule.favicon, None)
def test_weird_path(self):
rule = CollectionRuleFactory(favicon=None)
with FaviconBuilder((rule, mock_with_weird_path)) as builder:
builder.build()
self.assertEquals(rule.favicon, "https://www.theguardian.com/jabadaba/doe/favicon.ico")
def test_other_url(self):
rule = CollectionRuleFactory(favicon=None)
with FaviconBuilder((rule, mock_with_other_url)) as builder:
builder.build()
self.assertEquals(rule.favicon, "https://www.theguardian.com/icon.png")
def test_url_with_favicon_takes_precedence(self):
rule = CollectionRuleFactory(favicon=None)
with FaviconBuilder((rule, mock_with_multiple_icons)) as builder:
builder.build()
self.assertEquals(rule.favicon, "https://www.bbc.com/favicon.ico")

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,12 @@
from bs4 import BeautifulSoup
simple_mock = BeautifulSoup(
"""
<html>
<body>
<article />
</body>
</htm>
""",
"lxml",
)

View file

@ -0,0 +1,91 @@
from unittest.mock import MagicMock
from django.test import TestCase
from newsreader.news.collection.base import WebsiteStream
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamNotFoundException,
StreamTimeOutException,
)
from newsreader.news.collection.favicon import FaviconClient
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.favicon.client.mocks import simple_mock
class FaviconClientTestCase(TestCase):
def setUp(self):
self.maxDiff = None
def test_simple(self):
rule = CollectionRuleFactory()
stream = MagicMock(url="https://www.bbc.com")
stream.read.return_value = (simple_mock, stream)
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
self.assertEquals(rule.pk, rule.pk)
self.assertEquals(data, simple_mock)
stream.read.assert_called_once_with()
def test_client_catches_stream_exception(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
def test_client_catches_stream_not_found_exception(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamNotFoundException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
def test_client_catches_stream_denied_exception(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamDeniedException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
def test_client_catches_stream_timed_out(self):
rule = CollectionRuleFactory(error=None, succeeded=True)
stream = MagicMock(url="https://www.bbc.com")
stream.read.side_effect = StreamTimeOutException
with FaviconClient([(rule, stream)]) as client:
for rule, data in client:
pass
stream.read.assert_called_once_with()
# The favicon client does not set CollectionRule errors
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,159 @@
from time import struct_time
from bs4 import BeautifulSoup
feed_mock = {
"bozo": 0,
"encoding": "utf-8",
"entries": [
{
"guidislink": False,
"href": "",
"id": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"link": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"links": [
{
"href": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"rel": "alternate",
"type": "text/html",
}
],
"media_thumbnail": [
{
"height": "1152",
"url": "http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg",
"width": "2048",
}
],
"published": "Mon, 20 May 2019 16:07:37 GMT",
"published_parsed": struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)),
"summary": "Foreign Minister Mohammad Javad Zarif says the US "
"president should try showing Iranians some respect.",
"summary_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/html",
"value": "Foreign Minister Mohammad Javad "
"Zarif says the US president should "
"try showing Iranians some "
"respect.",
},
"title": "Trump's genocidal taunts will not end Iran - Zarif",
"title_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/plain",
"value": "Trump's genocidal taunts will not " "end Iran - Zarif",
},
},
{
"guidislink": False,
"href": "",
"id": "https://www.bbc.co.uk/news/technology-48334739",
"link": "https://www.bbc.co.uk/news/technology-48334739",
"links": [
{
"href": "https://www.bbc.co.uk/news/technology-48334739",
"rel": "alternate",
"type": "text/html",
}
],
"media_thumbnail": [
{
"height": "432",
"url": "http://c.files.bbci.co.uk/4789/production/_107031381_mediaitem107028670.jpg",
"width": "768",
}
],
"published": "Mon, 20 May 2019 12:19:19 GMT",
"published_parsed": struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0)),
"summary": "Google's move to end business ties with Huawei will "
"affect current devices and future purchases.",
"summary_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/html",
"value": "Google's move to end business ties "
"with Huawei will affect current "
"devices and future purchases.",
},
"title": "Huawei's Android loss: How it affects you",
"title_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/plain",
"value": "Huawei's Android loss: How it " "affects you",
},
},
{
"guidislink": False,
"href": "",
"id": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
"link": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
"links": [
{
"href": "https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
"rel": "alternate",
"type": "text/html",
}
],
"media_thumbnail": [
{
"height": "549",
"url": "http://c.files.bbci.co.uk/11D67/production/_107036037_lgbtheadjpg.jpg",
"width": "976",
}
],
"published": "Mon, 20 May 2019 16:32:38 GMT",
"published_parsed": struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)),
"summary": "Police are investigating the messages while an MP "
"calls for a protest exclusion zone to protect "
"children.",
"summary_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/html",
"value": "Police are investigating the "
"messages while an MP calls for a "
"protest exclusion zone to protect "
"children.",
},
"title": "Birmingham head teacher threatened over LGBT lessons",
"title_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/plain",
"value": "Birmingham head teacher threatened " "over LGBT lessons",
},
},
],
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"link": "https://www.bbc.co.uk/news/",
"links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}],
"title": "BBC News - Home",
},
"href": "http://feeds.bbci.co.uk/news/rss.xml",
"status": 200,
"version": "rss20",
}
website_mock = BeautifulSoup(
"""
<html>
<head>
<link rel="shortcut icon" href="https://www.bbc.co.uk/news/favicon.ico" />
</head>
<body>
<article />
</body>
</html>
""",
"lxml",
)

View file

@ -0,0 +1,147 @@
from unittest.mock import MagicMock, patch
import pytz
from bs4 import BeautifulSoup
from .mocks import feed_mock, website_mock
from django.test import TestCase
from django.utils import timezone
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamForbiddenException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.favicon import FaviconCollector
from newsreader.news.collection.tests.factories import CollectionRuleFactory
class FaviconCollectorTestCase(TestCase):
def setUp(self):
self.maxDiff = None
self.patched_feed_client = patch("newsreader.news.collection.favicon.FeedClient.__enter__")
self.mocked_feed_client = self.patched_feed_client.start()
self.patched_website_read = patch("newsreader.news.collection.favicon.WebsiteStream.read")
self.mocked_website_read = self.patched_website_read.start()
def tearDown(self):
patch.stopall()
def test_simple(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.return_value = (website_mock, MagicMock())
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, "https://www.bbc.co.uk/news/favicon.ico")
def test_empty_stream(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.return_value = (BeautifulSoup("", "lxml"), MagicMock())
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_not_found(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamNotFoundException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_denied(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamDeniedException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_forbidden(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamForbiddenException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_timed_out(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamTimeOutException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)
def test_wrong_stream_content_type(self):
rule = CollectionRuleFactory(succeeded=True, error=None)
self.mocked_feed_client.return_value = [(feed_mock, MagicMock(rule=rule))]
self.mocked_website_read.side_effect = StreamParseException
collector = FaviconCollector()
collector.collect()
rule.refresh_from_db()
self.assertEquals(rule.succeeded, True)
self.assertEquals(rule.error, None)
self.assertEquals(rule.website_url, "https://www.bbc.co.uk/news/")
self.assertEquals(rule.favicon, None)

View file

@ -5,26 +5,27 @@ import pytz
from freezegun import freeze_time
from .mocks import *
from django.test import TestCase
from django.utils import timezone
from newsreader.news.collection.feed import FeedBuilder
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.feed.builder.mocks import *
from newsreader.news.posts.models import Post
from newsreader.news.posts.tests.factories import PostFactory
class FeedBuilderTestCase(TestCase):
def setUp(self):
pass
self.maxDiff = None
def test_basic_entry(self):
builder = FeedBuilder
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((simple_mock, mock_stream,)) as builder:
with builder((simple_mock, mock_stream)) as builder:
builder.save()
post = Post.objects.get()
@ -36,26 +37,19 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(Post.objects.count(), 1)
self.assertEquals(
post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
post.url,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168")
self.assertEquals(
post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
self.assertEquals(post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
def test_multiple_entries(self):
builder = FeedBuilder
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((multiple_mock, mock_stream,)) as builder:
with builder((multiple_mock, mock_stream)) as builder:
builder.save()
posts = Post.objects.order_by("id")
@ -70,19 +64,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.publication_date, aware_date)
self.assertEquals(
first_post.remote_identifier,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(
first_post.url,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168")
self.assertEquals(
first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
d = datetime.combine(date(2019, 5, 20), time(hour=12, minute=19, second=19))
aware_date = pytz.utc.localize(d)
@ -90,26 +77,19 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(second_post.publication_date, aware_date)
self.assertEquals(
second_post.remote_identifier,
"https://www.bbc.co.uk/news/technology-48334739"
second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
self.assertEquals(
second_post.url,
"https://www.bbc.co.uk/news/technology-48334739"
)
self.assertEquals(second_post.url, "https://www.bbc.co.uk/news/technology-48334739")
self.assertEquals(
second_post.title,
"Huawei's Android loss: How it affects you"
)
self.assertEquals(second_post.title, "Huawei's Android loss: How it affects you")
def test_entry_without_remote_identifier(self):
builder = FeedBuilder
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((mock_without_identifier, mock_stream,)) as builder:
with builder((mock_without_identifier, mock_stream)) as builder:
builder.save()
posts = Post.objects.order_by("id")
@ -124,15 +104,9 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.remote_identifier, None)
self.assertEquals(
first_post.url,
"https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(first_post.url, "https://www.bbc.co.uk/news/world-us-canada-48338168")
self.assertEquals(
first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
@freeze_time("2019-10-30 12:30:00")
def test_entry_without_publication_date(self):
@ -140,7 +114,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((mock_without_publish_date, mock_stream,)) as builder:
with builder((mock_without_publish_date, mock_stream)) as builder:
builder.save()
posts = Post.objects.order_by("id")
@ -151,14 +125,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now())
self.assertEquals(
first_post.remote_identifier,
'https://www.bbc.co.uk/news/world-us-canada-48338168'
first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(second_post.created, timezone.now())
self.assertEquals(
second_post.remote_identifier,
'https://www.bbc.co.uk/news/technology-48334739'
second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
@freeze_time("2019-10-30 12:30:00")
@ -167,7 +139,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((mock_without_url, mock_stream,)) as builder:
with builder((mock_without_url, mock_stream)) as builder:
builder.save()
posts = Post.objects.order_by("id")
@ -178,14 +150,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now())
self.assertEquals(
first_post.remote_identifier,
'https://www.bbc.co.uk/news/world-us-canada-48338168'
first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(second_post.created, timezone.now())
self.assertEquals(
second_post.remote_identifier,
'https://www.bbc.co.uk/news/technology-48334739'
second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
@freeze_time("2019-10-30 12:30:00")
@ -194,7 +164,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((mock_without_body, mock_stream,)) as builder:
with builder((mock_without_body, mock_stream)) as builder:
builder.save()
posts = Post.objects.order_by("id")
@ -205,14 +175,13 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now())
self.assertEquals(
first_post.remote_identifier,
'https://www.bbc.co.uk/news/world-us-canada-48338168'
first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(second_post.created, timezone.now())
self.assertEquals(
second_post.remote_identifier,
'https://www.bbc.co.uk/news/uk-england-birmingham-48339080'
"https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
)
@freeze_time("2019-10-30 12:30:00")
@ -221,7 +190,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((mock_without_author, mock_stream,)) as builder:
with builder((mock_without_author, mock_stream)) as builder:
builder.save()
posts = Post.objects.order_by("id")
@ -232,14 +201,12 @@ class FeedBuilderTestCase(TestCase):
self.assertEquals(first_post.created, timezone.now())
self.assertEquals(
first_post.remote_identifier,
'https://www.bbc.co.uk/news/world-us-canada-48338168'
first_post.remote_identifier, "https://www.bbc.co.uk/news/world-us-canada-48338168"
)
self.assertEquals(second_post.created, timezone.now())
self.assertEquals(
second_post.remote_identifier,
'https://www.bbc.co.uk/news/technology-48334739'
second_post.remote_identifier, "https://www.bbc.co.uk/news/technology-48334739"
)
def test_empty_entries(self):
@ -247,7 +214,7 @@ class FeedBuilderTestCase(TestCase):
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((mock_without_entries, mock_stream,)) as builder:
with builder((mock_without_entries, mock_stream)) as builder:
builder.save()
self.assertEquals(Post.objects.count(), 0)
@ -265,7 +232,7 @@ class FeedBuilderTestCase(TestCase):
remote_identifier="a5479c66-8fae-11e9-8422-00163ef6bee7", rule=rule
)
with builder((mock_with_update_entries, mock_stream,)) as builder:
with builder((mock_with_update_entries, mock_stream)) as builder:
builder.save()
self.assertEquals(Post.objects.count(), 3)
@ -274,21 +241,17 @@ class FeedBuilderTestCase(TestCase):
existing_second_post.refresh_from_db()
self.assertEquals(
existing_first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
existing_first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif"
)
self.assertEquals(
existing_second_post.title,
"Huawei's Android loss: How it affects you"
)
self.assertEquals(existing_second_post.title, "Huawei's Android loss: How it affects you")
def test_html_sanitizing(self):
builder = FeedBuilder
rule = CollectionRuleFactory()
mock_stream = MagicMock(rule=rule)
with builder((mock_with_html, mock_stream,)) as builder:
with builder((mock_with_html, mock_stream)) as builder:
builder.save()
post = Post.objects.get()

View file

@ -1,5 +1,7 @@
from unittest.mock import MagicMock, patch
from .mocks import simple_mock
from django.test import TestCase
from django.utils import timezone
@ -7,15 +9,17 @@ from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.feed import FeedClient
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.feed.client.mocks import simple_mock
class FeedClientTestCase(TestCase):
def setUp(self):
self.maxDiff = None
self.patched_read = patch("newsreader.news.collection.feed.FeedStream.read")
self.mocked_read = self.patched_read.start()
@ -85,3 +89,16 @@ class FeedClientTestCase(TestCase):
self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with()
def test_client_catches_stream_parse_exception(self):
rule = CollectionRuleFactory.create()
mock_stream = MagicMock(rule=rule)
self.mocked_read.side_effect = StreamParseException("Stream has wrong contents")
with FeedClient([rule]) as client:
for data, stream in client:
self.assertEquals(data, {"entries": []})
self.assertEquals(stream.rule.error, "Stream has wrong contents")
self.assertEquals(stream.rule.succeeded, False)
self.mocked_read.assert_called_once_with()

View file

@ -6,17 +6,26 @@ import pytz
from freezegun import freeze_time
from django.test import TestCase
from django.utils import timezone
from newsreader.news.collection.feed import FeedCollector
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.feed.collector.mocks import (
from .mocks import (
duplicate_mock,
empty_mock,
multiple_mock,
multiple_update_mock,
)
from django.test import TestCase
from django.utils import timezone
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamForbiddenException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.feed import FeedCollector
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.utils import build_publication_date
from newsreader.news.posts.models import Post
from newsreader.news.posts.tests.factories import PostFactory
@ -24,14 +33,12 @@ from newsreader.news.posts.tests.factories import PostFactory
class FeedCollectorTestCase(TestCase):
def setUp(self):
self.patched_get = patch(
'newsreader.news.collection.feed.requests.get'
)
self.mocked_get = self.patched_get.start()
self.maxDiff = None
self.patched_parse = patch(
'newsreader.news.collection.feed.FeedStream.parse'
)
self.patched_get = patch("newsreader.news.collection.feed.fetch")
self.mocked_fetch = self.patched_get.start()
self.patched_parse = patch("newsreader.news.collection.feed.FeedStream.parse")
self.mocked_parse = self.patched_parse.start()
def tearDown(self):
@ -54,7 +61,7 @@ class FeedCollectorTestCase(TestCase):
@freeze_time("2019-10-30 12:30:00")
def test_emtpy_batch(self):
self.mocked_get.return_value = MagicMock(status_code=200)
self.mocked_fetch.return_value = MagicMock()
self.mocked_parse.return_value = empty_mock
rule = CollectionRuleFactory()
@ -69,7 +76,7 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.last_suceeded, timezone.now())
def test_not_found(self):
self.mocked_get.return_value = MagicMock(status_code=404)
self.mocked_fetch.side_effect = StreamNotFoundException
rule = CollectionRuleFactory()
collector = FeedCollector()
@ -82,7 +89,7 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.error, "Stream not found")
def test_denied(self):
self.mocked_get.return_value = MagicMock(status_code=404)
self.mocked_fetch.side_effect = StreamDeniedException
last_suceeded = timezone.make_aware(
datetime.combine(date=date(2019, 10, 30), time=time(12, 30))
)
@ -95,11 +102,11 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(Post.objects.count(), 0)
self.assertEquals(rule.succeeded, False)
self.assertEquals(rule.error, "Stream not found")
self.assertEquals(rule.error, "Stream does not have sufficient permissions")
self.assertEquals(rule.last_suceeded, last_suceeded)
def test_forbidden(self):
self.mocked_get.return_value = MagicMock(status_code=403)
self.mocked_fetch.side_effect = StreamForbiddenException
last_suceeded = timezone.make_aware(
datetime.combine(date=date(2019, 10, 30), time=time(12, 30))
)
@ -116,7 +123,7 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.last_suceeded, last_suceeded)
def test_timed_out(self):
self.mocked_get.return_value = MagicMock(status_code=408)
self.mocked_fetch.side_effect = StreamTimeOutException
last_suceeded = timezone.make_aware(
datetime.combine(date=date(2019, 10, 30), time=time(12, 30))
)
@ -138,8 +145,7 @@ class FeedCollectorTestCase(TestCase):
rule = CollectionRuleFactory()
_, aware_datetime = build_publication_date(
struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)),
pytz.utc
struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)), pytz.utc
)
first_post = PostFactory(
@ -148,12 +154,11 @@ class FeedCollectorTestCase(TestCase):
body="Foreign Minister Mohammad Javad Zarif says the US "
"president should try showing Iranians some respect.",
publication_date=aware_datetime,
rule=rule
rule=rule,
)
_, aware_datetime = build_publication_date(
struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0,)),
pytz.utc
struct_time((2019, 5, 20, 12, 19, 19, 0, 140, 0)), pytz.utc
)
second_post = PostFactory(
@ -162,22 +167,21 @@ class FeedCollectorTestCase(TestCase):
body="Google's move to end business ties with Huawei will "
"affect current devices and future purchases.",
publication_date=aware_datetime,
rule=rule
rule=rule,
)
_, aware_datetime = build_publication_date(
struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)),
pytz.utc
struct_time((2019, 5, 20, 16, 32, 38, 0, 140, 0)), pytz.utc
)
third_post = PostFactory(
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
title="Birmingham head teacher threatened over LGBT lessons",
body="Police are investigating the messages while an MP "
"calls for a protest exclusion zone \"to protect "
"children\".",
'calls for a protest exclusion zone "to protect '
'children".',
publication_date=aware_datetime,
rule=rule
rule=rule,
)
collector = FeedCollector()
@ -201,7 +205,7 @@ class FeedCollectorTestCase(TestCase):
title="Trump",
body="Foreign Minister Mohammad Javad Zarif",
publication_date=timezone.now(),
rule=rule
rule=rule,
)
second_post = PostFactory(
@ -210,7 +214,7 @@ class FeedCollectorTestCase(TestCase):
title="Huawei's Android loss: How it affects you",
body="Google's move to end business ties with Huawei will",
publication_date=timezone.now(),
rule=rule
rule=rule,
)
third_post = PostFactory(
@ -219,7 +223,7 @@ class FeedCollectorTestCase(TestCase):
title="Birmingham head teacher threatened over LGBT lessons",
body="Police are investigating the messages while an MP",
publication_date=timezone.now(),
rule=rule
rule=rule,
)
collector = FeedCollector()
@ -235,17 +239,8 @@ class FeedCollectorTestCase(TestCase):
self.assertEquals(rule.last_suceeded, timezone.now())
self.assertEquals(rule.error, None)
self.assertEquals(
first_post.title,
"Trump's 'genocidal taunts' will not end Iran - Zarif"
)
self.assertEquals(first_post.title, "Trump's 'genocidal taunts' will not end Iran - Zarif")
self.assertEquals(
second_post.title,
"Huawei's Android loss: How it affects you"
)
self.assertEquals(second_post.title, "Huawei's Android loss: How it affects you")
self.assertEquals(
third_post.title,
'Birmingham head teacher threatened over LGBT lessons'
)
self.assertEquals(third_post.title, "Birmingham head teacher threatened over LGBT lessons")

View file

@ -9,7 +9,7 @@ from newsreader.news.posts.tests.factories import PostFactory
class FeedDuplicateHandlerTestCase(TestCase):
def setUp(self):
pass
self.maxDiff = None
def test_duplicate_entries_with_remote_identifiers(self):
rule = CollectionRuleFactory()
@ -19,7 +19,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
new_post = PostFactory.build(
remote_identifier="28f79ae4-8f9a-11e9-b143-00163ef6bee7",
title="title got updated",
rule=rule
rule=rule,
)
with FeedDuplicateHandler(rule) as duplicate_handler:
@ -45,7 +45,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
body="Google's move to end business ties with Huawei will affect current devices",
publication_date=publication_date,
remote_identifier=None,
rule=rule
rule=rule,
)
new_post = PostFactory.build(
url="https://www.bbc.co.uk/news/uk-england-birmingham-48339080",
@ -53,7 +53,7 @@ class FeedDuplicateHandlerTestCase(TestCase):
body="Google's move to end business ties with Huawei will affect current devices",
publication_date=publication_date,
remote_identifier=None,
rule=rule
rule=rule,
)
with FeedDuplicateHandler(rule) as duplicate_handler:

View file

@ -1,61 +1,62 @@
from time import struct_time
simple_mock = {
'bozo': 0,
'encoding': 'utf-8',
'entries': [{
'guidislink': False,
'href': '',
'id': 'https://www.bbc.co.uk/news/world-us-canada-48338168',
'link': 'https://www.bbc.co.uk/news/world-us-canada-48338168',
'links': [{
'href': 'https://www.bbc.co.uk/news/world-us-canada-48338168',
'rel': 'alternate',
'type': 'text/html'
}],
'media_thumbnail': [{
'height': '1152',
'url': 'http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg',
'width': '2048'
}],
'published': 'Mon, 20 May 2019 16:07:37 GMT',
'published_parsed': struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)),
'summary': 'Foreign Minister Mohammad Javad Zarif says the US '
'president should try showing Iranians some respect.',
'summary_detail': {
'base': 'http://feeds.bbci.co.uk/news/rss.xml',
'language': None,
'type': 'text/html',
'value': 'Foreign Minister Mohammad Javad '
'Zarif says the US president should '
'try showing Iranians some '
'respect.'
},
'title': "Trump's 'genocidal taunts' will not end Iran - Zarif",
'title_detail': {
'base': 'http://feeds.bbci.co.uk/news/rss.xml',
'language': None,
'type': 'text/plain',
'value': "Trump's 'genocidal taunts' will not "
'end Iran - Zarif'
}
}],
'feed': {
'image': {
'href': 'https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif',
'link': 'https://www.bbc.co.uk/news/',
'title': 'BBC News - Home',
'language': 'en-gb',
'link': 'https://www.bbc.co.uk/news/'
"bozo": 1,
"encoding": "utf-8",
"entries": [
{
"guidislink": False,
"href": "",
"id": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"link": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"links": [
{
"href": "https://www.bbc.co.uk/news/world-us-canada-48338168",
"rel": "alternate",
"type": "text/html",
}
],
"media_thumbnail": [
{
"height": "1152",
"url": "http://c.files.bbci.co.uk/7605/production/_107031203_mediaitem107031202.jpg",
"width": "2048",
}
],
"published": "Mon, 20 May 2019 16:07:37 GMT",
"published_parsed": struct_time((2019, 5, 20, 16, 7, 37, 0, 140, 0)),
"summary": "Foreign Minister Mohammad Javad Zarif says the US "
"president should try showing Iranians some respect.",
"summary_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/html",
"value": "Foreign Minister Mohammad Javad "
"Zarif says the US president should "
"try showing Iranians some "
"respect.",
},
'links': [{
'href': 'https://www.bbc.co.uk/news/',
'rel': 'alternate',
'type': 'text/html'
}],
'title': 'BBC News - Home',
"title": "Trump's 'genocidal taunts' will not end Iran - Zarif",
"title_detail": {
"base": "http://feeds.bbci.co.uk/news/rss.xml",
"language": None,
"type": "text/plain",
"value": "Trump's 'genocidal taunts' will not " "end Iran - Zarif",
},
}
],
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}],
"title": "BBC News - Home",
},
'href': 'http://feeds.bbci.co.uk/news/rss.xml',
'status': 200,
'version': 'rss20'
"href": "http://feeds.bbci.co.uk/news/rss.xml",
"status": 200,
"version": "rss20",
}

View file

@ -1,5 +1,7 @@
from unittest.mock import MagicMock, patch
from .mocks import simple_mock
from django.test import TestCase
from django.utils import timezone
@ -13,36 +15,32 @@ from newsreader.news.collection.exceptions import (
)
from newsreader.news.collection.feed import FeedStream
from newsreader.news.collection.tests.factories import CollectionRuleFactory
from newsreader.news.collection.tests.feed.stream.mocks import simple_mock
class FeedStreamTestCase(TestCase):
def setUp(self):
self.patched_get = patch(
'newsreader.news.collection.feed.requests.get'
)
self.mocked_get = self.patched_get.start()
self.maxDiff = None
self.patched_parse = patch(
'newsreader.news.collection.feed.FeedStream.parse'
)
self.mocked_parse = self.patched_parse.start()
self.patched_fetch = patch("newsreader.news.collection.feed.fetch")
self.mocked_fetch = self.patched_fetch.start()
def tearDown(self):
patch.stopall()
def test_simple_stream(self):
self.mocked_parse.return_value = simple_mock
self.mocked_fetch.return_value = MagicMock(content=simple_mock)
rule = CollectionRuleFactory()
stream = FeedStream(rule)
return_value = stream.read()
self.mocked_get.assert_called_once_with(rule.url)
self.assertEquals(return_value, (simple_mock, stream))
data, stream = stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
self.assertEquals(data["entries"], data["entries"])
self.assertEquals(stream, stream)
def test_stream_raises_exception(self):
self.mocked_parse.side_effect = StreamException
self.mocked_fetch.side_effect = StreamException
rule = CollectionRuleFactory()
stream = FeedStream(rule)
@ -50,10 +48,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamException):
stream.read()
self.mocked_get.assert_called_once_with(rule.url)
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_denied_exception(self):
self.mocked_get.return_value = MagicMock(status_code=401)
self.mocked_fetch.side_effect = StreamDeniedException
rule = CollectionRuleFactory()
stream = FeedStream(rule)
@ -61,10 +59,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamDeniedException):
stream.read()
self.mocked_get.assert_called_once_with(rule.url)
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_not_found_exception(self):
self.mocked_get.return_value = MagicMock(status_code=404)
self.mocked_fetch.side_effect = StreamNotFoundException
rule = CollectionRuleFactory()
stream = FeedStream(rule)
@ -72,10 +70,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamNotFoundException):
stream.read()
self.mocked_get.assert_called_once_with(rule.url)
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_time_out_exception(self):
self.mocked_get.return_value = MagicMock(status_code=408)
self.mocked_fetch.side_effect = StreamTimeOutException
rule = CollectionRuleFactory()
stream = FeedStream(rule)
@ -83,10 +81,10 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamTimeOutException):
stream.read()
self.mocked_get.assert_called_once_with(rule.url)
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_forbidden_exception(self):
self.mocked_get.return_value = MagicMock(status_code=403)
self.mocked_fetch.side_effect = StreamForbiddenException
rule = CollectionRuleFactory()
stream = FeedStream(rule)
@ -94,13 +92,12 @@ class FeedStreamTestCase(TestCase):
with self.assertRaises(StreamForbiddenException):
stream.read()
self.mocked_get.assert_called_once_with(rule.url)
self.mocked_fetch.assert_called_once_with(rule.url)
@patch("newsreader.news.collection.feed.parse")
def test_stream_raises_parse_exception(self, mocked_parse):
self.mocked_get.return_value = MagicMock(status_code=200)
self.mocked_fetch.return_value = MagicMock()
mocked_parse.side_effect = TypeError
self.patched_parse.stop()
rule = CollectionRuleFactory()
stream = FeedStream(rule)

View file

@ -0,0 +1,47 @@
simple_mock = """
<html>
<body>
<article>
<h1>Clickbait</h1>
</article>
</body>
</html>
"""
simple_feed_mock = {
"bozo": 0,
"encoding": "utf-8",
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"link": "https://www.bbc.co.uk/news/",
"links": [{"href": "https://www.bbc.co.uk/news/", "rel": "alternate", "type": "text/html"}],
"title": "BBC News - Home",
},
"href": "http://feeds.bbci.co.uk/news/rss.xml",
"status": 200,
"version": "rss20",
}
feed_mock_without_link = {
"bozo": 0,
"encoding": "utf-8",
"feed": {
"image": {
"href": "https://news.bbcimg.co.uk/nol/shared/img/bbc_news_120x60.gif",
"link": "https://www.bbc.co.uk/news/",
"title": "BBC News - Home",
"language": "en-gb",
"link": "https://www.bbc.co.uk/news/",
},
"title": "BBC News - Home",
},
"href": "http://feeds.bbci.co.uk/news/rss.xml",
"status": 200,
"version": "rss20",
}

View file

@ -0,0 +1,134 @@
from unittest.mock import MagicMock, patch
from bs4 import BeautifulSoup
from .mocks import feed_mock_without_link, simple_feed_mock, simple_mock
from django.test import TestCase
from newsreader.news.collection.base import URLBuilder, WebsiteStream
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamException,
StreamForbiddenException,
StreamNotFoundException,
StreamParseException,
StreamTimeOutException,
)
from newsreader.news.collection.tests.factories import CollectionRuleFactory
class WebsiteStreamTestCase(TestCase):
def setUp(self):
self.patched_fetch = patch("newsreader.news.collection.base.fetch")
self.mocked_fetch = self.patched_fetch.start()
def tearDown(self):
patch.stopall()
def test_simple(self):
self.mocked_fetch.return_value = MagicMock(content=simple_mock)
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
return_value = stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
self.assertEquals(return_value, (BeautifulSoup(simple_mock, "lxml"), stream))
def test_raises_exception(self):
self.mocked_fetch.side_effect = StreamException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_raises_denied_exception(self):
self.mocked_fetch.side_effect = StreamDeniedException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamDeniedException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_raises_stream_not_found_exception(self):
self.mocked_fetch.side_effect = StreamNotFoundException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamNotFoundException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_time_out_exception(self):
self.mocked_fetch.side_effect = StreamTimeOutException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamTimeOutException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
def test_stream_raises_forbidden_exception(self):
self.mocked_fetch.side_effect = StreamForbiddenException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamForbiddenException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
@patch("newsreader.news.collection.base.WebsiteStream.parse")
def test_stream_raises_parse_exception(self, mocked_parse):
self.mocked_fetch.return_value = MagicMock()
mocked_parse.side_effect = StreamParseException
rule = CollectionRuleFactory()
stream = WebsiteStream(rule.url)
with self.assertRaises(StreamParseException):
stream.read()
self.mocked_fetch.assert_called_once_with(rule.url)
class URLBuilderTestCase(TestCase):
def test_simple(self):
initial_rule = CollectionRuleFactory()
with URLBuilder((simple_feed_mock, MagicMock(rule=initial_rule))) as builder:
rule, url = builder.build()
self.assertEquals(rule.pk, initial_rule.pk)
self.assertEquals(url, "https://www.bbc.co.uk/news/")
def test_no_link(self):
initial_rule = CollectionRuleFactory()
with URLBuilder((feed_mock_without_link, MagicMock(rule=initial_rule))) as builder:
rule, url = builder.build()
self.assertEquals(rule.pk, initial_rule.pk)
self.assertEquals(url, None)
def test_no_data(self):
initial_rule = CollectionRuleFactory()
with URLBuilder((None, MagicMock(rule=initial_rule))) as builder:
rule, url = builder.build()
self.assertEquals(rule.pk, initial_rule.pk)
self.assertEquals(url, None)

View file

@ -0,0 +1 @@
from .tests import *

View file

@ -0,0 +1,57 @@
from unittest.mock import MagicMock, patch
from django.test import TestCase
from newsreader.news.collection.exceptions import (
StreamDeniedException,
StreamForbiddenException,
StreamNotFoundException,
StreamTimeOutException,
)
from newsreader.news.collection.utils import fetch
class FetchTestCase(TestCase):
def setUp(self):
self.patched_get = patch("newsreader.news.collection.utils.requests.get")
self.mocked_get = self.patched_get.start()
def test_simple(self):
self.mocked_get.return_value = MagicMock(status_code=200, content="content")
url = "https://www.bbc.co.uk/news"
response = fetch(url)
self.assertEquals(response.content, "content")
def test_raises_not_found(self):
self.mocked_get.return_value = MagicMock(status_code=404)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamNotFoundException):
fetch(url)
def test_raises_denied(self):
self.mocked_get.return_value = MagicMock(status_code=401)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamDeniedException):
fetch(url)
def test_raises_forbidden(self):
self.mocked_get.return_value = MagicMock(status_code=403)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamForbiddenException):
fetch(url)
def test_raises_timed_out(self):
self.mocked_get.return_value = MagicMock(status_code=408)
url = "https://www.bbc.co.uk/news"
with self.assertRaises(StreamTimeOutException):
fetch(url)

View file

@ -1,9 +1,15 @@
from datetime import datetime, tzinfo
from time import mktime, struct_time
from typing import Tuple
from typing import Optional, Tuple
import requests
from requests.models import Response
from django.utils import timezone
from newsreader.news.collection.response_handler import ResponseHandler
def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple:
try:
@ -12,3 +18,12 @@ def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple:
except TypeError:
return False, None
return True, published_parsed
def fetch(url: str) -> Optional[Response]:
response = requests.get(url)
with ResponseHandler(response) as response_handler:
response_handler.handle_response()
return response