0.2.3 #99

Merged
sonny merged 112 commits from development into master 2020-05-23 16:58:42 +02:00
5 changed files with 92 additions and 97 deletions
Showing only changes of commit c75de1c469 - Show all commits

View file

@ -1,3 +1,5 @@
from typing import ContextManager, Dict, List, Optional, Tuple
import requests import requests
from django.utils import timezone from django.utils import timezone
@ -5,23 +7,58 @@ from django.utils import timezone
from newsreader.news.collection.models import CollectionRule from newsreader.news.collection.models import CollectionRule
class Stream:
    """Base reader for the remote source behind one collection rule."""

    def __init__(self, rule: CollectionRule) -> None:
        self.rule = rule

    def read(self) -> Tuple:
        """Fetch the rule's URL and return ``(parsed payload, self)``."""
        response = requests.get(self.rule.url)
        return (self.parse(response.content), self)

    def parse(self, payload: bytes) -> Dict:
        """Turn the raw response body into a dict; subclasses must override."""
        raise NotImplementedError

    class Meta:
        abstract = True
class Client:
    """Context manager that yields a read stream for every collection rule.

    ``__enter__`` is a generator function, so the value bound by
    ``with Client(...) as client`` is a generator of
    ``(parsed payload, stream)`` tuples, one per rule.
    """

    # Stream subclass instantiated per rule; subclasses override this.
    stream = Stream

    # FIX: the annotation previously read Optional[CollectionRule], but
    # ``rules`` is an iterable of rules (it is looped over below) and
    # Collector.collect passes Optional[List] — align the two signatures.
    def __init__(self, rules: Optional[List] = None) -> None:
        # NOTE(review): an EMPTY list/queryset is falsy and also triggers
        # the "all rules" fallback; pass None to mean "everything".
        self.rules = rules if rules else CollectionRule.objects.all()

    def __enter__(self) -> ContextManager:
        # Lazily read each rule's stream; Stream.read returns
        # (parsed payload, stream).
        for rule in self.rules:
            stream = self.stream(rule)
            yield stream.read()

    def __exit__(self, *args, **kwargs) -> None:
        # Nothing to release; streams hold no persistent resources here.
        pass

    class Meta:
        abstract = True
class Builder: class Builder:
instances = [] instances = []
def __init__(self, stream): def __init__(self, stream: Stream) -> None:
self.stream = stream self.stream = stream
def __enter__(self): def __enter__(self) -> ContextManager:
self.create_posts(self.stream) self.create_posts(self.stream)
return self return self
def __exit__(self, *args, **kwargs): def __exit__(self, *args, **kwargs) -> None:
pass pass
def create_posts(self, stream): def create_posts(self, stream: Tuple) -> None:
pass pass
def save(self): def save(self) -> None:
pass pass
class Meta: class Meta:
@ -32,11 +69,11 @@ class Collector:
client = None client = None
builder = None builder = None
def __init__(self, client=None, builder=None): def __init__(self, client: Optional[Client] = None, builder: Optional[Builder] = None) -> None:
self.client = client if client else self.client self.client = client if client else self.client
self.builder = builder if builder else self.builder self.builder = builder if builder else self.builder
def collect(self, rules=None): def collect(self, rules: Optional[List] = None) -> None:
with self.client(rules=rules) as client: with self.client(rules=rules) as client:
for data, stream in client: for data, stream in client:
with self.builder((data, stream)) as builder: with self.builder((data, stream)) as builder:
@ -44,38 +81,3 @@ class Collector:
class Meta: class Meta:
abstract = True abstract = True
class Stream:
    """Reads and parses the remote feed configured by a collection rule."""

    def __init__(self, rule):
        self.rule = rule

    def read(self):
        """Fetch the rule's URL, returning ``(parsed content, self)``."""
        response = requests.get(self.rule.url)
        return (self.parse(response.content), self)

    def parse(self, payload):
        """Payload parsing is source-specific; subclasses must implement it."""
        raise NotImplementedError

    class Meta:
        abstract = True
class Client:
    """Context manager yielding one read stream per collection rule."""

    # Stream implementation instantiated for each rule; subclasses override.
    stream = Stream

    def __init__(self, rules=None):
        # No (or empty) rules means "collect everything".
        self.rules = rules if rules else CollectionRule.objects.all()

    def __enter__(self):
        # Generator: lazily reads each rule's stream on iteration.
        for rule in self.rules:
            yield self.stream(rule).read()

    def __exit__(self, *args, **kwargs):
        # No persistent resources to release.
        pass

    class Meta:
        abstract = True

View file

@ -1,4 +1,5 @@
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import ContextManager, Dict, Generator, List, Optional, Tuple
import bleach import bleach
import pytz import pytz
@ -16,6 +17,7 @@ from newsreader.news.collection.exceptions import (
StreamParseException, StreamParseException,
StreamTimeOutException, StreamTimeOutException,
) )
from newsreader.news.collection.models import CollectionRule
from newsreader.news.collection.response_handler import ResponseHandler from newsreader.news.collection.response_handler import ResponseHandler
from newsreader.news.collection.utils import build_publication_date from newsreader.news.collection.utils import build_publication_date
from newsreader.news.posts.models import Post from newsreader.news.posts.models import Post
@ -24,17 +26,16 @@ from newsreader.news.posts.models import Post
class FeedBuilder(Builder): class FeedBuilder(Builder):
instances = [] instances = []
def __enter__(self): def __enter__(self) -> ContextManager:
_, stream = self.stream _, stream = self.stream
self.instances = [] self.instances = []
self.existing_posts = { self.existing_posts = {
post.remote_identifier: post post.remote_identifier: post for post in Post.objects.filter(rule=stream.rule)
for post in Post.objects.filter(rule=stream.rule)
} }
return super().__enter__() return super().__enter__()
def create_posts(self, stream): def create_posts(self, stream: Tuple) -> None:
data, stream = stream data, stream = stream
entries = [] entries = []
@ -49,30 +50,25 @@ class FeedBuilder(Builder):
self.instances = [post for post in posts] self.instances = [post for post in posts]
def build(self, entries, rule): def build(self, entries: List, rule: CollectionRule) -> Generator[Post, None, None]:
field_mapping = { field_mapping = {
"id": "remote_identifier", "id": "remote_identifier",
"title": "title", "title": "title",
"summary": "body", "summary": "body",
"link": "url", "link": "url",
"published_parsed": "publication_date", "published_parsed": "publication_date",
"author": "author" "author": "author",
} }
tz = pytz.timezone(rule.timezone) tz = pytz.timezone(rule.timezone)
for entry in entries: for entry in entries:
data = { data = {"rule_id": rule.pk, "category": rule.category}
"rule_id": rule.pk,
"category": rule.category
}
for field, value in field_mapping.items(): for field, value in field_mapping.items():
if field in entry: if field in entry:
if field == "published_parsed": if field == "published_parsed":
created, aware_datetime = build_publication_date( created, aware_datetime = build_publication_date(entry[field], tz)
entry[field], tz
)
data[value] = aware_datetime if created else None data[value] = aware_datetime if created else None
elif field == "summary": elif field == "summary":
summary = self.sanitize_summary(entry[field]) summary = self.sanitize_summary(entry[field])
@ -82,19 +78,19 @@ class FeedBuilder(Builder):
yield Post(**data) yield Post(**data)
def sanitize_summary(self, summary): def sanitize_summary(self, summary: str) -> Optional[str]:
attrs = {"a": ["href", "rel"], "img": ["alt", "src"],} attrs = {"a": ["href", "rel"], "img": ["alt", "src"]}
tags = ["a", "img", "p"] tags = ["a", "img", "p"]
return bleach.clean(summary, tags=tags, attributes=attrs) if summary else None return bleach.clean(summary, tags=tags, attributes=attrs) if summary else None
def save(self): def save(self) -> None:
for post in self.instances: for post in self.instances:
post.save() post.save()
class FeedStream(Stream): class FeedStream(Stream):
def read(self): def read(self) -> Tuple:
url = self.rule.url url = self.rule.url
response = requests.get(url) response = requests.get(url)
@ -103,7 +99,7 @@ class FeedStream(Stream):
return (self.parse(response.content), self) return (self.parse(response.content), self)
def parse(self, payload): def parse(self, payload: bytes) -> Dict:
try: try:
return parse(payload) return parse(payload)
except TypeError as e: except TypeError as e:
@ -113,14 +109,11 @@ class FeedStream(Stream):
class FeedClient(Client): class FeedClient(Client):
stream = FeedStream stream = FeedStream
def __enter__(self): def __enter__(self) -> ContextManager:
streams = [self.stream(rule) for rule in self.rules] streams = [self.stream(rule) for rule in self.rules]
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures = { futures = {executor.submit(stream.read): stream for stream in streams}
executor.submit(stream.read): stream
for stream in streams
}
for future in as_completed(futures): for future in as_completed(futures):
stream = futures[future] stream = futures[future]
@ -148,20 +141,20 @@ class FeedCollector(Collector):
class FeedDuplicateHandler: class FeedDuplicateHandler:
def __init__(self, rule): def __init__(self, rule: CollectionRule) -> None:
self.queryset = rule.post_set.all() self.queryset = rule.post_set.all()
def __enter__(self): def __enter__(self) -> ContextManager:
self.existing_identifiers = self.queryset.filter(remote_identifier__isnull=False).values_list( self.existing_identifiers = self.queryset.filter(
"remote_identifier", flat=True remote_identifier__isnull=False
) ).values_list("remote_identifier", flat=True)
return self return self
def __exit__(self, *args, **kwargs): def __exit__(self, *args, **kwargs) -> None:
pass pass
def check(self, instances): def check(self, instances: List) -> Generator[Post, None, None]:
for instance in instances: for instance in instances:
if instance.remote_identifier in self.existing_identifiers: if instance.remote_identifier in self.existing_identifiers:
existing_post = self.handle_duplicate(instance) existing_post = self.handle_duplicate(instance)
@ -173,31 +166,29 @@ class FeedDuplicateHandler:
yield instance yield instance
def in_database(self, entry): def in_database(self, post: Post) -> Optional[bool]:
values = { values = {
"url": entry.url, "url": post.url,
"title": entry.title, "title": post.title,
"body": entry.body, "body": post.body,
"publication_date": entry.publication_date "publication_date": post.publication_date,
} }
for existing_entry in self.queryset.order_by("-publication_date")[:50]: for existing_post in self.queryset.order_by("-publication_date")[:50]:
if self.is_duplicate(existing_entry, values): if self.is_duplicate(existing_post, values):
return True return True
def is_duplicate(self, existing_entry, values): def is_duplicate(self, existing_post: Post, values: Dict) -> bool:
for key, value in values.items(): for key, value in values.items():
existing_value = getattr(existing_entry, key, object()) existing_value = getattr(existing_post, key, object())
if existing_value != value: if existing_value != value:
return False return False
return True return True
def handle_duplicate(self, instance): def handle_duplicate(self, instance: Post) -> Optional[Post]:
try: try:
existing_instance = self.queryset.get( existing_instance = self.queryset.get(remote_identifier=instance.remote_identifier)
remote_identifier=instance.remote_identifier,
)
except ObjectDoesNotExist: except ObjectDoesNotExist:
return return

View file

@ -1,3 +1,7 @@
from typing import ContextManager
from requests import Response
from newsreader.news.collection.exceptions import ( from newsreader.news.collection.exceptions import (
StreamDeniedException, StreamDeniedException,
StreamForbiddenException, StreamForbiddenException,
@ -14,17 +18,17 @@ class ResponseHandler:
408: StreamTimeOutException, 408: StreamTimeOutException,
} }
def __init__(self, response): def __init__(self, response: Response) -> None:
self.response = response self.response = response
def __enter__(self): def __enter__(self) -> ContextManager:
return self return self
def handle_response(self): def handle_response(self) -> None:
status_code = self.response.status_code status_code = self.response.status_code
if status_code in self.message_mapping: if status_code in self.message_mapping:
raise self.message_mapping[status_code] raise self.message_mapping[status_code]
def __exit__(self, *args, **kwargs): def __exit__(self, *args, **kwargs) -> None:
self.response = None self.response = None

View file

@ -6,7 +6,6 @@ from django.utils import timezone
from newsreader.news.collection.exceptions import ( from newsreader.news.collection.exceptions import (
StreamDeniedException, StreamDeniedException,
StreamException, StreamException,
StreamFieldException,
StreamNotFoundException, StreamNotFoundException,
StreamTimeOutException, StreamTimeOutException,
) )
@ -17,9 +16,7 @@ from newsreader.news.collection.tests.feed.client.mocks import simple_mock
class FeedClientTestCase(TestCase): class FeedClientTestCase(TestCase):
def setUp(self): def setUp(self):
self.patched_read = patch( self.patched_read = patch("newsreader.news.collection.feed.FeedStream.read")
'newsreader.news.collection.feed.FeedStream.read'
)
self.mocked_read = self.patched_read.start() self.mocked_read = self.patched_read.start()
def tearDown(self): def tearDown(self):

View file

@ -1,10 +1,11 @@
from datetime import datetime from datetime import datetime, tzinfo
from time import mktime from time import mktime, struct_time
from typing import Tuple
from django.utils import timezone from django.utils import timezone
def build_publication_date(dt, tz): def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple:
try: try:
naive_datetime = datetime.fromtimestamp(mktime(dt)) naive_datetime = datetime.fromtimestamp(mktime(dt))
published_parsed = timezone.make_aware(naive_datetime, timezone=tz) published_parsed = timezone.make_aware(naive_datetime, timezone=tz)