Add type hinting
This commit is contained in:
parent
69e4e7b269
commit
c75de1c469
5 changed files with 92 additions and 97 deletions
|
|
@ -1,3 +1,5 @@
|
||||||
|
from typing import ContextManager, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
@ -5,23 +7,58 @@ from django.utils import timezone
|
||||||
from newsreader.news.collection.models import CollectionRule
|
from newsreader.news.collection.models import CollectionRule
|
||||||
|
|
||||||
|
|
||||||
|
class Stream:
|
||||||
|
def __init__(self, rule: CollectionRule) -> None:
|
||||||
|
self.rule = rule
|
||||||
|
|
||||||
|
def read(self) -> Tuple:
|
||||||
|
url = self.rule.url
|
||||||
|
response = requests.get(url)
|
||||||
|
return (self.parse(response.content), self)
|
||||||
|
|
||||||
|
def parse(self, payload: bytes) -> Dict:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
class Meta:
|
||||||
|
abstract = True
|
||||||
|
|
||||||
|
|
||||||
|
class Client:
|
||||||
|
stream = Stream
|
||||||
|
|
||||||
|
def __init__(self, rules: Optional[CollectionRule] = None) -> None:
|
||||||
|
self.rules = rules if rules else CollectionRule.objects.all()
|
||||||
|
|
||||||
|
def __enter__(self) -> ContextManager:
|
||||||
|
for rule in self.rules:
|
||||||
|
stream = self.stream(rule)
|
||||||
|
|
||||||
|
yield stream.read()
|
||||||
|
|
||||||
|
def __exit__(self, *args, **kwargs) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Meta:
|
||||||
|
abstract = True
|
||||||
|
|
||||||
|
|
||||||
class Builder:
|
class Builder:
|
||||||
instances = []
|
instances = []
|
||||||
|
|
||||||
def __init__(self, stream):
|
def __init__(self, stream: Stream) -> None:
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self) -> ContextManager:
|
||||||
self.create_posts(self.stream)
|
self.create_posts(self.stream)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, *args, **kwargs):
|
def __exit__(self, *args, **kwargs) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def create_posts(self, stream):
|
def create_posts(self, stream: Tuple) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def save(self):
|
def save(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
|
|
@ -32,11 +69,11 @@ class Collector:
|
||||||
client = None
|
client = None
|
||||||
builder = None
|
builder = None
|
||||||
|
|
||||||
def __init__(self, client=None, builder=None):
|
def __init__(self, client: Optional[Client] = None, builder: Optional[Builder] = None) -> None:
|
||||||
self.client = client if client else self.client
|
self.client = client if client else self.client
|
||||||
self.builder = builder if builder else self.builder
|
self.builder = builder if builder else self.builder
|
||||||
|
|
||||||
def collect(self, rules=None):
|
def collect(self, rules: Optional[List] = None) -> None:
|
||||||
with self.client(rules=rules) as client:
|
with self.client(rules=rules) as client:
|
||||||
for data, stream in client:
|
for data, stream in client:
|
||||||
with self.builder((data, stream)) as builder:
|
with self.builder((data, stream)) as builder:
|
||||||
|
|
@ -44,38 +81,3 @@ class Collector:
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
abstract = True
|
abstract = True
|
||||||
|
|
||||||
|
|
||||||
class Stream:
|
|
||||||
def __init__(self, rule):
|
|
||||||
self.rule = rule
|
|
||||||
|
|
||||||
def read(self):
|
|
||||||
url = self.rule.url
|
|
||||||
response = requests.get(url)
|
|
||||||
return (self.parse(response.content), self)
|
|
||||||
|
|
||||||
def parse(self, payload):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
abstract = True
|
|
||||||
|
|
||||||
|
|
||||||
class Client:
|
|
||||||
stream = Stream
|
|
||||||
|
|
||||||
def __init__(self, rules=None):
|
|
||||||
self.rules = rules if rules else CollectionRule.objects.all()
|
|
||||||
|
|
||||||
def __enter__(self):
|
|
||||||
for rule in self.rules:
|
|
||||||
stream = self.stream(rule)
|
|
||||||
|
|
||||||
yield stream.read()
|
|
||||||
|
|
||||||
def __exit__(self, *args, **kwargs):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
abstract = True
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from typing import ContextManager, Dict, Generator, List, Optional, Tuple
|
||||||
|
|
||||||
import bleach
|
import bleach
|
||||||
import pytz
|
import pytz
|
||||||
|
|
@ -16,6 +17,7 @@ from newsreader.news.collection.exceptions import (
|
||||||
StreamParseException,
|
StreamParseException,
|
||||||
StreamTimeOutException,
|
StreamTimeOutException,
|
||||||
)
|
)
|
||||||
|
from newsreader.news.collection.models import CollectionRule
|
||||||
from newsreader.news.collection.response_handler import ResponseHandler
|
from newsreader.news.collection.response_handler import ResponseHandler
|
||||||
from newsreader.news.collection.utils import build_publication_date
|
from newsreader.news.collection.utils import build_publication_date
|
||||||
from newsreader.news.posts.models import Post
|
from newsreader.news.posts.models import Post
|
||||||
|
|
@ -24,17 +26,16 @@ from newsreader.news.posts.models import Post
|
||||||
class FeedBuilder(Builder):
|
class FeedBuilder(Builder):
|
||||||
instances = []
|
instances = []
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self) -> ContextManager:
|
||||||
_, stream = self.stream
|
_, stream = self.stream
|
||||||
self.instances = []
|
self.instances = []
|
||||||
self.existing_posts = {
|
self.existing_posts = {
|
||||||
post.remote_identifier: post
|
post.remote_identifier: post for post in Post.objects.filter(rule=stream.rule)
|
||||||
for post in Post.objects.filter(rule=stream.rule)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return super().__enter__()
|
return super().__enter__()
|
||||||
|
|
||||||
def create_posts(self, stream):
|
def create_posts(self, stream: Tuple) -> None:
|
||||||
data, stream = stream
|
data, stream = stream
|
||||||
entries = []
|
entries = []
|
||||||
|
|
||||||
|
|
@ -49,30 +50,25 @@ class FeedBuilder(Builder):
|
||||||
|
|
||||||
self.instances = [post for post in posts]
|
self.instances = [post for post in posts]
|
||||||
|
|
||||||
def build(self, entries, rule):
|
def build(self, entries: List, rule: CollectionRule) -> Generator[Post, None, None]:
|
||||||
field_mapping = {
|
field_mapping = {
|
||||||
"id": "remote_identifier",
|
"id": "remote_identifier",
|
||||||
"title": "title",
|
"title": "title",
|
||||||
"summary": "body",
|
"summary": "body",
|
||||||
"link": "url",
|
"link": "url",
|
||||||
"published_parsed": "publication_date",
|
"published_parsed": "publication_date",
|
||||||
"author": "author"
|
"author": "author",
|
||||||
}
|
}
|
||||||
|
|
||||||
tz = pytz.timezone(rule.timezone)
|
tz = pytz.timezone(rule.timezone)
|
||||||
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
data = {
|
data = {"rule_id": rule.pk, "category": rule.category}
|
||||||
"rule_id": rule.pk,
|
|
||||||
"category": rule.category
|
|
||||||
}
|
|
||||||
|
|
||||||
for field, value in field_mapping.items():
|
for field, value in field_mapping.items():
|
||||||
if field in entry:
|
if field in entry:
|
||||||
if field == "published_parsed":
|
if field == "published_parsed":
|
||||||
created, aware_datetime = build_publication_date(
|
created, aware_datetime = build_publication_date(entry[field], tz)
|
||||||
entry[field], tz
|
|
||||||
)
|
|
||||||
data[value] = aware_datetime if created else None
|
data[value] = aware_datetime if created else None
|
||||||
elif field == "summary":
|
elif field == "summary":
|
||||||
summary = self.sanitize_summary(entry[field])
|
summary = self.sanitize_summary(entry[field])
|
||||||
|
|
@ -82,19 +78,19 @@ class FeedBuilder(Builder):
|
||||||
|
|
||||||
yield Post(**data)
|
yield Post(**data)
|
||||||
|
|
||||||
def sanitize_summary(self, summary):
|
def sanitize_summary(self, summary: str) -> Optional[str]:
|
||||||
attrs = {"a": ["href", "rel"], "img": ["alt", "src"],}
|
attrs = {"a": ["href", "rel"], "img": ["alt", "src"]}
|
||||||
tags = ["a", "img", "p"]
|
tags = ["a", "img", "p"]
|
||||||
|
|
||||||
return bleach.clean(summary, tags=tags, attributes=attrs) if summary else None
|
return bleach.clean(summary, tags=tags, attributes=attrs) if summary else None
|
||||||
|
|
||||||
def save(self):
|
def save(self) -> None:
|
||||||
for post in self.instances:
|
for post in self.instances:
|
||||||
post.save()
|
post.save()
|
||||||
|
|
||||||
|
|
||||||
class FeedStream(Stream):
|
class FeedStream(Stream):
|
||||||
def read(self):
|
def read(self) -> Tuple:
|
||||||
url = self.rule.url
|
url = self.rule.url
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
|
|
||||||
|
|
@ -103,7 +99,7 @@ class FeedStream(Stream):
|
||||||
|
|
||||||
return (self.parse(response.content), self)
|
return (self.parse(response.content), self)
|
||||||
|
|
||||||
def parse(self, payload):
|
def parse(self, payload: bytes) -> Dict:
|
||||||
try:
|
try:
|
||||||
return parse(payload)
|
return parse(payload)
|
||||||
except TypeError as e:
|
except TypeError as e:
|
||||||
|
|
@ -113,14 +109,11 @@ class FeedStream(Stream):
|
||||||
class FeedClient(Client):
|
class FeedClient(Client):
|
||||||
stream = FeedStream
|
stream = FeedStream
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self) -> ContextManager:
|
||||||
streams = [self.stream(rule) for rule in self.rules]
|
streams = [self.stream(rule) for rule in self.rules]
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||||
futures = {
|
futures = {executor.submit(stream.read): stream for stream in streams}
|
||||||
executor.submit(stream.read): stream
|
|
||||||
for stream in streams
|
|
||||||
}
|
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
stream = futures[future]
|
stream = futures[future]
|
||||||
|
|
@ -148,20 +141,20 @@ class FeedCollector(Collector):
|
||||||
|
|
||||||
|
|
||||||
class FeedDuplicateHandler:
|
class FeedDuplicateHandler:
|
||||||
def __init__(self, rule):
|
def __init__(self, rule: CollectionRule) -> None:
|
||||||
self.queryset = rule.post_set.all()
|
self.queryset = rule.post_set.all()
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self) -> ContextManager:
|
||||||
self.existing_identifiers = self.queryset.filter(remote_identifier__isnull=False).values_list(
|
self.existing_identifiers = self.queryset.filter(
|
||||||
"remote_identifier", flat=True
|
remote_identifier__isnull=False
|
||||||
)
|
).values_list("remote_identifier", flat=True)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, *args, **kwargs):
|
def __exit__(self, *args, **kwargs) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def check(self, instances):
|
def check(self, instances: List) -> Generator[Post, None, None]:
|
||||||
for instance in instances:
|
for instance in instances:
|
||||||
if instance.remote_identifier in self.existing_identifiers:
|
if instance.remote_identifier in self.existing_identifiers:
|
||||||
existing_post = self.handle_duplicate(instance)
|
existing_post = self.handle_duplicate(instance)
|
||||||
|
|
||||||
|
|
@ -173,31 +166,29 @@ class FeedDuplicateHandler:
|
||||||
|
|
||||||
yield instance
|
yield instance
|
||||||
|
|
||||||
def in_database(self, entry):
|
def in_database(self, post: Post) -> Optional[bool]:
|
||||||
values = {
|
values = {
|
||||||
"url": entry.url,
|
"url": post.url,
|
||||||
"title": entry.title,
|
"title": post.title,
|
||||||
"body": entry.body,
|
"body": post.body,
|
||||||
"publication_date": entry.publication_date
|
"publication_date": post.publication_date,
|
||||||
}
|
}
|
||||||
|
|
||||||
for existing_entry in self.queryset.order_by("-publication_date")[:50]:
|
for existing_post in self.queryset.order_by("-publication_date")[:50]:
|
||||||
if self.is_duplicate(existing_entry, values):
|
if self.is_duplicate(existing_post, values):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def is_duplicate(self, existing_entry, values):
|
def is_duplicate(self, existing_post: Post, values: Dict) -> bool:
|
||||||
for key, value in values.items():
|
for key, value in values.items():
|
||||||
existing_value = getattr(existing_entry, key, object())
|
existing_value = getattr(existing_post, key, object())
|
||||||
if existing_value != value:
|
if existing_value != value:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def handle_duplicate(self, instance):
|
def handle_duplicate(self, instance: Post) -> Optional[Post]:
|
||||||
try:
|
try:
|
||||||
existing_instance = self.queryset.get(
|
existing_instance = self.queryset.get(remote_identifier=instance.remote_identifier)
|
||||||
remote_identifier=instance.remote_identifier,
|
|
||||||
)
|
|
||||||
except ObjectDoesNotExist:
|
except ObjectDoesNotExist:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,7 @@
|
||||||
|
from typing import ContextManager
|
||||||
|
|
||||||
|
from requests import Response
|
||||||
|
|
||||||
from newsreader.news.collection.exceptions import (
|
from newsreader.news.collection.exceptions import (
|
||||||
StreamDeniedException,
|
StreamDeniedException,
|
||||||
StreamForbiddenException,
|
StreamForbiddenException,
|
||||||
|
|
@ -14,17 +18,17 @@ class ResponseHandler:
|
||||||
408: StreamTimeOutException,
|
408: StreamTimeOutException,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, response):
|
def __init__(self, response: Response) -> None:
|
||||||
self.response = response
|
self.response = response
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self) -> ContextManager:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def handle_response(self):
|
def handle_response(self) -> None:
|
||||||
status_code = self.response.status_code
|
status_code = self.response.status_code
|
||||||
|
|
||||||
if status_code in self.message_mapping:
|
if status_code in self.message_mapping:
|
||||||
raise self.message_mapping[status_code]
|
raise self.message_mapping[status_code]
|
||||||
|
|
||||||
def __exit__(self, *args, **kwargs):
|
def __exit__(self, *args, **kwargs) -> None:
|
||||||
self.response = None
|
self.response = None
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@ from django.utils import timezone
|
||||||
from newsreader.news.collection.exceptions import (
|
from newsreader.news.collection.exceptions import (
|
||||||
StreamDeniedException,
|
StreamDeniedException,
|
||||||
StreamException,
|
StreamException,
|
||||||
StreamFieldException,
|
|
||||||
StreamNotFoundException,
|
StreamNotFoundException,
|
||||||
StreamTimeOutException,
|
StreamTimeOutException,
|
||||||
)
|
)
|
||||||
|
|
@ -17,9 +16,7 @@ from newsreader.news.collection.tests.feed.client.mocks import simple_mock
|
||||||
|
|
||||||
class FeedClientTestCase(TestCase):
|
class FeedClientTestCase(TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.patched_read = patch(
|
self.patched_read = patch("newsreader.news.collection.feed.FeedStream.read")
|
||||||
'newsreader.news.collection.feed.FeedStream.read'
|
|
||||||
)
|
|
||||||
self.mocked_read = self.patched_read.start()
|
self.mocked_read = self.patched_read.start()
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
from datetime import datetime
|
from datetime import datetime, tzinfo
|
||||||
from time import mktime
|
from time import mktime, struct_time
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
|
|
||||||
def build_publication_date(dt, tz):
|
def build_publication_date(dt: struct_time, tz: tzinfo) -> Tuple:
|
||||||
try:
|
try:
|
||||||
naive_datetime = datetime.fromtimestamp(mktime(dt))
|
naive_datetime = datetime.fromtimestamp(mktime(dt))
|
||||||
published_parsed = timezone.make_aware(naive_datetime, timezone=tz)
|
published_parsed = timezone.make_aware(naive_datetime, timezone=tz)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue