Update duplicate handling in RedditBuilder

This commit is contained in:
sonny 2020-07-13 23:07:13 +02:00
parent 3db9336909
commit d44bec944c
2 changed files with 14 additions and 12 deletions

View file

@@ -111,6 +111,8 @@ class RedditBuilder(Builder):
self.instances = self.build(posts, stream.rule) self.instances = self.build(posts, stream.rule)
def build(self, posts, rule): def build(self, posts, rule):
results = {}
for post in posts: for post in posts:
if not "data" in post: if not "data" in post:
continue continue
@@ -120,6 +122,9 @@ class RedditBuilder(Builder):
author = truncate_text(Post, "author", post["data"]["author"]) author = truncate_text(Post, "author", post["data"]["author"])
url_fragment = f"{post['data']['permalink']}" url_fragment = f"{post['data']['permalink']}"
if remote_identifier in results:
continue
uncleaned_body = post["data"]["selftext_html"] uncleaned_body = post["data"]["selftext_html"]
unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
body = ( body = (
@@ -154,14 +159,15 @@ class RedditBuilder(Builder):
if remote_identifier in self.existing_posts: if remote_identifier in self.existing_posts:
existing_post = self.existing_posts[remote_identifier] existing_post = self.existing_posts[remote_identifier]
if created_date > existing_post.publication_date: for key, value in data.items():
for key, value in data.items(): setattr(existing_post, key, value)
setattr(existing_post, key, value)
yield existing_post results[existing_post.remote_identifier] = existing_post
continue continue
yield Post(**data) results[remote_identifier] = Post(**data)
return results.values()
def save(self): def save(self):
for post in self.instances: for post in self.instances:

View file

@@ -85,10 +85,8 @@ class RedditBuilderTestCase(TestCase):
def test_update_posts(self): def test_update_posts(self):
subreddit = SubredditFactory() subreddit = SubredditFactory()
existing_publication_date = pytz.utc.localize(datetime(2020, 7, 8, 14, 0, 0))
existing_post = RedditPostFactory( existing_post = RedditPostFactory(
remote_identifier="hngsj8", remote_identifier="hngsj8",
publication_date=existing_publication_date,
author="Old author", author="Old author",
title="Old title", title="Old title",
body="Old body", body="Old body",
@@ -198,9 +196,7 @@ class RedditBuilderTestCase(TestCase):
mock_stream = MagicMock(rule=subreddit) mock_stream = MagicMock(rule=subreddit)
duplicate_post = RedditPostFactory( duplicate_post = RedditPostFactory(
publication_date=pytz.utc.localize(datetime(2020, 7, 1, 9, 20, 22)), remote_identifier="hm0qct", rule=subreddit, title="foo"
remote_identifier="hm0qct",
title="foo",
) )
with builder((simple_mock, mock_stream)) as builder: with builder((simple_mock, mock_stream)) as builder:
@@ -217,7 +213,7 @@ class RedditBuilderTestCase(TestCase):
self.assertEquals( self.assertEquals(
duplicate_post.publication_date, duplicate_post.publication_date,
pytz.utc.localize(datetime(2020, 7, 6, 14, 11, 22)), pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
) )
self.assertEquals( self.assertEquals(
duplicate_post.title, duplicate_post.title,