Update duplicate handling in RedditBuilder
This commit is contained in:
parent
3db9336909
commit
d44bec944c
2 changed files with 14 additions and 12 deletions
|
|
@ -111,6 +111,8 @@ class RedditBuilder(Builder):
|
||||||
self.instances = self.build(posts, stream.rule)
|
self.instances = self.build(posts, stream.rule)
|
||||||
|
|
||||||
def build(self, posts, rule):
|
def build(self, posts, rule):
|
||||||
|
results = {}
|
||||||
|
|
||||||
for post in posts:
|
for post in posts:
|
||||||
if not "data" in post:
|
if not "data" in post:
|
||||||
continue
|
continue
|
||||||
|
|
@ -120,6 +122,9 @@ class RedditBuilder(Builder):
|
||||||
author = truncate_text(Post, "author", post["data"]["author"])
|
author = truncate_text(Post, "author", post["data"]["author"])
|
||||||
url_fragment = f"{post['data']['permalink']}"
|
url_fragment = f"{post['data']['permalink']}"
|
||||||
|
|
||||||
|
if remote_identifier in results:
|
||||||
|
continue
|
||||||
|
|
||||||
uncleaned_body = post["data"]["selftext_html"]
|
uncleaned_body = post["data"]["selftext_html"]
|
||||||
unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
|
unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
|
||||||
body = (
|
body = (
|
||||||
|
|
@ -154,14 +159,15 @@ class RedditBuilder(Builder):
|
||||||
if remote_identifier in self.existing_posts:
|
if remote_identifier in self.existing_posts:
|
||||||
existing_post = self.existing_posts[remote_identifier]
|
existing_post = self.existing_posts[remote_identifier]
|
||||||
|
|
||||||
if created_date > existing_post.publication_date:
|
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
setattr(existing_post, key, value)
|
setattr(existing_post, key, value)
|
||||||
|
|
||||||
yield existing_post
|
results[existing_post.remote_identifier] = existing_post
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield Post(**data)
|
results[remote_identifier] = Post(**data)
|
||||||
|
|
||||||
|
return results.values()
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
for post in self.instances:
|
for post in self.instances:
|
||||||
|
|
|
||||||
|
|
@ -85,10 +85,8 @@ class RedditBuilderTestCase(TestCase):
|
||||||
|
|
||||||
def test_update_posts(self):
|
def test_update_posts(self):
|
||||||
subreddit = SubredditFactory()
|
subreddit = SubredditFactory()
|
||||||
existing_publication_date = pytz.utc.localize(datetime(2020, 7, 8, 14, 0, 0))
|
|
||||||
existing_post = RedditPostFactory(
|
existing_post = RedditPostFactory(
|
||||||
remote_identifier="hngsj8",
|
remote_identifier="hngsj8",
|
||||||
publication_date=existing_publication_date,
|
|
||||||
author="Old author",
|
author="Old author",
|
||||||
title="Old title",
|
title="Old title",
|
||||||
body="Old body",
|
body="Old body",
|
||||||
|
|
@ -198,9 +196,7 @@ class RedditBuilderTestCase(TestCase):
|
||||||
mock_stream = MagicMock(rule=subreddit)
|
mock_stream = MagicMock(rule=subreddit)
|
||||||
|
|
||||||
duplicate_post = RedditPostFactory(
|
duplicate_post = RedditPostFactory(
|
||||||
publication_date=pytz.utc.localize(datetime(2020, 7, 1, 9, 20, 22)),
|
remote_identifier="hm0qct", rule=subreddit, title="foo"
|
||||||
remote_identifier="hm0qct",
|
|
||||||
title="foo",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
with builder((simple_mock, mock_stream)) as builder:
|
with builder((simple_mock, mock_stream)) as builder:
|
||||||
|
|
@ -217,7 +213,7 @@ class RedditBuilderTestCase(TestCase):
|
||||||
|
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
duplicate_post.publication_date,
|
duplicate_post.publication_date,
|
||||||
pytz.utc.localize(datetime(2020, 7, 6, 14, 11, 22)),
|
pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
|
||||||
)
|
)
|
||||||
self.assertEquals(
|
self.assertEquals(
|
||||||
duplicate_post.title,
|
duplicate_post.title,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue