Compare commits

...
Sign in to create a new pull request.

3 commits

Author SHA1 Message Date
d44bec944c Update duplicate handling in RedditBuilder 2020-07-13 23:07:13 +02:00
3db9336909 Update factory 2020-07-13 22:23:58 +02:00
2ac1299a49 Add duplicate tests 2020-07-13 22:21:05 +02:00
4 changed files with 404 additions and 19 deletions

View file

@ -111,6 +111,8 @@ class RedditBuilder(Builder):
self.instances = self.build(posts, stream.rule) self.instances = self.build(posts, stream.rule)
def build(self, posts, rule): def build(self, posts, rule):
results = {}
for post in posts: for post in posts:
if not "data" in post: if not "data" in post:
continue continue
@ -120,6 +122,9 @@ class RedditBuilder(Builder):
author = truncate_text(Post, "author", post["data"]["author"]) author = truncate_text(Post, "author", post["data"]["author"])
url_fragment = f"{post['data']['permalink']}" url_fragment = f"{post['data']['permalink']}"
if remote_identifier in results:
continue
uncleaned_body = post["data"]["selftext_html"] uncleaned_body = post["data"]["selftext_html"]
unescaped_body = unescape(uncleaned_body) if uncleaned_body else "" unescaped_body = unescape(uncleaned_body) if uncleaned_body else ""
body = ( body = (
@ -154,14 +159,15 @@ class RedditBuilder(Builder):
if remote_identifier in self.existing_posts: if remote_identifier in self.existing_posts:
existing_post = self.existing_posts[remote_identifier] existing_post = self.existing_posts[remote_identifier]
if created_date > existing_post.publication_date:
for key, value in data.items(): for key, value in data.items():
setattr(existing_post, key, value) setattr(existing_post, key, value)
yield existing_post results[existing_post.remote_identifier] = existing_post
continue continue
yield Post(**data) results[remote_identifier] = Post(**data)
return results.values()
def save(self): def save(self):
for post in self.instances: for post in self.instances:

View file

@ -1376,3 +1376,336 @@ title_mock = {
"before": None, "before": None,
}, },
} }
duplicate_mock = {
"kind": "Listing",
"data": {
"modhash": "rjewztai5w0ab64547311ae1fb1f9cf81cd18949bfb629cb7f",
"dist": 27,
"children": [
{
"kind": "t3",
"data": {
"approved_at_utc": None,
"subreddit": "linux",
"selftext": "Welcome to r/linux rants and experiences! This megathread is also to hear opinions from anyone just starting out with Linux or those that have used Linux (GNU or otherwise) for a long time.\n\nLet us know what's annoying you, whats making you happy, or something that you want to get out to r/linux but didn't make the cut into a full post of it's own.\n\nFor those looking for certifications please use this megathread to ask about how to get certified whether it's for the business world or for your own satisfaction. Be sure to check out r/linuxadmin for more discussion in the SysAdmin world!\n\n_Please keep questions in r/linuxquestions, r/linux4noobs, or the Wednesday automod thread._",
"author_fullname": "t2_6l4z3",
"saved": False,
"mod_reason_title": None,
"gilded": 0,
"clicked": False,
"title": "Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
"link_flair_richtext": [],
"subreddit_name_prefixed": "r/linux",
"hidden": False,
"pwls": 6,
"link_flair_css_class": None,
"downs": 0,
"top_awarded_type": None,
"hide_score": False,
"name": "t3_hm0qct",
"quarantine": False,
"link_flair_text_color": "dark",
"upvote_ratio": 0.7,
"author_flair_background_color": None,
"subreddit_type": "public",
"ups": 8,
"total_awards_received": 0,
"media_embed": {},
"author_flair_template_id": None,
"is_original_content": False,
"user_reports": [],
"secure_media": None,
"is_reddit_media_domain": False,
"is_meta": False,
"category": None,
"secure_media_embed": {},
"link_flair_text": None,
"can_mod_post": False,
"score": 8,
"approved_by": None,
"author_premium": True,
"thumbnail": "",
"edited": False,
"author_flair_css_class": None,
"author_flair_richtext": [],
"gildings": {},
"content_categories": None,
"is_self": True,
"mod_note": None,
"created": 1594037482.0,
"link_flair_type": "text",
"wls": 6,
"removed_by_category": None,
"banned_by": None,
"author_flair_type": "text",
"domain": "self.linux",
"allow_live_comments": False,
"selftext_html": "<!-- SC_OFF --><div class='md'><p>Welcome to <a href='/r/linux'>r/linux</a> rants and experiences! This megathread is also to hear opinions from anyone just starting out with Linux or those that have used Linux (GNU or otherwise) for a long time.</p>\n\n<p>Let us know what's annoying you, whats making you happy, or something that you want to get out to <a href='/r/linux'>r/linux</a> but didn't make the cut into a full post of it's own.</p>\n\n<p>For those looking for certifications please use this megathread to ask about how to get certified whether it's for the business world or for your own satisfaction. Be sure to check out <a href='/r/linuxadmin'>r/linuxadmin</a> for more discussion in the SysAdmin world!</p>\n\n<p><em>Please keep questions in <a href='/r/linuxquestions'>r/linuxquestions</a>, <a href='/r/linux4noobs'>r/linux4noobs</a>, or the Wednesday automod thread.</em></p>\n</div><!-- SC_ON -->",
"likes": None,
"suggested_sort": None,
"banned_at_utc": None,
"view_count": None,
"archived": False,
"no_follow": True,
"is_crosspostable": True,
"pinned": False,
"over_18": False,
"all_awardings": [],
"awarders": [],
"media_only": False,
"can_gild": True,
"spoiler": False,
"locked": False,
"author_flair_text": None,
"treatment_tags": [],
"visited": False,
"removed_by": None,
"num_reports": None,
"distinguished": "moderator",
"subreddit_id": "t5_2qh1a",
"mod_reason_by": None,
"removal_reason": None,
"link_flair_background_color": "",
"id": "hm0qct",
"is_robot_indexable": True,
"report_reasons": None,
"author": "AutoModerator",
"discussion_type": None,
"num_comments": 9,
"send_replies": False,
"whitelist_status": "all_ads",
"contest_mode": False,
"mod_reports": [],
"author_patreon_flair": False,
"author_flair_text_color": None,
"permalink": "/r/linux/comments/hm0qct/linux_experiencesrants_or_educationcertifications/",
"parent_whitelist_status": "all_ads",
"stickied": True,
"url": "https://www.reddit.com/r/linux/comments/hm0qct/linux_experiencesrants_or_educationcertifications/",
"subreddit_subscribers": 544037,
"created_utc": 1594008682.0,
"num_crossposts": 0,
"media": None,
"is_video": False,
},
},
{
"kind": "t3",
"data": {
"approved_at_utc": None,
"subreddit": "linux",
"selftext": "Welcome to r/linux! If you're new to Linux or trying to get started this thread is for you. Get help here or as always, check out r/linuxquestions or r/linux4noobs\n\nThis megathread is for all your question needs. As we don't allow questions on r/linux outside of this megathread, please consider using r/linuxquestions or r/linux4noobs for the best solution to your problem.\n\nAsk your hardware requests here too or try r/linuxhardware!",
"author_fullname": "t2_6l4z3",
"saved": False,
"mod_reason_title": None,
"gilded": 0,
"clicked": False,
"title": "Weekly Questions and Hardware Thread - July 08, 2020",
"link_flair_richtext": [],
"subreddit_name_prefixed": "r/linux",
"hidden": False,
"pwls": 6,
"link_flair_css_class": None,
"downs": 0,
"top_awarded_type": None,
"hide_score": False,
"name": "t3_hna75r",
"quarantine": False,
"link_flair_text_color": "dark",
"upvote_ratio": 0.6,
"author_flair_background_color": None,
"subreddit_type": "public",
"ups": 2,
"total_awards_received": 0,
"media_embed": {},
"author_flair_template_id": None,
"is_original_content": False,
"user_reports": [],
"secure_media": None,
"is_reddit_media_domain": False,
"is_meta": False,
"category": None,
"secure_media_embed": {},
"link_flair_text": None,
"can_mod_post": False,
"score": 2,
"approved_by": None,
"author_premium": True,
"thumbnail": "",
"edited": False,
"author_flair_css_class": None,
"author_flair_richtext": [],
"gildings": {},
"content_categories": None,
"is_self": True,
"mod_note": None,
"created": 1594210138.0,
"link_flair_type": "text",
"wls": 6,
"removed_by_category": None,
"banned_by": None,
"author_flair_type": "text",
"domain": "self.linux",
"allow_live_comments": False,
"selftext_html": '<!-- SC_OFF --><div class="md"><p>Welcome to <a href="/r/linux">r/linux</a>! If you're new to Linux or trying to get started this thread is for you. Get help here or as always, check out <a href="/r/linuxquestions">r/linuxquestions</a> or <a href="/r/linux4noobs">r/linux4noobs</a></p>\n\n<p>This megathread is for all your question needs. As we don't allow questions on <a href="/r/linux">r/linux</a> outside of this megathread, please consider using <a href="/r/linuxquestions">r/linuxquestions</a> or <a href="/r/linux4noobs">r/linux4noobs</a> for the best solution to your problem.</p>\n\n<p>Ask your hardware requests here too or try <a href="/r/linuxhardware">r/linuxhardware</a>!</p>\n</div><!-- SC_ON -->',
"likes": None,
"suggested_sort": "new",
"banned_at_utc": None,
"view_count": None,
"archived": False,
"no_follow": True,
"is_crosspostable": True,
"pinned": False,
"over_18": False,
"all_awardings": [],
"awarders": [],
"media_only": False,
"can_gild": True,
"spoiler": False,
"locked": False,
"author_flair_text": None,
"treatment_tags": [],
"visited": False,
"removed_by": None,
"num_reports": None,
"distinguished": "moderator",
"subreddit_id": "t5_2qh1a",
"mod_reason_by": None,
"removal_reason": None,
"link_flair_background_color": "",
"id": "hna75r",
"is_robot_indexable": True,
"report_reasons": None,
"author": "AutoModerator",
"discussion_type": None,
"num_comments": 2,
"send_replies": False,
"whitelist_status": "all_ads",
"contest_mode": False,
"mod_reports": [],
"author_patreon_flair": False,
"author_flair_text_color": None,
"permalink": "/r/linux/comments/hna75r/weekly_questions_and_hardware_thread_july_08_2020/",
"parent_whitelist_status": "all_ads",
"stickied": True,
"url": "https://www.reddit.com/r/linux/comments/hna75r/weekly_questions_and_hardware_thread_july_08_2020/",
"subreddit_subscribers": 544037,
"created_utc": 1594181338.0,
"num_crossposts": 0,
"media": None,
"is_video": False,
},
},
{
"kind": "t3",
"data": {
"approved_at_utc": None,
"subreddit": "linux",
"selftext": "Welcome to r/linux rants and experiences! This megathread is also to hear opinions from anyone just starting out with Linux or those that have used Linux (GNU or otherwise) for a long time.\n\nLet us know what's annoying you, whats making you happy, or something that you want to get out to r/linux but didn't make the cut into a full post of it's own.\n\nFor those looking for certifications please use this megathread to ask about how to get certified whether it's for the business world or for your own satisfaction. Be sure to check out r/linuxadmin for more discussion in the SysAdmin world!\n\n_Please keep questions in r/linuxquestions, r/linux4noobs, or the Wednesday automod thread._",
"author_fullname": "t2_6l4z3",
"saved": False,
"mod_reason_title": None,
"gilded": 0,
"clicked": False,
"title": "Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
"link_flair_richtext": [],
"subreddit_name_prefixed": "r/linux",
"hidden": False,
"pwls": 6,
"link_flair_css_class": None,
"downs": 0,
"top_awarded_type": None,
"hide_score": False,
"name": "t3_hm0qct",
"quarantine": False,
"link_flair_text_color": "dark",
"upvote_ratio": 0.7,
"author_flair_background_color": None,
"subreddit_type": "public",
"ups": 8,
"total_awards_received": 0,
"media_embed": {},
"author_flair_template_id": None,
"is_original_content": False,
"user_reports": [],
"secure_media": None,
"is_reddit_media_domain": False,
"is_meta": False,
"category": None,
"secure_media_embed": {},
"link_flair_text": None,
"can_mod_post": False,
"score": 8,
"approved_by": None,
"author_premium": True,
"thumbnail": "",
"edited": False,
"author_flair_css_class": None,
"author_flair_richtext": [],
"gildings": {},
"content_categories": None,
"is_self": True,
"mod_note": None,
"created": 1594037482.0,
"link_flair_type": "text",
"wls": 6,
"removed_by_category": None,
"banned_by": None,
"author_flair_type": "text",
"domain": "self.linux",
"allow_live_comments": False,
"selftext_html": "<!-- SC_OFF --><div class='md'><p>Welcome to <a href='/r/linux'>r/linux</a> rants and experiences! This megathread is also to hear opinions from anyone just starting out with Linux or those that have used Linux (GNU or otherwise) for a long time.</p>\n\n<p>Let us know what's annoying you, whats making you happy, or something that you want to get out to <a href='/r/linux'>r/linux</a> but didn't make the cut into a full post of it's own.</p>\n\n<p>For those looking for certifications please use this megathread to ask about how to get certified whether it's for the business world or for your own satisfaction. Be sure to check out <a href='/r/linuxadmin'>r/linuxadmin</a> for more discussion in the SysAdmin world!</p>\n\n<p><em>Please keep questions in <a href='/r/linuxquestions'>r/linuxquestions</a>, <a href='/r/linux4noobs'>r/linux4noobs</a>, or the Wednesday automod thread.</em></p>\n</div><!-- SC_ON -->",
"likes": None,
"suggested_sort": None,
"banned_at_utc": None,
"view_count": None,
"archived": False,
"no_follow": True,
"is_crosspostable": True,
"pinned": False,
"over_18": False,
"all_awardings": [],
"awarders": [],
"media_only": False,
"can_gild": True,
"spoiler": False,
"locked": False,
"author_flair_text": None,
"treatment_tags": [],
"visited": False,
"removed_by": None,
"num_reports": None,
"distinguished": "moderator",
"subreddit_id": "t5_2qh1a",
"mod_reason_by": None,
"removal_reason": None,
"link_flair_background_color": "",
"id": "hm0qct",
"is_robot_indexable": True,
"report_reasons": None,
"author": "AutoModerator",
"discussion_type": None,
"num_comments": 9,
"send_replies": False,
"whitelist_status": "all_ads",
"contest_mode": False,
"mod_reports": [],
"author_patreon_flair": False,
"author_flair_text_color": None,
"permalink": "/r/linux/comments/hm0qct/linux_experiencesrants_or_educationcertifications/",
"parent_whitelist_status": "all_ads",
"stickied": True,
"url": "https://www.reddit.com/r/linux/comments/hm0qct/linux_experiencesrants_or_educationcertifications/",
"subreddit_subscribers": 544037,
"created_utc": 1594008682.0,
"num_crossposts": 0,
"media": None,
"is_video": False,
},
},
],
"after": "t3_hmytic",
"before": None,
},
}

View file

@ -7,16 +7,9 @@ import pytz
from newsreader.news.collection.reddit import RedditBuilder from newsreader.news.collection.reddit import RedditBuilder
from newsreader.news.collection.tests.factories import SubredditFactory from newsreader.news.collection.tests.factories import SubredditFactory
from newsreader.news.collection.tests.reddit.builder.mocks import ( from newsreader.news.collection.tests.reddit.builder.mocks import *
author_mock,
empty_mock,
simple_mock,
title_mock,
unknown_mock,
unsanitized_mock,
)
from newsreader.news.core.models import Post from newsreader.news.core.models import Post
from newsreader.news.core.tests.factories import PostFactory from newsreader.news.core.tests.factories import RedditPostFactory
class RedditBuilderTestCase(TestCase): class RedditBuilderTestCase(TestCase):
@ -92,10 +85,8 @@ class RedditBuilderTestCase(TestCase):
def test_update_posts(self): def test_update_posts(self):
subreddit = SubredditFactory() subreddit = SubredditFactory()
existing_publication_date = pytz.utc.localize(datetime(2020, 7, 8, 14, 0, 0)) existing_post = RedditPostFactory(
existing_post = PostFactory(
remote_identifier="hngsj8", remote_identifier="hngsj8",
publication_date=existing_publication_date,
author="Old author", author="Old author",
title="Old title", title="Old title",
body="Old body", body="Old body",
@ -183,3 +174,48 @@ class RedditBuilderTestCase(TestCase):
post.title, post.title,
'Board statement on the LibreOffice 7.0 RC "Personal EditionBoard statement on the LibreOffice 7.0 RC "Personal Edition" label" labelBoard statement on the LibreOffice 7.0 RC "PersBoard statement on t…', 'Board statement on the LibreOffice 7.0 RC "Personal EditionBoard statement on the LibreOffice 7.0 RC "Personal Edition" label" labelBoard statement on the LibreOffice 7.0 RC "PersBoard statement on t…',
) )
def test_duplicate_in_response(self):
    """A post that appears twice in a single response is saved only once."""
    subreddit = SubredditFactory()
    mock_stream = MagicMock(rule=subreddit)

    # Use a distinct name for the instance instead of rebinding the class.
    with RedditBuilder((duplicate_mock, mock_stream)) as builder:
        builder.save()

    posts = {post.remote_identifier: post for post in Post.objects.all()}

    # duplicate_mock lists "hm0qct" twice and "hna75r" once.
    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(Post.objects.count(), 2)
    self.assertCountEqual(("hm0qct", "hna75r"), posts.keys())
def test_duplicate_in_database(self):
    """A stored post with a matching remote identifier is updated, not re-created."""
    subreddit = SubredditFactory()
    mock_stream = MagicMock(rule=subreddit)

    # Pre-existing row sharing its remote identifier with a post in the response.
    duplicate_post = RedditPostFactory(
        remote_identifier="hm0qct", rule=subreddit, title="foo"
    )

    # Use a distinct name for the instance instead of rebinding the class.
    with RedditBuilder((simple_mock, mock_stream)) as builder:
        builder.save()

    posts = {post.remote_identifier: post for post in Post.objects.all()}

    # The existing row must be reused, so the total equals the number of
    # distinct identifiers expected from simple_mock.
    # assertEqual replaces the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(Post.objects.count(), 5)
    self.assertCountEqual(
        ("hm0qct", "hna75r", "hngs71", "hngsj8", "hnd7cy"), posts.keys()
    )

    duplicate_post.refresh_from_db()

    # Fields of the stored post are overwritten with the values from the response.
    self.assertEqual(
        duplicate_post.publication_date,
        pytz.utc.localize(datetime(2020, 7, 6, 6, 11, 22)),
    )
    self.assertEqual(
        duplicate_post.title,
        "Linux Experiences/Rants or Education/Certifications thread - July 06, 2020",
    )

View file

@ -1,7 +1,9 @@
import factory import factory
import factory.fuzzy
import pytz import pytz
from newsreader.accounts.tests.factories import UserFactory from newsreader.accounts.tests.factories import UserFactory
from newsreader.news.collection.reddit import REDDIT_URL
from newsreader.news.core.models import Category, Post from newsreader.news.core.models import Category, Post
@ -19,7 +21,7 @@ class PostFactory(factory.django.DjangoModelFactory):
author = factory.Faker("name") author = factory.Faker("name")
publication_date = factory.Faker("date_time_this_year", tzinfo=pytz.utc) publication_date = factory.Faker("date_time_this_year", tzinfo=pytz.utc)
url = factory.Faker("url") url = factory.Faker("url")
remote_identifier = factory.Faker("url") remote_identifier = factory.Faker("uuid4")
rule = factory.SubFactory( rule = factory.SubFactory(
"newsreader.news.collection.tests.factories.CollectionRuleFactory" "newsreader.news.collection.tests.factories.CollectionRuleFactory"
@ -29,3 +31,11 @@ class PostFactory(factory.django.DjangoModelFactory):
class Meta: class Meta:
model = Post model = Post
class RedditPostFactory(PostFactory):
    """PostFactory variant with a REDDIT_URL-based url and a subreddit rule."""

    remote_identifier = factory.Faker("uuid4")
    # Ten fuzzy characters appended to "<REDDIT_URL>/".
    url = factory.fuzzy.FuzzyText(length=10, prefix=f"{REDDIT_URL}/")
    # The rule is built by SubredditFactory, referenced by dotted path.
    rule = factory.SubFactory(
        "newsreader.news.collection.tests.factories.SubredditFactory"
    )