diff --git a/src/newsreader/news/collection/tests/twitter/builder/mocks.py b/src/newsreader/news/collection/tests/twitter/builder/mocks.py index b330f2f..2be360d 100644 --- a/src/newsreader/news/collection/tests/twitter/builder/mocks.py +++ b/src/newsreader/news/collection/tests/twitter/builder/mocks.py @@ -2185,3 +2185,202 @@ unsanitized_mock = [ }, } ] + +broken_mock = [ + { + "contributors": None, + "coordinates": None, + "created_at": "Fri Aug 07 00:17:05 +0000 2020", + "display_text_range": [11, 59], + "entities": { + "hashtags": [], + "symbols": [], + "urls": [ + { + "display_url": "youtu.be/rDy7tPf6CT8", + "expanded_url": "https://youtu.be/rDy7tPf6CT8", + "indices": [36, 59], + "url": "https://t.co/trAcIxBMlX", + } + ], + "user_mentions": [ + { + "id": 975844884606275587, + "id_str": "975844884606275587", + "indices": [0, 10], + "name": "ArieNeo", + "screen_name": "ArieNeoSC", + } + ], + }, + "favorite_count": 19, + "favorited": False, + # Note the missing full_text key here + "geo": None, + "id": 1291528756373286914, + "id_str": "1291528756373286914", + "in_reply_to_screen_name": "ArieNeoSC", + "in_reply_to_status_id": 1291507356313038850, + "in_reply_to_status_id_str": "1291507356313038850", + "in_reply_to_user_id": 975844884606275587, + "in_reply_to_user_id_str": "975844884606275587", + "is_quote_status": False, + "lang": "en", + "place": None, + "possibly_sensitive": False, + "retweet_count": 5, + "retweeted": False, + "source": 'Twitter Web App', + "truncated": False, + "user": { + "contributors_enabled": False, + "created_at": "Wed Sep 05 00:58:11 +0000 2012", + "default_profile": False, + "default_profile_image": False, + "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.", + "entities": { + "description": {"urls": []}, + "url": { + "urls": [ + { + "display_url": "robertsspaceindustries.com", + "expanded_url": "http://www.robertsspaceindustries.com", + "indices": [0, 23], + "url": "https://t.co/iqO6apof3y", + } + ] + }, + }, + "favourites_count": 4588, + "follow_request_sent": None, + "followers_count": 106169, + "following": None, + "friends_count": 201, + "geo_enabled": False, + "has_extended_profile": False, + "id": 803542770, + "id_str": "803542770", + "is_translation_enabled": False, + "is_translator": False, + "lang": None, + "listed_count": 890, + "location": "Roberts Space Industries", + "name": "Star Citizen", + "notifications": None, + "profile_background_color": "131516", + "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif", + "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif", + "profile_background_tile": False, + "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186", + "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg", + "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg", + "profile_link_color": "0A5485", + "profile_sidebar_border_color": "FFFFFF", + "profile_sidebar_fill_color": "EFEFEF", + "profile_text_color": "333333", + "profile_use_background_image": True, + "protected": False, + "screen_name": "RobertsSpaceInd", + "statuses_count": 6210, + "time_zone": None, + "translator_type": "none", + "url": "https://t.co/iqO6apof3y", + "utc_offset": None, + "verified": True, + }, + }, + { + "contributors": None, + "coordinates": None, + "created_at": "Wed Jul 29 19:01:47 +0000 2020", + "display_text_range": [10, 98], + "entities": { + "hashtags": [], + "symbols": [], + "urls": [], + "user_mentions": [ + { + "id": 435221600, + "id_str": "435221600", + "indices": [0, 9], + "name": "Christopher Blough", + "screen_name": "RelicCcb", + } + ], + }, + "favorite_count": 1, + "favorited": False, + "full_text": "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing.", + "geo": None, + "id": 1288550304095416320, + "id_str": "1288550304095416320", + "in_reply_to_screen_name": "RelicCcb", + "in_reply_to_status_id": 1288475147951898625, + "in_reply_to_status_id_str": "1288475147951898625", + "in_reply_to_user_id": 435221600, + "in_reply_to_user_id_str": "435221600", + "is_quote_status": False, + "lang": "en", + "place": None, + "retweet_count": 0, + "retweeted": False, + "source": 'Twitter Web App', + "truncated": False, + "user": { + "contributors_enabled": False, + "created_at": "Wed Sep 05 00:58:11 +0000 2012", + "default_profile": False, + "default_profile_image": False, + "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.", + "entities": { + "description": {"urls": []}, + "url": { + "urls": [ + { + "display_url": "robertsspaceindustries.com", + "expanded_url": "http://www.robertsspaceindustries.com", + "indices": [0, 23], + "url": "https://t.co/iqO6apof3y", + } + ] + }, + }, + "favourites_count": 4588, + "follow_request_sent": None, + "followers_count": 106169, + "following": None, + "friends_count": 201, + "geo_enabled": False, + "has_extended_profile": False, + "id": 803542770, + "id_str": "803542770", + "is_translation_enabled": False, + "is_translator": False, + "lang": None, + "listed_count": 890, + "location": "Roberts Space Industries", + "name": "Star Citizen", + "notifications": None, + "profile_background_color": "131516", + "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif", + "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif", + "profile_background_tile": False, + "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186", + "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg", + "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg", + "profile_link_color": "0A5485", + "profile_sidebar_border_color": "FFFFFF", + "profile_sidebar_fill_color": "EFEFEF", + "profile_text_color": "333333", + "profile_use_background_image": True, + "protected": False, + "screen_name": "RobertsSpaceInd", + "statuses_count": 6210, + "time_zone": None, + "translator_type": "none", + "url": "https://t.co/iqO6apof3y", + "utc_offset": None, + "verified": True, + }, + }, +] diff --git a/src/newsreader/news/collection/tests/twitter/builder/tests.py b/src/newsreader/news/collection/tests/twitter/builder/tests.py index 37d7ad7..2e9ecc0 100644 --- a/src/newsreader/news/collection/tests/twitter/builder/tests.py +++ b/src/newsreader/news/collection/tests/twitter/builder/tests.py @@ -10,6 +10,7 @@ from ftfy import fix_text from newsreader.news.collection.tests.factories import TwitterTimelineFactory from newsreader.news.collection.tests.twitter.builder.mocks import ( + broken_mock, gif_mock, image_mock, quoted_mock, @@ -410,3 +411,21 @@ class TwitterBuilderTestCase(TestCase): builder.save() self.assertEquals(Post.objects.count(), 2) + + def test_bad_post(self): + """ + Tests that the builder will ignore posts which miss data + """ + builder = TwitterBuilder + + profile = TwitterTimelineFactory(screen_name="RobertsSpaceInd") + mock_stream = Mock(rule=profile) + + with builder(broken_mock, mock_stream) as builder: + builder.build() + builder.save() + + self.assertCountEqual( + Post.objects.values_list("remote_identifier", flat=True), + ["1288550304095416320"], + ) diff --git a/src/newsreader/news/collection/twitter.py b/src/newsreader/news/collection/twitter.py index dc32ecc..99d8c1c 100644 --- a/src/newsreader/news/collection/twitter.py +++ b/src/newsreader/news/collection/twitter.py @@ -48,58 +48,63 @@ class TwitterBuilder(PostBuilder): def build(self): results = {} - rule = self.stream.rule for post in self.payload: - remote_identifier = post["id_str"] + remote_identifier = post.get("id_str") - if remote_identifier in self.existing_posts: + if not remote_identifier or remote_identifier in self.existing_posts: continue - url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}" - body = urlize(post["full_text"], nofollow=True) - title = truncate_text( - Post, "title", self.sanitize_fragment(post["full_text"]) - ) - - publication_date = pytz.utc.localize( - datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y") - ) - - if "extended_entities" in post: - try: - media_entities = self.get_media_entities(post) - body += media_entities - except KeyError: - logger.exception(f"Failed parsing media_entities for {url}") - - if "retweeted_status" in post: - original_post = post["retweeted_status"] - original_tweet = urlize(original_post["full_text"], nofollow=True) - body = f"{body}
Original tweet: {original_tweet}
" - if "quoted_status" in post: - original_post = post["quoted_status"] - original_tweet = urlize(original_post["full_text"], nofollow=True) - body = f"{body}
Quoted tweet: {original_tweet}
" - - body = self.sanitize_fragment(body) - - data = { - "remote_identifier": remote_identifier, - "title": fix_text(title), - "body": fix_text(body), - "author": rule.screen_name, - "publication_date": publication_date, - "url": url, - "rule": rule, - } - - results[remote_identifier] = Post(**data) + try: + results[remote_identifier] = self.build_post(post) + except KeyError: + logger.exception(f"Failed building post {remote_identifier}") + continue self.instances = results.values() - def get_media_entities(self, post): - media_entities = post["extended_entities"]["media"] + def build_post(self, data): + remote_identifier = data["id_str"] + body = urlize(data["full_text"], nofollow=True) + title = truncate_text(Post, "title", self.sanitize_fragment(data["full_text"])) + url = f"{TWITTER_URL}/{self.stream.rule.screen_name}/status/{remote_identifier}" + + publication_date = pytz.utc.localize( + datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y") + ) + + if "extended_entities" in data: + try: + media_entities = self.get_media_entities(data) + body += media_entities + except KeyError: + logger.exception(f"Failed parsing media_entities for {url}") + + if "retweeted_status" in data: + original_post = data["retweeted_status"] + original_tweet = urlize(original_post["full_text"], nofollow=True) + body = f"{body}
Original tweet: {original_tweet}
" + if "quoted_status" in data: + original_post = data["quoted_status"] + original_tweet = urlize(original_post["full_text"], nofollow=True) + body = f"{body}
Quoted tweet: {original_tweet}
" + + body = self.sanitize_fragment(body) + + return Post( + **{ + "remote_identifier": data["id_str"], + "title": fix_text(title), + "body": fix_text(body), + "author": self.stream.rule.screen_name, + "publication_date": publication_date, + "url": url, + "rule": self.stream.rule, + } + ) + + def get_media_entities(self, data): + media_entities = data["extended_entities"]["media"] formatted_entities = "" for media_entity in media_entities: