diff --git a/src/newsreader/news/collection/tests/twitter/builder/mocks.py b/src/newsreader/news/collection/tests/twitter/builder/mocks.py
index b330f2f..2be360d 100644
--- a/src/newsreader/news/collection/tests/twitter/builder/mocks.py
+++ b/src/newsreader/news/collection/tests/twitter/builder/mocks.py
@@ -2185,3 +2185,202 @@ unsanitized_mock = [
},
}
]
+
+broken_mock = [  # two tweet payloads; the first one is deliberately malformed
+ {
+ "contributors": None,
+ "coordinates": None,
+ "created_at": "Fri Aug 07 00:17:05 +0000 2020",
+ "display_text_range": [11, 59],
+ "entities": {
+ "hashtags": [],
+ "symbols": [],
+ "urls": [
+ {
+ "display_url": "youtu.be/rDy7tPf6CT8",
+ "expanded_url": "https://youtu.be/rDy7tPf6CT8",
+ "indices": [36, 59],
+ "url": "https://t.co/trAcIxBMlX",
+ }
+ ],
+ "user_mentions": [
+ {
+ "id": 975844884606275587,
+ "id_str": "975844884606275587",
+ "indices": [0, 10],
+ "name": "ArieNeo",
+ "screen_name": "ArieNeoSC",
+ }
+ ],
+ },
+ "favorite_count": 19,
+ "favorited": False,
+ # NOTE: the "full_text" key is intentionally absent from this tweet
+ "geo": None,
+ "id": 1291528756373286914,
+ "id_str": "1291528756373286914",
+ "in_reply_to_screen_name": "ArieNeoSC",
+ "in_reply_to_status_id": 1291507356313038850,
+ "in_reply_to_status_id_str": "1291507356313038850",
+ "in_reply_to_user_id": 975844884606275587,
+ "in_reply_to_user_id_str": "975844884606275587",
+ "is_quote_status": False,
+ "lang": "en",
+ "place": None,
+ "possibly_sensitive": False,
+ "retweet_count": 5,
+ "retweeted": False,
+ "source": 'Twitter Web App',
+ "truncated": False,
+ "user": {
+ "contributors_enabled": False,
+ "created_at": "Wed Sep 05 00:58:11 +0000 2012",
+ "default_profile": False,
+ "default_profile_image": False,
+ "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
+ "entities": {
+ "description": {"urls": []},
+ "url": {
+ "urls": [
+ {
+ "display_url": "robertsspaceindustries.com",
+ "expanded_url": "http://www.robertsspaceindustries.com",
+ "indices": [0, 23],
+ "url": "https://t.co/iqO6apof3y",
+ }
+ ]
+ },
+ },
+ "favourites_count": 4588,
+ "follow_request_sent": None,
+ "followers_count": 106169,
+ "following": None,
+ "friends_count": 201,
+ "geo_enabled": False,
+ "has_extended_profile": False,
+ "id": 803542770,
+ "id_str": "803542770",
+ "is_translation_enabled": False,
+ "is_translator": False,
+ "lang": None,
+ "listed_count": 890,
+ "location": "Roberts Space Industries",
+ "name": "Star Citizen",
+ "notifications": None,
+ "profile_background_color": "131516",
+ "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
+ "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
+ "profile_background_tile": False,
+ "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
+ "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+ "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+ "profile_link_color": "0A5485",
+ "profile_sidebar_border_color": "FFFFFF",
+ "profile_sidebar_fill_color": "EFEFEF",
+ "profile_text_color": "333333",
+ "profile_use_background_image": True,
+ "protected": False,
+ "screen_name": "RobertsSpaceInd",
+ "statuses_count": 6210,
+ "time_zone": None,
+ "translator_type": "none",
+ "url": "https://t.co/iqO6apof3y",
+ "utc_offset": None,
+ "verified": True,
+ },
+ },
+ {  # second tweet: includes the "full_text" key
+ "contributors": None,
+ "coordinates": None,
+ "created_at": "Wed Jul 29 19:01:47 +0000 2020",
+ "display_text_range": [10, 98],
+ "entities": {
+ "hashtags": [],
+ "symbols": [],
+ "urls": [],
+ "user_mentions": [
+ {
+ "id": 435221600,
+ "id_str": "435221600",
+ "indices": [0, 9],
+ "name": "Christopher Blough",
+ "screen_name": "RelicCcb",
+ }
+ ],
+ },
+ "favorite_count": 1,
+ "favorited": False,
+ "full_text": "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing.",
+ "geo": None,
+ "id": 1288550304095416320,
+ "id_str": "1288550304095416320",
+ "in_reply_to_screen_name": "RelicCcb",
+ "in_reply_to_status_id": 1288475147951898625,
+ "in_reply_to_status_id_str": "1288475147951898625",
+ "in_reply_to_user_id": 435221600,
+ "in_reply_to_user_id_str": "435221600",
+ "is_quote_status": False,
+ "lang": "en",
+ "place": None,
+ "retweet_count": 0,
+ "retweeted": False,
+ "source": 'Twitter Web App',
+ "truncated": False,
+ "user": {
+ "contributors_enabled": False,
+ "created_at": "Wed Sep 05 00:58:11 +0000 2012",
+ "default_profile": False,
+ "default_profile_image": False,
+ "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
+ "entities": {
+ "description": {"urls": []},
+ "url": {
+ "urls": [
+ {
+ "display_url": "robertsspaceindustries.com",
+ "expanded_url": "http://www.robertsspaceindustries.com",
+ "indices": [0, 23],
+ "url": "https://t.co/iqO6apof3y",
+ }
+ ]
+ },
+ },
+ "favourites_count": 4588,
+ "follow_request_sent": None,
+ "followers_count": 106169,
+ "following": None,
+ "friends_count": 201,
+ "geo_enabled": False,
+ "has_extended_profile": False,
+ "id": 803542770,
+ "id_str": "803542770",
+ "is_translation_enabled": False,
+ "is_translator": False,
+ "lang": None,
+ "listed_count": 890,
+ "location": "Roberts Space Industries",
+ "name": "Star Citizen",
+ "notifications": None,
+ "profile_background_color": "131516",
+ "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
+ "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
+ "profile_background_tile": False,
+ "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
+ "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+ "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
+ "profile_link_color": "0A5485",
+ "profile_sidebar_border_color": "FFFFFF",
+ "profile_sidebar_fill_color": "EFEFEF",
+ "profile_text_color": "333333",
+ "profile_use_background_image": True,
+ "protected": False,
+ "screen_name": "RobertsSpaceInd",
+ "statuses_count": 6210,
+ "time_zone": None,
+ "translator_type": "none",
+ "url": "https://t.co/iqO6apof3y",
+ "utc_offset": None,
+ "verified": True,
+ },
+ },
+]
diff --git a/src/newsreader/news/collection/tests/twitter/builder/tests.py b/src/newsreader/news/collection/tests/twitter/builder/tests.py
index 37d7ad7..2e9ecc0 100644
--- a/src/newsreader/news/collection/tests/twitter/builder/tests.py
+++ b/src/newsreader/news/collection/tests/twitter/builder/tests.py
@@ -10,6 +10,7 @@ from ftfy import fix_text
from newsreader.news.collection.tests.factories import TwitterTimelineFactory
from newsreader.news.collection.tests.twitter.builder.mocks import (
+ broken_mock,
gif_mock,
image_mock,
quoted_mock,
@@ -410,3 +411,21 @@ class TwitterBuilderTestCase(TestCase):
builder.save()
self.assertEquals(Post.objects.count(), 2)
+
+ def test_bad_post(self):
+ """
+ Tests that the builder will ignore posts which miss data
+ """
+ builder = TwitterBuilder
+
+ profile = TwitterTimelineFactory(screen_name="RobertsSpaceInd")
+ mock_stream = Mock(rule=profile)
+
+ with builder(broken_mock, mock_stream) as builder:
+ builder.build()
+ builder.save()
+
+ self.assertCountEqual(
+ Post.objects.values_list("remote_identifier", flat=True),
+ ["1288550304095416320"],
+ )
diff --git a/src/newsreader/news/collection/twitter.py b/src/newsreader/news/collection/twitter.py
index dc32ecc..99d8c1c 100644
--- a/src/newsreader/news/collection/twitter.py
+++ b/src/newsreader/news/collection/twitter.py
@@ -48,58 +48,63 @@ class TwitterBuilder(PostBuilder):
def build(self):
results = {}
- rule = self.stream.rule
for post in self.payload:
- remote_identifier = post["id_str"]
+ remote_identifier = post.get("id_str")
- if remote_identifier in self.existing_posts:
+ if not remote_identifier or remote_identifier in self.existing_posts:
continue
- url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
- body = urlize(post["full_text"], nofollow=True)
- title = truncate_text(
- Post, "title", self.sanitize_fragment(post["full_text"])
- )
-
- publication_date = pytz.utc.localize(
- datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
- )
-
- if "extended_entities" in post:
- try:
- media_entities = self.get_media_entities(post)
- body += media_entities
- except KeyError:
- logger.exception(f"Failed parsing media_entities for {url}")
-
- if "retweeted_status" in post:
- original_post = post["retweeted_status"]
- original_tweet = urlize(original_post["full_text"], nofollow=True)
- body = f"{body}