Catch KeyErrors in TwitterBuilder

This commit is contained in:
Sonny Bakker 2020-09-28 22:38:47 +02:00
parent b0c6714002
commit 1a7279c533
3 changed files with 267 additions and 44 deletions

View file

@ -2185,3 +2185,202 @@ unsanitized_mock = [
},
}
]
# Timeline payload with one malformed status followed by one complete status.
# The first entry has no "full_text" key; the second entry has every key the
# first one has except "possibly_sensitive", plus "full_text".
broken_mock = [
    {
        "contributors": None,
        "coordinates": None,
        "created_at": "Fri Aug 07 00:17:05 +0000 2020",
        "display_text_range": [11, 59],
        "entities": {
            "hashtags": [],
            "symbols": [],
            "urls": [
                {
                    "display_url": "youtu.be/rDy7tPf6CT8",
                    "expanded_url": "https://youtu.be/rDy7tPf6CT8",
                    "indices": [36, 59],
                    "url": "https://t.co/trAcIxBMlX",
                }
            ],
            "user_mentions": [
                {
                    "id": 975844884606275587,
                    "id_str": "975844884606275587",
                    "indices": [0, 10],
                    "name": "ArieNeo",
                    "screen_name": "ArieNeoSC",
                }
            ],
        },
        "favorite_count": 19,
        "favorited": False,
        # "full_text" is deliberately left out of this status.
        "geo": None,
        "id": 1291528756373286914,
        "id_str": "1291528756373286914",
        "in_reply_to_screen_name": "ArieNeoSC",
        "in_reply_to_status_id": 1291507356313038850,
        "in_reply_to_status_id_str": "1291507356313038850",
        "in_reply_to_user_id": 975844884606275587,
        "in_reply_to_user_id_str": "975844884606275587",
        "is_quote_status": False,
        "lang": "en",
        "place": None,
        "possibly_sensitive": False,
        "retweet_count": 5,
        "retweeted": False,
        "source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
        "truncated": False,
        "user": {
            "contributors_enabled": False,
            "created_at": "Wed Sep 05 00:58:11 +0000 2012",
            "default_profile": False,
            "default_profile_image": False,
            "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
            "entities": {
                "description": {"urls": []},
                "url": {
                    "urls": [
                        {
                            "display_url": "robertsspaceindustries.com",
                            "expanded_url": "http://www.robertsspaceindustries.com",
                            "indices": [0, 23],
                            "url": "https://t.co/iqO6apof3y",
                        }
                    ]
                },
            },
            "favourites_count": 4588,
            "follow_request_sent": None,
            "followers_count": 106169,
            "following": None,
            "friends_count": 201,
            "geo_enabled": False,
            "has_extended_profile": False,
            "id": 803542770,
            "id_str": "803542770",
            "is_translation_enabled": False,
            "is_translator": False,
            "lang": None,
            "listed_count": 890,
            "location": "Roberts Space Industries",
            "name": "Star Citizen",
            "notifications": None,
            "profile_background_color": "131516",
            "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
            "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
            "profile_background_tile": False,
            "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
            "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
            "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
            "profile_link_color": "0A5485",
            "profile_sidebar_border_color": "FFFFFF",
            "profile_sidebar_fill_color": "EFEFEF",
            "profile_text_color": "333333",
            "profile_use_background_image": True,
            "protected": False,
            "screen_name": "RobertsSpaceInd",
            "statuses_count": 6210,
            "time_zone": None,
            "translator_type": "none",
            "url": "https://t.co/iqO6apof3y",
            "utc_offset": None,
            "verified": True,
        },
    },
    {
        "contributors": None,
        "coordinates": None,
        "created_at": "Wed Jul 29 19:01:47 +0000 2020",
        "display_text_range": [10, 98],
        "entities": {
            "hashtags": [],
            "symbols": [],
            "urls": [],
            "user_mentions": [
                {
                    "id": 435221600,
                    "id_str": "435221600",
                    "indices": [0, 9],
                    "name": "Christopher Blough",
                    "screen_name": "RelicCcb",
                }
            ],
        },
        "favorite_count": 1,
        "favorited": False,
        "full_text": "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing.",
        "geo": None,
        "id": 1288550304095416320,
        "id_str": "1288550304095416320",
        "in_reply_to_screen_name": "RelicCcb",
        "in_reply_to_status_id": 1288475147951898625,
        "in_reply_to_status_id_str": "1288475147951898625",
        "in_reply_to_user_id": 435221600,
        "in_reply_to_user_id_str": "435221600",
        "is_quote_status": False,
        "lang": "en",
        "place": None,
        "retweet_count": 0,
        "retweeted": False,
        "source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
        "truncated": False,
        "user": {
            "contributors_enabled": False,
            "created_at": "Wed Sep 05 00:58:11 +0000 2012",
            "default_profile": False,
            "default_profile_image": False,
            "description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
            "entities": {
                "description": {"urls": []},
                "url": {
                    "urls": [
                        {
                            "display_url": "robertsspaceindustries.com",
                            "expanded_url": "http://www.robertsspaceindustries.com",
                            "indices": [0, 23],
                            "url": "https://t.co/iqO6apof3y",
                        }
                    ]
                },
            },
            "favourites_count": 4588,
            "follow_request_sent": None,
            "followers_count": 106169,
            "following": None,
            "friends_count": 201,
            "geo_enabled": False,
            "has_extended_profile": False,
            "id": 803542770,
            "id_str": "803542770",
            "is_translation_enabled": False,
            "is_translator": False,
            "lang": None,
            "listed_count": 890,
            "location": "Roberts Space Industries",
            "name": "Star Citizen",
            "notifications": None,
            "profile_background_color": "131516",
            "profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
            "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
            "profile_background_tile": False,
            "profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
            "profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
            "profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
            "profile_link_color": "0A5485",
            "profile_sidebar_border_color": "FFFFFF",
            "profile_sidebar_fill_color": "EFEFEF",
            "profile_text_color": "333333",
            "profile_use_background_image": True,
            "protected": False,
            "screen_name": "RobertsSpaceInd",
            "statuses_count": 6210,
            "time_zone": None,
            "translator_type": "none",
            "url": "https://t.co/iqO6apof3y",
            "utc_offset": None,
            "verified": True,
        },
    },
]

View file

@ -10,6 +10,7 @@ from ftfy import fix_text
from newsreader.news.collection.tests.factories import TwitterTimelineFactory
from newsreader.news.collection.tests.twitter.builder.mocks import (
broken_mock,
gif_mock,
image_mock,
quoted_mock,
@ -410,3 +411,21 @@ class TwitterBuilderTestCase(TestCase):
builder.save()
self.assertEquals(Post.objects.count(), 2)
def test_bad_post(self):
    """
    A status that lacks required data is skipped, complete ones are stored
    """
    timeline = TwitterTimelineFactory(screen_name="RobertsSpaceInd")
    stream = Mock(rule=timeline)

    with TwitterBuilder(broken_mock, stream) as builder:
        builder.build()
        builder.save()

    stored_identifiers = Post.objects.values_list("remote_identifier", flat=True)

    # broken_mock holds two statuses; only the second one has "full_text".
    self.assertCountEqual(stored_identifiers, ["1288550304095416320"])

View file

@ -48,58 +48,63 @@ class TwitterBuilder(PostBuilder):
def build(self):
    """
    Build a post for every status in ``self.payload`` that is not stored yet.

    A status is skipped when it has no ``id_str`` or when its ``id_str`` is
    already in ``self.existing_posts``. When ``build_post`` raises
    ``KeyError`` for a status, the failure is logged and the loop moves on,
    so one malformed status does not prevent the others from being built.

    The built posts are exposed through ``self.instances``; nothing is
    returned.
    """
    results = {}

    for post in self.payload:
        # .get() so that a status without an identifier is skipped instead
        # of raising before the try block below is reached.
        remote_identifier = post.get("id_str")

        if not remote_identifier or remote_identifier in self.existing_posts:
            continue

        try:
            results[remote_identifier] = self.build_post(post)
        except KeyError:
            logger.exception(f"Failed building post {remote_identifier}")

    self.instances = results.values()
def build_post(self, data):
    """
    Turn a single status dict into a ``Post`` instance (``save`` is not called).

    Reads ``id_str``, ``full_text`` and ``created_at`` from ``data``. When
    present, ``extended_entities``, ``retweeted_status`` and
    ``quoted_status`` contribute extra markup to the body.

    Raises ``KeyError`` when ``data`` (or a retweeted/quoted status inside
    it) lacks one of the keys read here. A ``KeyError`` raised while
    formatting the media entities is logged instead and the post is built
    without them.
    """
    rule = self.stream.rule
    remote_identifier = data["id_str"]

    body = urlize(data["full_text"], nofollow=True)
    title = truncate_text(Post, "title", self.sanitize_fragment(data["full_text"]))
    url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"

    # The format string only matches timestamps with a literal "+0000"
    # offset; the parsed naive datetime is then marked as UTC.
    publication_date = pytz.utc.localize(
        datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
    )

    if "extended_entities" in data:
        try:
            body += self.get_media_entities(data)
        except KeyError:
            logger.exception(f"Failed parsing media_entities for {url}")

    # Retweets and quotes get the referenced tweet's text appended, in this
    # order, each under its own label.
    referenced_statuses = (
        ("retweeted_status", "Original tweet"),
        ("quoted_status", "Quoted tweet"),
    )

    for key, label in referenced_statuses:
        if key in data:
            original_tweet = urlize(data[key]["full_text"], nofollow=True)
            body = f"{body} <br><div>{label}: {original_tweet}</div>"

    body = self.sanitize_fragment(body)

    return Post(
        remote_identifier=remote_identifier,
        title=fix_text(title),
        body=fix_text(body),
        author=rule.screen_name,
        publication_date=publication_date,
        url=url,
        rule=rule,
    )
def get_media_entities(self, post):
media_entities = post["extended_entities"]["media"]
def get_media_entities(self, data):
media_entities = data["extended_entities"]["media"]
formatted_entities = ""
for media_entity in media_entities: