Catch KeyErrors in TwitterBuilder
This commit is contained in:
parent
b0c6714002
commit
1a7279c533
3 changed files with 267 additions and 44 deletions
|
|
@ -2185,3 +2185,202 @@ unsanitized_mock = [
|
|||
},
|
||||
}
|
||||
]
|
||||
|
||||
broken_mock = [
|
||||
{
|
||||
"contributors": None,
|
||||
"coordinates": None,
|
||||
"created_at": "Fri Aug 07 00:17:05 +0000 2020",
|
||||
"display_text_range": [11, 59],
|
||||
"entities": {
|
||||
"hashtags": [],
|
||||
"symbols": [],
|
||||
"urls": [
|
||||
{
|
||||
"display_url": "youtu.be/rDy7tPf6CT8",
|
||||
"expanded_url": "https://youtu.be/rDy7tPf6CT8",
|
||||
"indices": [36, 59],
|
||||
"url": "https://t.co/trAcIxBMlX",
|
||||
}
|
||||
],
|
||||
"user_mentions": [
|
||||
{
|
||||
"id": 975844884606275587,
|
||||
"id_str": "975844884606275587",
|
||||
"indices": [0, 10],
|
||||
"name": "ArieNeo",
|
||||
"screen_name": "ArieNeoSC",
|
||||
}
|
||||
],
|
||||
},
|
||||
"favorite_count": 19,
|
||||
"favorited": False,
|
||||
# Note the missing full_text key here
|
||||
"geo": None,
|
||||
"id": 1291528756373286914,
|
||||
"id_str": "1291528756373286914",
|
||||
"in_reply_to_screen_name": "ArieNeoSC",
|
||||
"in_reply_to_status_id": 1291507356313038850,
|
||||
"in_reply_to_status_id_str": "1291507356313038850",
|
||||
"in_reply_to_user_id": 975844884606275587,
|
||||
"in_reply_to_user_id_str": "975844884606275587",
|
||||
"is_quote_status": False,
|
||||
"lang": "en",
|
||||
"place": None,
|
||||
"possibly_sensitive": False,
|
||||
"retweet_count": 5,
|
||||
"retweeted": False,
|
||||
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
|
||||
"truncated": False,
|
||||
"user": {
|
||||
"contributors_enabled": False,
|
||||
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
|
||||
"default_profile": False,
|
||||
"default_profile_image": False,
|
||||
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
|
||||
"entities": {
|
||||
"description": {"urls": []},
|
||||
"url": {
|
||||
"urls": [
|
||||
{
|
||||
"display_url": "robertsspaceindustries.com",
|
||||
"expanded_url": "http://www.robertsspaceindustries.com",
|
||||
"indices": [0, 23],
|
||||
"url": "https://t.co/iqO6apof3y",
|
||||
}
|
||||
]
|
||||
},
|
||||
},
|
||||
"favourites_count": 4588,
|
||||
"follow_request_sent": None,
|
||||
"followers_count": 106169,
|
||||
"following": None,
|
||||
"friends_count": 201,
|
||||
"geo_enabled": False,
|
||||
"has_extended_profile": False,
|
||||
"id": 803542770,
|
||||
"id_str": "803542770",
|
||||
"is_translation_enabled": False,
|
||||
"is_translator": False,
|
||||
"lang": None,
|
||||
"listed_count": 890,
|
||||
"location": "Roberts Space Industries",
|
||||
"name": "Star Citizen",
|
||||
"notifications": None,
|
||||
"profile_background_color": "131516",
|
||||
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||
"profile_background_tile": False,
|
||||
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
|
||||
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||
"profile_link_color": "0A5485",
|
||||
"profile_sidebar_border_color": "FFFFFF",
|
||||
"profile_sidebar_fill_color": "EFEFEF",
|
||||
"profile_text_color": "333333",
|
||||
"profile_use_background_image": True,
|
||||
"protected": False,
|
||||
"screen_name": "RobertsSpaceInd",
|
||||
"statuses_count": 6210,
|
||||
"time_zone": None,
|
||||
"translator_type": "none",
|
||||
"url": "https://t.co/iqO6apof3y",
|
||||
"utc_offset": None,
|
||||
"verified": True,
|
||||
},
|
||||
},
|
||||
{
|
||||
"contributors": None,
|
||||
"coordinates": None,
|
||||
"created_at": "Wed Jul 29 19:01:47 +0000 2020",
|
||||
"display_text_range": [10, 98],
|
||||
"entities": {
|
||||
"hashtags": [],
|
||||
"symbols": [],
|
||||
"urls": [],
|
||||
"user_mentions": [
|
||||
{
|
||||
"id": 435221600,
|
||||
"id_str": "435221600",
|
||||
"indices": [0, 9],
|
||||
"name": "Christopher Blough",
|
||||
"screen_name": "RelicCcb",
|
||||
}
|
||||
],
|
||||
},
|
||||
"favorite_count": 1,
|
||||
"favorited": False,
|
||||
"full_text": "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing.",
|
||||
"geo": None,
|
||||
"id": 1288550304095416320,
|
||||
"id_str": "1288550304095416320",
|
||||
"in_reply_to_screen_name": "RelicCcb",
|
||||
"in_reply_to_status_id": 1288475147951898625,
|
||||
"in_reply_to_status_id_str": "1288475147951898625",
|
||||
"in_reply_to_user_id": 435221600,
|
||||
"in_reply_to_user_id_str": "435221600",
|
||||
"is_quote_status": False,
|
||||
"lang": "en",
|
||||
"place": None,
|
||||
"retweet_count": 0,
|
||||
"retweeted": False,
|
||||
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
|
||||
"truncated": False,
|
||||
"user": {
|
||||
"contributors_enabled": False,
|
||||
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
|
||||
"default_profile": False,
|
||||
"default_profile_image": False,
|
||||
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
|
||||
"entities": {
|
||||
"description": {"urls": []},
|
||||
"url": {
|
||||
"urls": [
|
||||
{
|
||||
"display_url": "robertsspaceindustries.com",
|
||||
"expanded_url": "http://www.robertsspaceindustries.com",
|
||||
"indices": [0, 23],
|
||||
"url": "https://t.co/iqO6apof3y",
|
||||
}
|
||||
]
|
||||
},
|
||||
},
|
||||
"favourites_count": 4588,
|
||||
"follow_request_sent": None,
|
||||
"followers_count": 106169,
|
||||
"following": None,
|
||||
"friends_count": 201,
|
||||
"geo_enabled": False,
|
||||
"has_extended_profile": False,
|
||||
"id": 803542770,
|
||||
"id_str": "803542770",
|
||||
"is_translation_enabled": False,
|
||||
"is_translator": False,
|
||||
"lang": None,
|
||||
"listed_count": 890,
|
||||
"location": "Roberts Space Industries",
|
||||
"name": "Star Citizen",
|
||||
"notifications": None,
|
||||
"profile_background_color": "131516",
|
||||
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||
"profile_background_tile": False,
|
||||
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
|
||||
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||
"profile_link_color": "0A5485",
|
||||
"profile_sidebar_border_color": "FFFFFF",
|
||||
"profile_sidebar_fill_color": "EFEFEF",
|
||||
"profile_text_color": "333333",
|
||||
"profile_use_background_image": True,
|
||||
"protected": False,
|
||||
"screen_name": "RobertsSpaceInd",
|
||||
"statuses_count": 6210,
|
||||
"time_zone": None,
|
||||
"translator_type": "none",
|
||||
"url": "https://t.co/iqO6apof3y",
|
||||
"utc_offset": None,
|
||||
"verified": True,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from ftfy import fix_text
|
|||
|
||||
from newsreader.news.collection.tests.factories import TwitterTimelineFactory
|
||||
from newsreader.news.collection.tests.twitter.builder.mocks import (
|
||||
broken_mock,
|
||||
gif_mock,
|
||||
image_mock,
|
||||
quoted_mock,
|
||||
|
|
@ -410,3 +411,21 @@ class TwitterBuilderTestCase(TestCase):
|
|||
builder.save()
|
||||
|
||||
self.assertEquals(Post.objects.count(), 2)
|
||||
|
||||
def test_bad_post(self):
    """
    Posts lacking required keys are skipped, while complete posts from the
    same payload are still saved
    """
    timeline = TwitterTimelineFactory(screen_name="RobertsSpaceInd")
    stream = Mock(rule=timeline)

    with TwitterBuilder(broken_mock, stream) as builder:
        builder.build()
        builder.save()

    # Only the second mocked tweet carries a full_text key
    saved_identifiers = Post.objects.values_list("remote_identifier", flat=True)

    self.assertCountEqual(saved_identifiers, ["1288550304095416320"])
|
||||
|
|
|
|||
|
|
@ -48,58 +48,63 @@ class TwitterBuilder(PostBuilder):
|
|||
|
||||
def build(self):
|
||||
results = {}
|
||||
rule = self.stream.rule
|
||||
|
||||
for post in self.payload:
|
||||
remote_identifier = post["id_str"]
|
||||
remote_identifier = post.get("id_str")
|
||||
|
||||
if remote_identifier in self.existing_posts:
|
||||
if not remote_identifier or remote_identifier in self.existing_posts:
|
||||
continue
|
||||
|
||||
url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
|
||||
body = urlize(post["full_text"], nofollow=True)
|
||||
title = truncate_text(
|
||||
Post, "title", self.sanitize_fragment(post["full_text"])
|
||||
)
|
||||
|
||||
publication_date = pytz.utc.localize(
|
||||
datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
|
||||
)
|
||||
|
||||
if "extended_entities" in post:
|
||||
try:
|
||||
media_entities = self.get_media_entities(post)
|
||||
body += media_entities
|
||||
except KeyError:
|
||||
logger.exception(f"Failed parsing media_entities for {url}")
|
||||
|
||||
if "retweeted_status" in post:
|
||||
original_post = post["retweeted_status"]
|
||||
original_tweet = urlize(original_post["full_text"], nofollow=True)
|
||||
body = f"{body} <br><div>Original tweet: {original_tweet}</div>"
|
||||
if "quoted_status" in post:
|
||||
original_post = post["quoted_status"]
|
||||
original_tweet = urlize(original_post["full_text"], nofollow=True)
|
||||
body = f"{body} <br><div>Quoted tweet: {original_tweet}</div>"
|
||||
|
||||
body = self.sanitize_fragment(body)
|
||||
|
||||
data = {
|
||||
"remote_identifier": remote_identifier,
|
||||
"title": fix_text(title),
|
||||
"body": fix_text(body),
|
||||
"author": rule.screen_name,
|
||||
"publication_date": publication_date,
|
||||
"url": url,
|
||||
"rule": rule,
|
||||
}
|
||||
|
||||
results[remote_identifier] = Post(**data)
|
||||
try:
|
||||
results[remote_identifier] = self.build_post(post)
|
||||
except KeyError:
|
||||
logger.exception(f"Failed building post {remote_identifier}")
|
||||
continue
|
||||
|
||||
self.instances = results.values()
|
||||
|
||||
def get_media_entities(self, post):
|
||||
media_entities = post["extended_entities"]["media"]
|
||||
def build_post(self, data):
    """
    Build an unsaved Post instance from a single tweet dictionary

    Reads the "id_str", "full_text" and "created_at" keys of `data`; a
    missing key raises a KeyError which is not handled here. When present,
    the "extended_entities", "retweeted_status" and "quoted_status" keys
    extend the body of the post.
    """
    remote_identifier = data["id_str"]
    full_text = data["full_text"]
    rule = self.stream.rule

    body = urlize(full_text, nofollow=True)
    title = truncate_text(Post, "title", self.sanitize_fragment(full_text))
    url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"

    # The format string expects a literal "+0000" offset, so the parsed
    # (naive) datetime is marked as UTC afterwards
    publication_date = pytz.utc.localize(
        datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
    )

    if "extended_entities" in data:
        try:
            body += self.get_media_entities(data)
        except KeyError:
            # Unparsable media is logged; the post is still built from its text
            logger.exception(f"Failed parsing media_entities for {url}")

    if "retweeted_status" in data:
        original_tweet = urlize(
            data["retweeted_status"]["full_text"], nofollow=True
        )
        body = f"{body} <br><div>Original tweet: {original_tweet}</div>"

    if "quoted_status" in data:
        quoted_tweet = urlize(data["quoted_status"]["full_text"], nofollow=True)
        body = f"{body} <br><div>Quoted tweet: {quoted_tweet}</div>"

    body = self.sanitize_fragment(body)

    return Post(
        remote_identifier=remote_identifier,
        title=fix_text(title),
        body=fix_text(body),
        author=rule.screen_name,
        publication_date=publication_date,
        url=url,
        rule=rule,
    )
|
||||
|
||||
def get_media_entities(self, data):
|
||||
media_entities = data["extended_entities"]["media"]
|
||||
formatted_entities = ""
|
||||
|
||||
for media_entity in media_entities:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue