Catch KeyErrors in TwitterBuilder
This commit is contained in:
parent
b0c6714002
commit
1a7279c533
3 changed files with 267 additions and 44 deletions
|
|
@ -2185,3 +2185,202 @@ unsanitized_mock = [
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
broken_mock = [
|
||||||
|
{
|
||||||
|
"contributors": None,
|
||||||
|
"coordinates": None,
|
||||||
|
"created_at": "Fri Aug 07 00:17:05 +0000 2020",
|
||||||
|
"display_text_range": [11, 59],
|
||||||
|
"entities": {
|
||||||
|
"hashtags": [],
|
||||||
|
"symbols": [],
|
||||||
|
"urls": [
|
||||||
|
{
|
||||||
|
"display_url": "youtu.be/rDy7tPf6CT8",
|
||||||
|
"expanded_url": "https://youtu.be/rDy7tPf6CT8",
|
||||||
|
"indices": [36, 59],
|
||||||
|
"url": "https://t.co/trAcIxBMlX",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"user_mentions": [
|
||||||
|
{
|
||||||
|
"id": 975844884606275587,
|
||||||
|
"id_str": "975844884606275587",
|
||||||
|
"indices": [0, 10],
|
||||||
|
"name": "ArieNeo",
|
||||||
|
"screen_name": "ArieNeoSC",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"favorite_count": 19,
|
||||||
|
"favorited": False,
|
||||||
|
# Note the missing full_text key here
|
||||||
|
"geo": None,
|
||||||
|
"id": 1291528756373286914,
|
||||||
|
"id_str": "1291528756373286914",
|
||||||
|
"in_reply_to_screen_name": "ArieNeoSC",
|
||||||
|
"in_reply_to_status_id": 1291507356313038850,
|
||||||
|
"in_reply_to_status_id_str": "1291507356313038850",
|
||||||
|
"in_reply_to_user_id": 975844884606275587,
|
||||||
|
"in_reply_to_user_id_str": "975844884606275587",
|
||||||
|
"is_quote_status": False,
|
||||||
|
"lang": "en",
|
||||||
|
"place": None,
|
||||||
|
"possibly_sensitive": False,
|
||||||
|
"retweet_count": 5,
|
||||||
|
"retweeted": False,
|
||||||
|
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
|
||||||
|
"truncated": False,
|
||||||
|
"user": {
|
||||||
|
"contributors_enabled": False,
|
||||||
|
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
|
||||||
|
"default_profile": False,
|
||||||
|
"default_profile_image": False,
|
||||||
|
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
|
||||||
|
"entities": {
|
||||||
|
"description": {"urls": []},
|
||||||
|
"url": {
|
||||||
|
"urls": [
|
||||||
|
{
|
||||||
|
"display_url": "robertsspaceindustries.com",
|
||||||
|
"expanded_url": "http://www.robertsspaceindustries.com",
|
||||||
|
"indices": [0, 23],
|
||||||
|
"url": "https://t.co/iqO6apof3y",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"favourites_count": 4588,
|
||||||
|
"follow_request_sent": None,
|
||||||
|
"followers_count": 106169,
|
||||||
|
"following": None,
|
||||||
|
"friends_count": 201,
|
||||||
|
"geo_enabled": False,
|
||||||
|
"has_extended_profile": False,
|
||||||
|
"id": 803542770,
|
||||||
|
"id_str": "803542770",
|
||||||
|
"is_translation_enabled": False,
|
||||||
|
"is_translator": False,
|
||||||
|
"lang": None,
|
||||||
|
"listed_count": 890,
|
||||||
|
"location": "Roberts Space Industries",
|
||||||
|
"name": "Star Citizen",
|
||||||
|
"notifications": None,
|
||||||
|
"profile_background_color": "131516",
|
||||||
|
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||||
|
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||||
|
"profile_background_tile": False,
|
||||||
|
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
|
||||||
|
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||||
|
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||||
|
"profile_link_color": "0A5485",
|
||||||
|
"profile_sidebar_border_color": "FFFFFF",
|
||||||
|
"profile_sidebar_fill_color": "EFEFEF",
|
||||||
|
"profile_text_color": "333333",
|
||||||
|
"profile_use_background_image": True,
|
||||||
|
"protected": False,
|
||||||
|
"screen_name": "RobertsSpaceInd",
|
||||||
|
"statuses_count": 6210,
|
||||||
|
"time_zone": None,
|
||||||
|
"translator_type": "none",
|
||||||
|
"url": "https://t.co/iqO6apof3y",
|
||||||
|
"utc_offset": None,
|
||||||
|
"verified": True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"contributors": None,
|
||||||
|
"coordinates": None,
|
||||||
|
"created_at": "Wed Jul 29 19:01:47 +0000 2020",
|
||||||
|
"display_text_range": [10, 98],
|
||||||
|
"entities": {
|
||||||
|
"hashtags": [],
|
||||||
|
"symbols": [],
|
||||||
|
"urls": [],
|
||||||
|
"user_mentions": [
|
||||||
|
{
|
||||||
|
"id": 435221600,
|
||||||
|
"id_str": "435221600",
|
||||||
|
"indices": [0, 9],
|
||||||
|
"name": "Christopher Blough",
|
||||||
|
"screen_name": "RelicCcb",
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"favorite_count": 1,
|
||||||
|
"favorited": False,
|
||||||
|
"full_text": "@RelicCcb Hi Christoper, we have checked the status of your investigation and it is still ongoing.",
|
||||||
|
"geo": None,
|
||||||
|
"id": 1288550304095416320,
|
||||||
|
"id_str": "1288550304095416320",
|
||||||
|
"in_reply_to_screen_name": "RelicCcb",
|
||||||
|
"in_reply_to_status_id": 1288475147951898625,
|
||||||
|
"in_reply_to_status_id_str": "1288475147951898625",
|
||||||
|
"in_reply_to_user_id": 435221600,
|
||||||
|
"in_reply_to_user_id_str": "435221600",
|
||||||
|
"is_quote_status": False,
|
||||||
|
"lang": "en",
|
||||||
|
"place": None,
|
||||||
|
"retweet_count": 0,
|
||||||
|
"retweeted": False,
|
||||||
|
"source": '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
|
||||||
|
"truncated": False,
|
||||||
|
"user": {
|
||||||
|
"contributors_enabled": False,
|
||||||
|
"created_at": "Wed Sep 05 00:58:11 +0000 2012",
|
||||||
|
"default_profile": False,
|
||||||
|
"default_profile_image": False,
|
||||||
|
"description": "The official Twitter profile for #StarCitizen and Roberts Space Industries.",
|
||||||
|
"entities": {
|
||||||
|
"description": {"urls": []},
|
||||||
|
"url": {
|
||||||
|
"urls": [
|
||||||
|
{
|
||||||
|
"display_url": "robertsspaceindustries.com",
|
||||||
|
"expanded_url": "http://www.robertsspaceindustries.com",
|
||||||
|
"indices": [0, 23],
|
||||||
|
"url": "https://t.co/iqO6apof3y",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"favourites_count": 4588,
|
||||||
|
"follow_request_sent": None,
|
||||||
|
"followers_count": 106169,
|
||||||
|
"following": None,
|
||||||
|
"friends_count": 201,
|
||||||
|
"geo_enabled": False,
|
||||||
|
"has_extended_profile": False,
|
||||||
|
"id": 803542770,
|
||||||
|
"id_str": "803542770",
|
||||||
|
"is_translation_enabled": False,
|
||||||
|
"is_translator": False,
|
||||||
|
"lang": None,
|
||||||
|
"listed_count": 890,
|
||||||
|
"location": "Roberts Space Industries",
|
||||||
|
"name": "Star Citizen",
|
||||||
|
"notifications": None,
|
||||||
|
"profile_background_color": "131516",
|
||||||
|
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||||
|
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme14/bg.gif",
|
||||||
|
"profile_background_tile": False,
|
||||||
|
"profile_banner_url": "https://pbs.twimg.com/profile_banners/803542770/1596651186",
|
||||||
|
"profile_image_url": "http://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||||
|
"profile_image_url_https": "https://pbs.twimg.com/profile_images/963109950103814144/ysnj_Asy_normal.jpg",
|
||||||
|
"profile_link_color": "0A5485",
|
||||||
|
"profile_sidebar_border_color": "FFFFFF",
|
||||||
|
"profile_sidebar_fill_color": "EFEFEF",
|
||||||
|
"profile_text_color": "333333",
|
||||||
|
"profile_use_background_image": True,
|
||||||
|
"protected": False,
|
||||||
|
"screen_name": "RobertsSpaceInd",
|
||||||
|
"statuses_count": 6210,
|
||||||
|
"time_zone": None,
|
||||||
|
"translator_type": "none",
|
||||||
|
"url": "https://t.co/iqO6apof3y",
|
||||||
|
"utc_offset": None,
|
||||||
|
"verified": True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ from ftfy import fix_text
|
||||||
|
|
||||||
from newsreader.news.collection.tests.factories import TwitterTimelineFactory
|
from newsreader.news.collection.tests.factories import TwitterTimelineFactory
|
||||||
from newsreader.news.collection.tests.twitter.builder.mocks import (
|
from newsreader.news.collection.tests.twitter.builder.mocks import (
|
||||||
|
broken_mock,
|
||||||
gif_mock,
|
gif_mock,
|
||||||
image_mock,
|
image_mock,
|
||||||
quoted_mock,
|
quoted_mock,
|
||||||
|
|
@ -410,3 +411,21 @@ class TwitterBuilderTestCase(TestCase):
|
||||||
builder.save()
|
builder.save()
|
||||||
|
|
||||||
self.assertEquals(Post.objects.count(), 2)
|
self.assertEquals(Post.objects.count(), 2)
|
||||||
|
|
||||||
|
def test_bad_post(self):
|
||||||
|
"""
|
||||||
|
Tests that the builder ignores posts that are missing data
|
||||||
|
"""
|
||||||
|
builder = TwitterBuilder
|
||||||
|
|
||||||
|
profile = TwitterTimelineFactory(screen_name="RobertsSpaceInd")
|
||||||
|
mock_stream = Mock(rule=profile)
|
||||||
|
|
||||||
|
with builder(broken_mock, mock_stream) as builder:
|
||||||
|
builder.build()
|
||||||
|
builder.save()
|
||||||
|
|
||||||
|
self.assertCountEqual(
|
||||||
|
Post.objects.values_list("remote_identifier", flat=True),
|
||||||
|
["1288550304095416320"],
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -48,58 +48,63 @@ class TwitterBuilder(PostBuilder):
|
||||||
|
|
||||||
def build(self):
|
def build(self):
|
||||||
results = {}
|
results = {}
|
||||||
rule = self.stream.rule
|
|
||||||
|
|
||||||
for post in self.payload:
|
for post in self.payload:
|
||||||
remote_identifier = post["id_str"]
|
remote_identifier = post.get("id_str")
|
||||||
|
|
||||||
if remote_identifier in self.existing_posts:
|
if not remote_identifier or remote_identifier in self.existing_posts:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
url = f"{TWITTER_URL}/{rule.screen_name}/status/{remote_identifier}"
|
try:
|
||||||
body = urlize(post["full_text"], nofollow=True)
|
results[remote_identifier] = self.build_post(post)
|
||||||
title = truncate_text(
|
except KeyError:
|
||||||
Post, "title", self.sanitize_fragment(post["full_text"])
|
logger.exception(f"Failed building post {remote_identifier}")
|
||||||
)
|
continue
|
||||||
|
|
||||||
|
self.instances = results.values()
|
||||||
|
|
||||||
|
def build_post(self, data):
|
||||||
|
remote_identifier = data["id_str"]
|
||||||
|
body = urlize(data["full_text"], nofollow=True)
|
||||||
|
title = truncate_text(Post, "title", self.sanitize_fragment(data["full_text"]))
|
||||||
|
url = f"{TWITTER_URL}/{self.stream.rule.screen_name}/status/{remote_identifier}"
|
||||||
|
|
||||||
publication_date = pytz.utc.localize(
|
publication_date = pytz.utc.localize(
|
||||||
datetime.strptime(post["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
|
datetime.strptime(data["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
|
||||||
)
|
)
|
||||||
|
|
||||||
if "extended_entities" in post:
|
if "extended_entities" in data:
|
||||||
try:
|
try:
|
||||||
media_entities = self.get_media_entities(post)
|
media_entities = self.get_media_entities(data)
|
||||||
body += media_entities
|
body += media_entities
|
||||||
except KeyError:
|
except KeyError:
|
||||||
logger.exception(f"Failed parsing media_entities for {url}")
|
logger.exception(f"Failed parsing media_entities for {url}")
|
||||||
|
|
||||||
if "retweeted_status" in post:
|
if "retweeted_status" in data:
|
||||||
original_post = post["retweeted_status"]
|
original_post = data["retweeted_status"]
|
||||||
original_tweet = urlize(original_post["full_text"], nofollow=True)
|
original_tweet = urlize(original_post["full_text"], nofollow=True)
|
||||||
body = f"{body} <br><div>Original tweet: {original_tweet}</div>"
|
body = f"{body} <br><div>Original tweet: {original_tweet}</div>"
|
||||||
if "quoted_status" in post:
|
if "quoted_status" in data:
|
||||||
original_post = post["quoted_status"]
|
original_post = data["quoted_status"]
|
||||||
original_tweet = urlize(original_post["full_text"], nofollow=True)
|
original_tweet = urlize(original_post["full_text"], nofollow=True)
|
||||||
body = f"{body} <br><div>Quoted tweet: {original_tweet}</div>"
|
body = f"{body} <br><div>Quoted tweet: {original_tweet}</div>"
|
||||||
|
|
||||||
body = self.sanitize_fragment(body)
|
body = self.sanitize_fragment(body)
|
||||||
|
|
||||||
data = {
|
return Post(
|
||||||
"remote_identifier": remote_identifier,
|
**{
|
||||||
|
"remote_identifier": data["id_str"],
|
||||||
"title": fix_text(title),
|
"title": fix_text(title),
|
||||||
"body": fix_text(body),
|
"body": fix_text(body),
|
||||||
"author": rule.screen_name,
|
"author": self.stream.rule.screen_name,
|
||||||
"publication_date": publication_date,
|
"publication_date": publication_date,
|
||||||
"url": url,
|
"url": url,
|
||||||
"rule": rule,
|
"rule": self.stream.rule,
|
||||||
}
|
}
|
||||||
|
)
|
||||||
|
|
||||||
results[remote_identifier] = Post(**data)
|
def get_media_entities(self, data):
|
||||||
|
media_entities = data["extended_entities"]["media"]
|
||||||
self.instances = results.values()
|
|
||||||
|
|
||||||
def get_media_entities(self, post):
|
|
||||||
media_entities = post["extended_entities"]["media"]
|
|
||||||
formatted_entities = ""
|
formatted_entities = ""
|
||||||
|
|
||||||
for media_entity in media_entities:
|
for media_entity in media_entities:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue