I am trying to convert Twitter JSON data to CSV. I can extract the top-level attributes without any problem, but I am not able to parse the attributes that are nested. The nested JSON looks like this:
"entities":{
"hashtags":[
],
"urls":[
{
"url":"https:\/\/t.co\/ieON9yclmy",
"expanded_url":"http:\/\/www.dailymail.co.uk\/news\/article-4044728\/Theresa-wants-use-army-computerised-Trump-mind-readers-help-win-Election.html#ixzz5AE6Hx3VW",
"display_url":"dailymail.co.uk\/news\/article-4\u2026",
"indices":[
39,
62
]
}
],
"user_mentions":[
{
"screen_name":"neonbubble",
"name":"Mark H",
"id":10934622,
"id_str":"10934622",
"indices":[
0,
11
]
}
]
}
Right now my Python code looks like this:
from operator import itemgetter
from StringIO import StringIO
import csv
import json
import sys
# Switch the interpreter-wide default string encoding to UTF-8.
# NOTE(review): the `reload` builtin and `sys.setdefaultencoding` are
# Python 2-only; presumably this is here so that non-ASCII tweet text can be
# written through the csv module -- confirm before porting to Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
def get_leaves(item, key=None):
    """Flatten a nested JSON-like structure into a list of (key, value) leaves.

    Dictionaries are descended into using each entry's own key; lists are
    descended into with every element inheriting the key of the list itself.
    Anything else is treated as a leaf.

    Args:
        item: A dict, list or scalar value (typically the result of
            ``json.loads``).
        key: The key under which ``item`` was found; used to label leaves
            that are scalars or elements of a list.

    Returns:
        A list of ``(key, value)`` tuples, one per scalar leaf, in traversal
        order. Empty dicts and empty lists contribute no leaves.
    """
    if isinstance(item, dict):
        leaves = []
        for child_key, child in item.items():
            leaves.extend(get_leaves(child, child_key))
        return leaves

    if isinstance(item, list):
        leaves = []
        for child in item:
            leaves.extend(get_leaves(child, key))
        # Return the collected leaves, not the function object itself.
        return leaves

    return [(key, item)]
# Top-level tweet attributes exported as CSV columns, in output order.
header = ['created_at', 'id', 'id_str', 'in_reply_to_status_id', 'in_reply_to_user_id', 'text', 'source', 'truncated', 'in_reply_to_status_id_str', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'entities']
required_cols = itemgetter(*header)


def _to_cell(column, value):
    """Return ``value`` as a single CSV cell.

    Scalars are returned unchanged. Nested dicts/lists are flattened with
    ``get_leaves`` and rendered as ``key=value`` pairs joined by ``'; '``.
    """
    if not isinstance(value, (dict, list)):
        return value
    # Sort on the key only so that leaves whose values have different types
    # (numbers, strings, None) never have to be compared with each other.
    leaves = sorted(get_leaves(value, column), key=itemgetter(0))
    return '; '.join('%s=%s' % (leaf_key, leaf_value)
                     for leaf_key, leaf_value in leaves)


# The input is read as one JSON document per line; blank lines are skipped.
with open('twitter.json') as f_input, open('output.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(header)

    for entry in f_input:
        if not entry.strip():
            continue
        # Parse the line first; only the decoded object can be flattened.
        record = json.loads(entry)
        csv_output.writerow(
            [_to_cell(column, value)
             for column, value in zip(header, required_cols(record))]
        )
How can I parse these nested attributes and write them to the CSV in Python?