I have a JSON file I want to convert to a CSV file. How can I do this with Python?
I tried:
import json
import c
Try this
import csv, json, sys
# Convert the JSON file named on the command line (expected to hold a list of
# flat objects) into CSV on standard output.
with open(sys.argv[1], encoding="utf-8") as json_file:
    data = json.load(json_file)

output = csv.writer(sys.stdout)
if data:  # an empty list produces no output instead of an IndexError
    # Union of every object's keys in first-seen order, so objects that omit
    # a key or list keys in a different order still line up with the header.
    fieldnames = list(dict.fromkeys(key for item in data for key in item))
    output.writerow(fieldnames)  # header row
    for item in data:
        # Missing keys become empty cells rather than shifting the columns.
        output.writerow([item.get(key, "") for key in fieldnames])
Modified Alec McGail's answer to support JSON with lists inside
def flattenjson(self, mp, delim="|"):
    """Flatten a decoded JSON value into a list of delimiter-joined paths.

    Each dict key on the way to a leaf is prefixed to that leaf, separated
    by ``delim``; lists contribute the flattened entries of every element
    without adding a path component of their own.

    Parameters
    ----------
    mp : dict, list or scalar
        The decoded JSON value to flatten.
    delim : str, optional
        Separator placed between path components (default ``"|"``).

    Returns
    -------
    list
        One entry per leaf. Leaves reached through at least one dict key are
        returned as strings of the form ``key<delim>...<delim>leaf``; a leaf
        that is not under any dict is returned unchanged. Empty dicts and
        empty lists contribute no entries.
    """
    if isinstance(mp, dict):
        # Format rather than concatenate so that non-string leaves (numbers,
        # booleans, None) and non-string keys do not raise TypeError.
        return [
            f"{key}{delim}{path}"
            for key, value in mp.items()
            for path in self.flattenjson(value, delim)
        ]
    if isinstance(mp, list):
        return [path for item in mp for path in self.flattenjson(item, delim)]
    return [mp]
Thanks!
I might be late to the party, but I think I have dealt with a similar problem. I had a JSON file which looked like this
I only wanted to extract a few keys/values from these JSON files, so I wrote the following code to do that.
"""json_to_csv.py
This script reads all the JSON files present in a folder, extracts certain data from each file, and writes it to a CSV file.
The folder contains this Python script (json_to_csv.py), output.csv, and a sub-folder named descriptions that holds all the JSON files.
"""
import os
import json
import csv
def get_list_of_json_files(directory='descriptions'):
    """Return the names of all entries found in ``directory``.

    No filtering by extension is applied: every entry that ``os.listdir``
    reports is returned, so the folder is expected to contain only the JSON
    files to convert.

    Parameters
    ----------
    directory : str, optional
        Folder to list. Defaults to ``'descriptions'``.

    Returns
    -------
    list_of_files : list
        Bare entry names (without the directory prefix), sorted so that the
        order is reproducible across runs and platforms.
    """
    # os.listdir returns entries in arbitrary order; sort for stable output.
    list_of_files = sorted(os.listdir(directory))
    return list_of_files
def create_list_from_json(jsonfile):
    """Return the values extracted from one JSON file, in CSV column order.

    Parameters
    ----------
    jsonfile : str
        Path of the JSON file to read. The file is decoded as UTF-8.

    Returns
    -------
    data_list : list
        Twenty values: the eighteen required fields listed below, then
        ``meta.unstructured.race`` (an empty string when that key is
        absent), then the top-level ``name``.

    Raises
    ------
    KeyError
        If any field other than ``race`` is missing from the file.
    """
    # Key paths of the required fields, in the order the CSV columns need.
    required_paths = (
        ('_id',),
        ('_modelType',),
        ('creator', '_id'),
        ('creator', 'name'),
        ('dataset', '_accessLevel'),
        ('dataset', '_id'),
        ('dataset', 'description'),
        ('dataset', 'name'),
        ('meta', 'acquisition', 'image_type'),
        ('meta', 'acquisition', 'pixelsX'),
        ('meta', 'acquisition', 'pixelsY'),
        ('meta', 'clinical', 'age_approx'),
        ('meta', 'clinical', 'benign_malignant'),
        ('meta', 'clinical', 'diagnosis'),
        ('meta', 'clinical', 'diagnosis_confirm_type'),
        ('meta', 'clinical', 'melanocytic'),
        ('meta', 'clinical', 'sex'),
        ('meta', 'unstructured', 'diagnosis'),
    )

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(jsonfile, encoding='utf-8') as f:
        data = json.load(f)

    data_list = []
    for path in required_paths:
        value = data
        for key in path:
            value = value[key]
        data_list.append(value)

    # A few files have no race entry; keep the column with an empty string.
    data_list.append(data['meta']['unstructured'].get('race', ''))
    data_list.append(data['name'])
    return data_list
def write_csv():
    """Append one row per JSON file in ``descriptions`` to ``output.csv``.

    The file names come from ``get_list_of_json_files()`` and each row is
    built by ``create_list_from_json()``. ``output.csv`` is opened in append
    mode, so any content it already holds (for example a header row) is
    kept and the new rows are added after it.

    Returns
    -------
    None
        The result is the side effect on ``output.csv``.
    """
    list_of_files = get_list_of_json_files()
    if not list_of_files:
        return  # nothing to append, so do not create or touch output.csv

    # Open the file once for all rows. newline='' is required by the csv
    # module so that it controls line endings itself (otherwise blank lines
    # appear between rows on Windows).
    with open('output.csv', 'a', newline='') as c:
        writer = csv.writer(c)
        for file in list_of_files:
            # One CSV row per JSON file.
            writer.writerow(create_list_from_json(f'descriptions/{file}'))
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    write_csv()
I hope this helps. For details on how this code works, you can check here
Surprisingly, I found that none of the answers posted here so far correctly deal with all possible scenarios (e.g., nested dicts, nested lists, None values, etc).
This solution should work across all scenarios:
def flatten_json(json):
    """Collapse a nested dict/list structure into a single-level dict.

    Every leaf value is stored under a key made of the path that leads to
    it: dict keys and list indices (as decimal strings) joined with ``__``.
    Empty dicts and empty lists have no leaves and therefore add no entry.

    Note that the parameter name hides the ``json`` module inside this
    function.

    Parameters
    ----------
    json : dict
        The decoded top-level JSON object.

    Returns
    -------
    dict
        Mapping of joined path to leaf value, in depth-first encounter order.
    """
    def walk(path, node):
        # Yield one (joined_path, leaf) pair per leaf below ``node``.
        if isinstance(node, dict):
            for name, child in node.items():
                yield from walk(path + (name,), child)
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from walk(path + (str(position),), child)
        else:
            yield '__'.join(path), node

    result = {}
    for top_key in json.keys():
        result.update(walk((top_key,), json[top_key]))
    return result
JSON can represent a wide variety of data structures -- a JS "object" is roughly like a Python dict (with string keys), a JS "array" roughly like a Python list, and you can nest them as long as the final "leaf" elements are numbers or strings.
CSV can essentially represent only a 2-D table -- optionally with a first row of "headers", i.e., "column names", which can make the table interpretable as a list of dicts, instead of the normal interpretation, a list of lists (again, "leaf" elements can be numbers or strings).
So, in the general case, you can't translate an arbitrary JSON structure to a CSV. In a few special cases you can (array of arrays with no further nesting; arrays of objects which all have exactly the same keys). Which special case, if any, applies to your problem? The details of the solution depend on which special case you do have. Given the astonishing fact that you don't even mention which one applies, I suspect you may not have considered the constraint, neither usable case in fact applies, and your problem is impossible to solve. But please do clarify!
As mentioned in the previous answers, the difficulty in converting json to csv is that a json file can contain nested dictionaries and therefore be a multidimensional data structure, versus a csv, which is a 2D data structure. However, a good way to turn a multidimensional structure into csv is to have multiple csvs that tie together with primary keys.
In your example, the first csv output has "pk", "model" and "fields" as its columns. Values for "pk" and "model" are easy to get, but because the "fields" column contains a dictionary, it should be its own csv; and because "codename" appears to be the primary key, you can use it as the value for "fields" to complete the first csv. The second csv contains the dictionary from the "fields" column, with codename as the primary key that can be used to tie the 2 csvs together.
Here is a solution for your json file which converts nested dictionaries to 2 csvs.
import csv
import json
def readAndWrite(inputFileName, primaryKey=""):
    """Convert ``<inputFileName>.json`` (a list of objects) to CSV.

    With the default ``primaryKey=""`` the top-level keys of every object
    become the columns of ``<inputFileName>.csv``. When the input name is
    exactly ``"data"``, a dict-valued field is replaced by its ``"codename"``
    entry and a second file, ``data-codename.csv``, is produced from the
    ``"fields"`` dict of each object; the two files are linked by that
    codename. For any other input name dict-valued fields are left empty.

    With ``primaryKey="codename"`` the output is
    ``<inputFileName>-codename.csv`` and its rows come from each object's
    ``"fields"`` dict (dict values inside it are left empty). Columns are
    only collected when the input name is ``"data"``. Any other non-empty
    ``primaryKey`` writes a header line and no rows.

    Parameters
    ----------
    inputFileName : str
        Path of the input file without the ``.json`` extension.
    primaryKey : str, optional
        ``""`` for the main table, ``"codename"`` for the nested table.

    Returns
    -------
    None
        The result is the CSV file(s) written next to the input.
    """
    # Context manager closes the file even if decoding fails; JSON is UTF-8.
    with open(inputFileName + ".json", encoding="utf-8") as json_file:
        data = json.load(json_file)

    is_data_file = inputFileName == "data"

    # A dict serves as an insertion-ordered set, so the column order follows
    # the first appearance of each key instead of varying between runs.
    header = {}
    if primaryKey != "":
        outputFileName = inputFileName + "-" + primaryKey
        if is_data_file:
            for record in data:
                header.update(dict.fromkeys(record["fields"]))
    else:
        outputFileName = inputFileName
        for record in data:
            header.update(dict.fromkeys(record))

    needs_nested_csv = False
    # Text mode with newline='' is what the csv module expects on Python 3;
    # the writer stringifies numbers/lists itself and writes None as ''.
    with open(outputFileName + ".csv", "w", newline="", encoding="utf-8") as output_file:
        writer = csv.DictWriter(output_file, list(header), delimiter=',', quotechar='"')
        writer.writeheader()
        for record in data:
            row_value = {}
            if primaryKey == "":
                for key, value in record.items():
                    if not isinstance(value, dict):
                        row_value[key] = value
                    elif is_data_file:
                        # Reference the nested row by its codename.
                        row_value[key] = value["codename"]
                        needs_nested_csv = True
            elif primaryKey == "codename":
                for key, value in record["fields"].items():
                    if not isinstance(value, dict):
                        row_value[key] = value
            else:
                continue  # unknown primary key: header only, no rows
            writer.writerow(row_value)

    # Write the nested table once, rather than once per row.
    if needs_nested_csv:
        readAndWrite(inputFileName, primaryKey="codename")
# Convert data.json from the current working directory. There is no
# __main__ guard, so this also runs when the module is imported.
readAndWrite("data")