I am trying to convert a JSON file to a CSV file that I can use for further analysis. The issue with my structure is that I end up with quite a lot of nested dicts/lists when I convert my JSON file.
IMO accepted answer doesn't properly handle JSON array.
If JSON object has array as value then it should be flattened to array of objects like
{'a': [1, 2]} -> [{'a': 1}, {'a': 2}]
instead of adding index to key.
And nested objects should be flattened by concatenating keys (e.g. with dot as separator) like
{'a': {'b': 1}} -> {'a.b': 1}
(and this is done correctly in accepted one).
With all these requirements I've ended up with following (developed and used in CPython3.5.3):
from functools import (partial,
singledispatch)
from itertools import chain
from typing import (Dict,
List,
TypeVar)
# Constrained type variable listing the Python types accepted as JSON values
# in this snippet: None, int, bool, float, str, dict, list and tuple.
Serializable = TypeVar('Serializable', None, int, bool, float, str,
dict, list, tuple)
# Generic alias for a JSON array: a list of serializable values.
Array = List[Serializable]
# Generic alias for a JSON object: string keys mapped to serializable values.
Object = Dict[str, Serializable]
def flatten(object_: 'Object',
            *,
            path_separator: str = '.') -> 'Array[Object]':
    """
    Flattens given JSON object into list of objects with non-nested values.

    Nested objects are merged into their parent by joining the keys with
    ``path_separator``. Every array is expanded into one output record per
    element, so several arrays in the same object produce the cross product
    of their elements. An empty array yields a single record holding ``None``
    under its key, so the remaining fields of the record are not lost.

    :param object_: JSON object to flatten; the top-level dict is copied
        before processing.
    :param path_separator: string placed between nested key names.
    :return: list of flat records.

    >>> flatten({'a': 1})
    [{'a': 1}]
    >>> flatten({'a': [1, 2]})
    [{'a': 1}, {'a': 2}]
    >>> flatten({'a': {'b': None}})
    [{'a.b': None}]
    >>> flatten({'a': [], 'b': 1}) == [{'a': None, 'b': 1}]
    True
    """
    pending_keys = set(object_)
    records = [dict(object_)]
    while pending_keys:
        key = pending_keys.pop()
        prefix = key + path_separator
        expanded = []
        for record in records:
            # A record that lacks the key yields None and is kept untouched.
            value = record.get(key)
            if isinstance(value, dict):
                del record[key]
                flat = flatten_nested_objects(value,
                                              prefix=prefix,
                                              path_separator=path_separator)
                # Flattened values may still contain arrays: revisit them.
                pending_keys.update(flat)
                expanded.append({**flat, **record})
            elif isinstance(value, list) and value:
                del record[key]
                rows = [flatten_nested_objects(item,
                                               prefix=prefix,
                                               path_separator=path_separator)
                        for item in value]
                pending_keys.update(chain.from_iterable(rows))
                expanded.extend({**row, **record} for row in rows)
            else:
                if isinstance(value, list):
                    # Empty array: expanding it would emit zero rows and
                    # silently drop every other field of this record.
                    record[key] = None
                expanded.append(record)
        records = expanded
    return records
def _strip_separator(prefix: str, path_separator: str) -> str:
    """Returns ``prefix`` without one trailing ``path_separator``, if present."""
    # ``prefix[:-len(path_separator)]`` collapses to '' for an empty separator
    # (``s[:-0]`` is ``s[:0]``) and chops a real character when the prefix
    # does not end with the separator, hence the explicit check.
    if path_separator and prefix.endswith(path_separator):
        return prefix[:-len(path_separator)]
    return prefix


def _flatten_array_item(item, path_separator: str):
    """Flattens objects found inside an array; scalars are returned as is."""
    if isinstance(item, dict):
        return flatten_nested_objects(item, path_separator=path_separator)
    if isinstance(item, list):
        return [_flatten_array_item(sub_item, path_separator)
                for sub_item in item]
    return item


@singledispatch
def flatten_nested_objects(object_: 'Serializable',
                           *,
                           prefix: str = '',
                           path_separator: str) -> 'Object':
    """
    Flattens nested JSON objects into a single-level object.

    Dispatches on the type of ``object_``: this fallback handles scalar
    values, with dedicated implementations registered for ``dict`` and
    ``list``.

    :param object_: value to flatten.
    :param prefix: key path accumulated so far, expected to end with
        ``path_separator`` (empty at the top level).
    :param path_separator: string placed between nested key names.
    :return: object mapping the key (``prefix`` without its trailing
        separator) to ``object_``.
    """
    return {_strip_separator(prefix, path_separator): object_}


@flatten_nested_objects.register(dict)
def _(object_: 'Object',
      *,
      prefix: str = '',
      path_separator: str) -> 'Object':
    """Merges all nested objects into one, joining keys with the separator."""
    # Build a fresh dict instead of popping from / updating a copy while
    # walking it, so original and flattened keys never share a namespace
    # mid-way through the loop.
    result = {}
    for key, value in object_.items():
        result.update(flatten_nested_objects(
            value,
            prefix=prefix + key + path_separator,
            path_separator=path_separator))
    return result


@flatten_nested_objects.register(list)
def _(object_: 'Array',
      *,
      prefix: str = '',
      path_separator: str) -> 'Object':
    """Keeps the array under its key, flattening any objects it contains."""
    # Scalar items must stay scalars: wrapping them as ``{'': item}`` later
    # produces keys with a dangling separator (e.g. 'a.b.').
    return {_strip_separator(prefix, path_separator):
            [_flatten_array_item(item, path_separator) for item in object_]}
I use this simple function to normalize and flatten data to json. It accepts list, dict, tuple and flattens it to a json.
def normalize_data_to_json(raw_data: [list, dict, tuple], parent=""):
    """
    Flatten nested data into a single-level dict of JSON-friendly values.

    Key paths are joined with a double underscore; every key is lower-cased
    and its spaces are replaced with underscores, so keys that differ only
    by case or spacing overwrite each other. List/tuple items are keyed by
    their 1-based position; at the top level that position is stored as an
    ``int`` key.

    :param raw_data: value to normalise (dict, list, tuple or a scalar).
    :param parent: key path accumulated so far; callers normally omit it.
    :return: flat dict. ``str``/``int``/``float``/``bool``/``None`` values
        are kept as is, ``Decimal`` becomes ``float``, ``datetime`` becomes a
        ``"%Y-%m-%d %H:%M:%S"`` string, anything else is passed to ``str()``.
    """
    from datetime import datetime
    from decimal import Decimal

    # key name normalise to snake case (single underscore)
    if isinstance(parent, str):
        parent = parent.lower().replace(" ", "_")
        if parent.startswith("__"):
            # if parent has no parent remove double underscore and treat as int if digit else as str
            # treating as int is better if passed data is a list so the output is an index based dict
            stripped = parent.lstrip("_")
            parent = int(stripped) if stripped.isdigit() else stripped

    result = {}
    # handle JSON-native scalars; exact type checks are kept on purpose so
    # that subclasses still go through the str() fallback as before.
    # bool and None are listed explicitly: they are valid JSON values and
    # must not be stringified into 'True' / 'None'.
    if raw_data is None or type(raw_data) in (str, int, float, bool):
        result[parent] = raw_data
    elif type(raw_data) is Decimal:
        result[parent] = float(raw_data)
    # normalise datetime object
    elif isinstance(raw_data, datetime):
        result[parent] = raw_data.strftime("%Y-%m-%d %H:%M:%S")
    # normalise dict and all nested dicts.
    # all nests are joined with double underscore to identify parent key name with its children
    elif isinstance(raw_data, dict):
        for k, v in raw_data.items():
            k = f'{parent}__{k}' if parent else k
            result.update(normalize_data_to_json(v, parent=k))
    # normalise list and tuple
    elif type(raw_data) in (list, tuple):
        for i, sub_item in enumerate(raw_data, start=1):
            result.update(normalize_data_to_json(sub_item, f"{parent}__{i}"))
    # any data which did not match the above data types is normalised using its __str__
    else:
        result[parent] = str(raw_data)
    return result
Outputting in jsonpath format:
def convert(f):
    """
    Flatten nested JSON data into a dict keyed by jsonpath-like strings.

    Object keys are joined with ``.`` and list positions are appended as
    ``[i]`` (zero-based), e.g. ``{'a': {'b': [1]}}`` -> ``{'a.b[0]': 1}``.
    ``None`` leaves are replaced by an empty string; all other values,
    including ``0`` and ``False``, are kept unchanged. Empty objects and
    empty lists contribute no key.

    :param f: parsed JSON data (dict or list).
    :return: flat dict mapping each path to its leaf value.
    """
    out = {}

    def flatten(x, name=None):
        if isinstance(x, dict):
            for a in x:
                flatten(x[a], f'{name}.{a}' if name else a)
        elif isinstance(x, list):
            # ``name`` is None for a top-level list; concatenating it with
            # the index suffix would raise TypeError.
            base = '' if name is None else name
            for i, a in enumerate(x):
                flatten(a, f'{base}[{i}]')
        else:
            # Only blank out missing values: a plain truthiness test would
            # also erase legitimate 0 / False / 0.0 data.
            out[name] = '' if x is None else x

    flatten(f)
    return out
Just pass your dictionary here:
def getKeyValuePair(dic, master_dic=None, master_key=None):
    """
    Flatten a nested dictionary into a single-level dictionary.

    Nested keys are joined with ``_`` using the full path from the root,
    e.g. ``{'a': {'b': {'c': 1}}}`` -> ``{'a_b_c': 1}``, so equally named
    leaves under different parents do not overwrite each other.

    :param dic: dictionary to flatten.
    :param master_dic: optional dictionary to collect results into; a new
        one is created for each call when omitted.
    :param master_key: key path of ``dic`` within the root dictionary
        (``None`` for the root itself).
    :return: ``master_dic`` filled with the flattened key/value pairs.
    """
    # A ``{}`` default would be created once and shared by every call,
    # leaking keys from earlier calls into later results.
    if master_dic is None:
        master_dic = {}
    # Snapshot the items so the loop stays safe even if the caller passes
    # the same dict as both ``dic`` and ``master_dic``.
    for key, value in list(dic.items()):
        path = key if master_key is None else str(master_key) + '_' + str(key)
        if isinstance(value, dict):
            getKeyValuePair(value, master_dic=master_dic, master_key=path)
        else:
            master_dic[path] = value
    return master_dic
Cross-posting (but then adapting further) from https://stackoverflow.com/a/62186053/4355695 : In this repo: https://github.com/ScriptSmith/socialreaper/blob/master/socialreaper/tools.py#L8 , I found an implementation of the list-inclusion comment by @roneo to the answer posted by @Imran.
I've added checks to it for catching empty lists and empty dicts, and also added print lines that help one understand precisely how this function works. You can turn off those print statements by setting crumbs=False
import collections
# Set to False to silence the step-by-step trace printed by flatten().
crumbs = True


def flatten(dictionary, parent_key=False, separator='.'):
    """
    Turn a nested dictionary into a flattened dictionary

    Lists are expanded with their zero-based index as a key component;
    empty lists and empty dictionaries are kept with the value ``None``.
    When the module-level ``crumbs`` flag is true, every step is printed.

    :param dictionary: The dictionary to flatten
    :param parent_key: The string to prepend to dictionary's keys
    :param separator: The string used to separate flattened keys
    :return: A flattened dictionary
    """
    # ``collections.MutableMapping`` was removed in Python 3.10; the ABC
    # lives in ``collections.abc``. Imported locally so the function does
    # not rely on that submodule having been loaded by someone else.
    from collections.abc import MutableMapping

    items = []
    for key, value in dictionary.items():
        if crumbs: print('checking:',key)
        # str(key) keeps non-string keys from breaking the concatenation.
        new_key = str(parent_key) + separator + str(key) if parent_key else key
        if isinstance(value, MutableMapping):
            if crumbs: print(new_key,': dict found')
            if not value:
                if crumbs: print('Adding key-value pair:',new_key,None)
                items.append((new_key,None))
            else:
                items.extend(flatten(value, new_key, separator).items())
        elif isinstance(value, list):
            if crumbs: print(new_key,': list found')
            if value:
                for k, v in enumerate(value):
                    # Forward the separator: without it list indices fall
                    # back to '.' regardless of what the caller asked for.
                    items.extend(flatten({str(k): v}, new_key, separator).items())
            else:
                if crumbs: print('Adding key-value pair:',new_key,None)
                items.append((new_key,None))
        else:
            if crumbs: print('Adding key-value pair:',new_key,value)
            items.append((new_key, value))
    return dict(items)
Test it:
# Demo: nested dicts, a populated list, plus an empty list and an empty dict.
ans = flatten({
    'a': 1,
    'c': {
        'a': 2,
        'b': {'x': 5, 'y': 10},
    },
    'd': [1, 2, 3],
    'e': {'f': [], 'g': {}},
})
print('\nflattened:', ans)
Output:
checking: a
Adding key-value pair: a 1
checking: c
c : dict found
checking: a
Adding key-value pair: c.a 2
checking: b
c.b : dict found
checking: x
Adding key-value pair: c.b.x 5
checking: y
Adding key-value pair: c.b.y 10
checking: d
d : list found
checking: 0
Adding key-value pair: d.0 1
checking: 1
Adding key-value pair: d.1 2
checking: 2
Adding key-value pair: d.2 3
checking: e
e : dict found
checking: f
e.f : list found
Adding key-value pair: e.f None
checking: g
e.g : dict found
Adding key-value pair: e.g None
flattened: {'a': 1, 'c.a': 2, 'c.b.x': 5, 'c.b.y': 10, 'd.0': 1, 'd.1': 2, 'd.2': 3, 'e.f': None, 'e.g': None}
And that does the job I need done: I throw any complicated JSON at this and it flattens it out for me. I added a check to the original code to handle empty lists too.
Credits to https://github.com/ScriptSmith, in whose repo I found the initial flatten function.
Testing OP's sample json, here's the output:
{'count': 13,
'virtualmachine.0.id': '1082e2ed-ff66-40b1-a41b-26061afd4a0b',
'virtualmachine.0.name': 'test-2',
'virtualmachine.0.displayname': 'test-2',
'virtualmachine.0.securitygroup.0.id': '9e649fbc-3e64-4395-9629-5e1215b34e58',
'virtualmachine.0.securitygroup.0.name': 'test',
'virtualmachine.0.securitygroup.0.tags': None,
'virtualmachine.0.nic.0.id': '79568b14-b377-4d4f-b024-87dc22492b8e',
'virtualmachine.0.nic.0.networkid': '05c0e278-7ab4-4a6d-aa9c-3158620b6471',
'virtualmachine.0.nic.1.id': '3d7f2818-1f19-46e7-aa98-956526c5b1ad',
'virtualmachine.0.nic.1.networkid': 'b4648cfd-0795-43fc-9e50-6ee9ddefc5bd',
'virtualmachine.0.nic.1.traffictype': 'Guest',
'virtualmachine.0.hypervisor': 'KVM',
'virtualmachine.0.affinitygroup': None,
'virtualmachine.0.isdynamicallyscalable': False}
So you'll see that 'tags' and 'affinitygroup' keys are also handled and added to output. Original code was omitting them.
Thanks to gyx-hh, this has been resolved:
I used following function (details can be found here):
def flatten_json(y):
    """
    Flatten nested JSON data into a single-level dict.

    Key components are joined with ``_``; list items contribute their
    zero-based index, e.g. ``{'a': [{'b': 1}]}`` -> ``{'a_0_b': 1}``.
    Empty objects and empty lists contribute no key.

    :param y: parsed JSON data (dict, list or scalar).
    :return: flat dict mapping each joined key path to its leaf value.
    """
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for a in x:
                # str() so that non-string keys do not raise TypeError.
                flatten(x[a], name + str(a) + '_')
        elif isinstance(x, list):
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            # Drop the trailing '_' appended by the caller.
            out[name[:-1]] = x

    flatten(y)
    return out
This unfortunately completely flattens whole JSON, meaning that if you have multi-level JSON (many nested dictionaries), it might flatten everything into single line with tons of columns.
What I used in the end was json_normalize()
and specified structure that I required. Nice example of how to do it that way can be found here.
Hopefully this helps someone, and again thanks to gyx-hh for the solution.
Best regards