I have a Pandas dataframe containing parent ids and child ids. I need help building an updated dataframe listing every descendant of each parent.
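For clarification: "every descendant" means children, grandchildren, and so on down the hierarchy, not only the direct children.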
This should return the parent and child ids in the two columns that you wanted:
import pandas as pd
import numpy as np
import itertools
# Sample hierarchy: each tuple is one (parent_id, child_id) edge, in the same
# row order as the original column-wise definition.
df = pd.DataFrame(
    [
        (3111, 4321),
        (2010, 3102),
        (3000, 4023),
        (1000, 2010),
        (4023, 5321),
        (3011, 4200),
        (3033, 4113),
        (5010, 6525),
        (3011, 4010),
        (3102, 4001),
        (2010, 3011),
        (4023, 5010),
        (2110, 3000),
        (2100, 3033),
        (1000, 2110),
        (5010, 6100),
        (2110, 3111),
        (1000, 2100),
        (5010, 6016),
        (3033, 4311),
    ],
    columns=['parent_id', 'child_id'],
)
def get_child_list(df, parent_id):
    """Return every descendant of ``parent_id`` found in ``df``.

    ``df`` must have ``parent_id`` and ``child_id`` columns, one row per
    parent/child edge.

    The result is a flat list laid out as: the direct children of
    ``parent_id`` (in row order), followed by the descendant list of each of
    those children in turn, built the same way. A node reachable through
    several paths is listed once per path. An id with no children yields an
    empty list.

    Cycles in the data (e.g. A -> B -> A) do not cause unbounded recursion:
    a child that is already an ancestor on the current path is still listed,
    but is not expanded again.
    """
    # Index the edges once instead of re-filtering the frame at every level
    # of the recursion.
    children_of = {}
    for parent, child in zip(df['parent_id'].values, df['child_id'].values):
        children_of.setdefault(parent, []).append(child)

    def _collect(node, ancestors):
        direct = children_of.get(node, [])
        found = list(direct)
        # Mark ``node`` as being on the current path so that no descendant
        # loops back into it.
        ancestors.add(node)
        for child in direct:
            if child not in ancestors:
                found.extend(_collect(child, ancestors))
        ancestors.discard(node)
        return found

    return _collect(parent_id, set())
# Build the result table: one row per (parent, descendant) pair.
#
# Each distinct parent is expanded once. Iterating over every row of ``df``
# would repeat the full descendant list of a parent once for each row it
# appears in.
#
# The rows are collected first and the frame is built in a single call,
# which avoids growing a DataFrame inside a loop (``DataFrame.append`` is
# quadratic and no longer exists in pandas >= 2.0). The resulting frame has
# a plain 0..N-1 index.
descendant_rows = [
    (parent, child)
    for parent in df['parent_id'].unique()
    for child in get_child_list(df, parent)
]
new_df = pd.DataFrame(descendant_rows, columns=['parent_id', 'list_of_children'])
# Parenthesised form is valid on both Python 2 and Python 3.
print(new_df)