I have a Pandas dataframe containing parent ids and child ids. I need help building an updated dataframe listing every descendant of each parent.
For clarificati
As long as your IDs never form a cycle (that is, no ID is its own ancestor), I think this should work:
def get_children(id, frame=None):
    """Return every descendant of ``id`` in depth-first (pre-order) order.

    Args:
        id: The parent id whose descendants are wanted. The name shadows
            the ``id`` builtin; it is kept so existing callers that pass it
            by keyword keep working.
        frame: DataFrame with ``parent_id`` and ``child_id`` columns. When
            omitted, the module-level ``df`` is used, so the one-argument
            form ``df["parent_id"].apply(get_children)`` still works.

    Returns:
        A list of descendant ids in which each child is immediately
        followed by its own descendants. An id with no children yields an
        empty list. If the data contains a cycle, an id that closes the
        cycle is still listed but is not descended into again, so the
        result is always finite.
    """
    # Fall back to the global frame only when the caller did not supply one.
    table = df if frame is None else frame

    # Build the parent -> children map once, so each visited node costs a
    # dict lookup instead of a boolean-mask scan over the whole frame.
    children_of = {}
    for parent, child in zip(table["parent_id"], table["child_id"]):
        children_of.setdefault(parent, []).append(child)

    list_of_children = []
    # Ids on the current root-to-node path; seeing one again means a cycle.
    on_path = {id}

    def dfs(node):
        for child_id in children_of.get(node, ()):
            list_of_children.append(child_id)
            if child_id in on_path:
                # Descending here would loop forever; record it and move on.
                continue
            on_path.add(child_id)
            dfs(child_id)
            on_path.remove(child_id)

    dfs(id)
    return list_of_children
# For every row, look up the full descendant list of that row's parent id.
descendants_per_row = df["parent_id"].apply(get_children)
# Store the lists alongside the original parent/child columns.
df["list_of_children"] = descendants_per_row
# Bare expression: shows the frame when run in a notebook or REPL.
df
Returns:
parent_id child_id list_of_children
0 3111 4321 [4321]
1 2010 3102 [3102, 4001, 3011, 4200, 4010]
2 3000 4023 [4023, 5321, 5010, 6525, 6100, 6016]
3 1000 2010 [2010, 3102, 4001, 3011, 4200, 4010, 2110, 3000, 4023, 5321, 5010, 6525, 610...
4 4023 5321 [5321, 5010, 6525, 6100, 6016]
5 3011 4200 [4200, 4010]
6 3033 4113 [4113, 4311]
7 5010 6525 [6525, 6100, 6016]
8 3011 4010 [4200, 4010]
9 3102 4001 [4001]
10 2010 3011 [3102, 4001, 3011, 4200, 4010]
11 4023 5010 [5321, 5010, 6525, 6100, 6016]
12 2110 3000 [3000, 4023, 5321, 5010, 6525, 6100, 6016, 3111, 4321]
13 2100 3033 [3033, 4113, 4311]
14 1000 2110 [2010, 3102, 4001, 3011, 4200, 4010, 2110, 3000, 4023, 5321, 5010, 6525, 610...
15 5010 6100 [6525, 6100, 6016]
16 2110 3111 [3000, 4023, 5321, 5010, 6525, 6100, 6016, 3111, 4321]
17 1000 2100 [2010, 3102, 4001, 3011, 4200, 4010, 2110, 3000, 4023, 5321, 5010, 6525, 610...
18 5010 6016 [6525, 6100, 6016]
19 3033 4311 [4113, 4311]
One caveat: when the function is called through `apply` as shown above, the dataframe is not passed in as an argument, so the inner `dfs` function relies on a global variable named `df`. The code therefore only works if your dataframe has exactly that name. You could improve it by passing the dataframe in explicitly, so that the function no longer depends on a global named `df` existing.
This should return the parent and child ids in the two columns that you wanted:
import pandas as pd
import numpy as np
import itertools
# Sample hierarchy: one (parent_id, child_id) edge per row.
df = pd.DataFrame(
    [
        (3111, 4321),
        (2010, 3102),
        (3000, 4023),
        (1000, 2010),
        (4023, 5321),
        (3011, 4200),
        (3033, 4113),
        (5010, 6525),
        (3011, 4010),
        (3102, 4001),
        (2010, 3011),
        (4023, 5010),
        (2110, 3000),
        (2100, 3033),
        (1000, 2110),
        (5010, 6100),
        (2110, 3111),
        (1000, 2100),
        (5010, 6016),
        (3033, 4311),
    ],
    columns=['parent_id', 'child_id'],
)
def get_child_list(df, parent_id):
    """Return all descendants of ``parent_id`` found in ``df``.

    Args:
        df: DataFrame with ``parent_id`` and ``child_id`` columns, one
            parent/child edge per row.
        parent_id: The id whose descendants are collected.

    Returns:
        A flat list: first the direct children of ``parent_id`` in row
        order, then, for each of those children in turn, that child's own
        descendant list built the same way. An id with no children yields
        an empty list. A child that is already on the current ancestor path
        (a self-loop or a longer cycle) is listed but not expanded again,
        so the recursion always terminates.
    """
    # One pass over the edges instead of re-filtering the frame (twice) at
    # every level. ``.values`` keeps the same scalar element types that the
    # previous ``.values``-based implementation returned.
    children_of = {}
    for parent, child in zip(df['parent_id'].values, df['child_id'].values):
        children_of.setdefault(parent, []).append(child)

    def _collect(node, on_path):
        # Direct children first, then each child's subtree, in row order.
        direct = children_of.get(node, [])
        collected = list(direct)
        for child in direct:
            if child in on_path:
                # Generalizes the old ``child_id != parent_id`` self-loop
                # check to cycles of any length.
                continue
            collected.extend(_collect(child, on_path | {child}))
        return collected

    return _collect(parent_id, frozenset([parent_id]))
# Build a long-format table with one row per (parent_id, descendant) pair.
# Iterate over the distinct parent ids (in order of first appearance) so a
# parent that occurs on several rows of ``df`` is expanded only once rather
# than having its whole descendant listing repeated for every occurrence.
frames = []
for parent_id in df['parent_id'].unique():
    descendants = get_child_list(df, parent_id)
    # The scalar parent_id is broadcast against the list of descendants.
    frames.append(pd.DataFrame({
        'parent_id': parent_id,
        'list_of_children': descendants,
    }))

# Concatenate once at the end: ``DataFrame.append`` was removed in pandas 2.0
# and re-copies the accumulated frame on every iteration. ``pd.concat`` raises
# on an empty list, so fall back to an empty frame with the same columns.
if frames:
    new_df = pd.concat(frames)
else:
    new_df = pd.DataFrame(columns=['parent_id', 'list_of_children'])

# Function-call form of print works on both Python 2 and Python 3.
print(new_df)