I have a dictionary my_dict_of_df which holds a variable number of dataframes each time my program runs. I want to create a new dataframe that is the union of all of them.
I consulted the solution given here; thanks to @pault.
from functools import reduce
from pyspark.sql import DataFrame
def union_all(*dfs):
    # reduce applies DataFrame.union pairwise, folding all dataframes into one
    return reduce(DataFrame.union, dfs)
df1 = sqlContext.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))
df2 = sqlContext.createDataFrame([(3, "foo2"), (4, "bar2")], ("k", "v"))
df3 = sqlContext.createDataFrame([(5, "foo3"), (6, "bar3")], ("k", "v"))
my_dic = {}
my_dic["df1"] = df1
my_dic["df2"] = df2
my_dic["df3"] = df3
new_df = union_all(*my_dic.values())
print(type(new_df))  # <class 'pyspark.sql.dataframe.DataFrame'>
new_df.show()  # show() prints the table itself and returns None, so no print() needed
"""
+---+----+
| k| v|
+---+----+
| 1|foo1|
| 2|bar1|
| 3|foo2|
| 4|bar2|
| 5|foo3|
| 6|bar3|
+---+----+
"""
Edit: using DataFrame.union instead of DataFrame.unionAll, since the latter is deprecated.
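Also note that DataFrame.union resolves columns by position, not by name. If the dataframes in the dictionary might carry the same columns in a different order, a variant built on DataFrame.unionByName (available since Spark 2.3) is safer; a minimal sketch under that assumption:

def union_all_by_name(*dfs):
    # unionByName matches columns by name rather than position
    return reduce(DataFrame.unionByName, dfs)

new_df = union_all_by_name(*my_dic.values())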