Concatenate two PySpark dataframes

独厮守ぢ 2020-12-02 16:28

I'm trying to concatenate two PySpark DataFrames, some of whose columns exist only in one of them:

from pyspark.sql.functions import randn, rand

df_1 = sqlContext.range(0, 10)
df_2 = sqlContext.range(11, 20)
df_1 = df_1.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df_2 = df_2.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal_2"))

So df_1 has the columns id, uniform, normal and df_2 has id, uniform, normal_2. How do I append the rows of one to the other (as pandas concat would) while keeping all four columns?
10 Answers
  • 2020-12-02 17:00

    Here is one way to do it, in case it is still useful: I ran this in the pyspark shell with Python 2.7.12 and Spark 2.0.1.

    PS: I guess you meant to use different seeds for df_1 and df_2; the code below reflects that.

    from pyspark.sql.types import FloatType
    from pyspark.sql.functions import randn, rand
    import pyspark.sql.functions as F
    
    df_1 = sqlContext.range(0, 10)
    df_2 = sqlContext.range(11, 20)
    df_1 = df_1.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
    df_2 = df_2.select("id", rand(seed=11).alias("uniform"), randn(seed=28).alias("normal_2"))
    
    def get_uniform(df1_uniform, df2_uniform):
        # return whichever side of the outer join actually has a value
        if df1_uniform is not None:
            return df1_uniform
        if df2_uniform is not None:
            return df2_uniform

    # note: FloatType is 32-bit, which is why "uniform" shows fewer digits in df_3 below
    u_get_uniform = F.udf(get_uniform, FloatType())

    df_3 = df_1.join(df_2, on="id", how="outer") \
        .select("id",
                u_get_uniform(df_1["uniform"], df_2["uniform"]).alias("uniform"),
                "normal", "normal_2") \
        .orderBy(F.col("id"))
    

    Here are the outputs I get:

    df_1.show()
    +---+-------------------+--------------------+
    | id|            uniform|              normal|
    +---+-------------------+--------------------+
    |  0|0.41371264720975787|  0.5888539012978773|
    |  1| 0.7311719281896606|  0.8645537008427937|
    |  2| 0.1982919638208397| 0.06157382353970104|
    |  3|0.12714181165849525|  0.3623040918178586|
    |  4| 0.7604318153406678|-0.49575204523675975|
    |  5|0.12030715258495939|  1.0854146699817222|
    |  6|0.12131363910425985| -0.5284523629183004|
    |  7|0.44292918521277047| -0.4798519469521663|
    |  8| 0.8898784253886249| -0.8820294772950535|
    |  9|0.03650707717266999| -2.1591956435415334|
    +---+-------------------+--------------------+
    
    df_2.show()
    +---+-------------------+--------------------+
    | id|            uniform|            normal_2|
    +---+-------------------+--------------------+
    | 11| 0.1982919638208397| 0.06157382353970104|
    | 12|0.12714181165849525|  0.3623040918178586|
    | 13|0.12030715258495939|  1.0854146699817222|
    | 14|0.12131363910425985| -0.5284523629183004|
    | 15|0.44292918521277047| -0.4798519469521663|
    | 16| 0.8898784253886249| -0.8820294772950535|
    | 17| 0.2731073068483362|-0.15116027592854422|
    | 18| 0.7784518091224375| -0.3785563841011868|
    | 19|0.43776394586845413| 0.47700719174464357|
    +---+-------------------+--------------------+
    
    df_3.show()
    +---+-----------+--------------------+--------------------+                     
    | id|    uniform|              normal|            normal_2|
    +---+-----------+--------------------+--------------------+
    |  0| 0.41371265|  0.5888539012978773|                null|
    |  1|  0.7311719|  0.8645537008427937|                null|
    |  2| 0.19829196| 0.06157382353970104|                null|
    |  3| 0.12714182|  0.3623040918178586|                null|
    |  4|  0.7604318|-0.49575204523675975|                null|
    |  5|0.120307155|  1.0854146699817222|                null|
    |  6| 0.12131364| -0.5284523629183004|                null|
    |  7| 0.44292918| -0.4798519469521663|                null|
    |  8| 0.88987845| -0.8820294772950535|                null|
    |  9|0.036507078| -2.1591956435415334|                null|
    | 11| 0.19829196|                null| 0.06157382353970104|
    | 12| 0.12714182|                null|  0.3623040918178586|
    | 13|0.120307155|                null|  1.0854146699817222|
    | 14| 0.12131364|                null| -0.5284523629183004|
    | 15| 0.44292918|                null| -0.4798519469521663|
    | 16| 0.88987845|                null| -0.8820294772950535|
    | 17| 0.27310732|                null|-0.15116027592854422|
    | 18|  0.7784518|                null| -0.3785563841011868|
    | 19| 0.43776396|                null| 0.47700719174464357|
    +---+-----------+--------------------+--------------------+
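
    For comparison, the same merge of the two uniform columns can be done without a Python UDF; here is a minimal sketch using the built-in coalesce, assuming the same df_1 and df_2 as above:

    # sketch only: coalesce picks the first non-null value, so no UDF
    # (and no FloatType cast) is needed
    df_3_alt = df_1.join(df_2, on="id", how="outer") \
        .select("id",
                F.coalesce(df_1["uniform"], df_2["uniform"]).alias("uniform"),
                "normal", "normal_2") \
        .orderBy(F.col("id"))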
    
  • 2020-12-02 17:04

    The answers above are very elegant. I wrote this function a long time ago, back when I was also struggling to concatenate two DataFrames with distinct columns.

    Suppose you have DataFrames sdf1 and sdf2:

    from pyspark.sql import functions as F
    from pyspark.sql.types import *
    
    def unequal_union_sdf(sdf1, sdf2):
        """Union two DataFrames that only share some of their columns."""
        s_df1_schema = set((x.name, x.dataType) for x in sdf1.schema)
        s_df2_schema = set((x.name, x.dataType) for x in sdf2.schema)

        # add the columns missing from each side as typed null columns
        for i, j in s_df2_schema.difference(s_df1_schema):
            sdf1 = sdf1.withColumn(i, F.lit(None).cast(j))

        for i, j in s_df1_schema.difference(s_df2_schema):
            sdf2 = sdf2.withColumn(i, F.lit(None).cast(j))

        # union positionally, using the same column order on both sides
        common_schema_colnames = sdf1.columns
        return sdf1.select(common_schema_colnames).union(
            sdf2.select(common_schema_colnames))

    sdf_concat = unequal_union_sdf(sdf1, sdf2)
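
    A hypothetical usage example with the DataFrames from the question (assuming the df_1 and df_2 built in the first answer): the helper pads each side's missing column with nulls before the union.

    # hypothetical usage: df_1 has (id, uniform, normal), df_2 has (id, uniform, normal_2)
    df_concat = unequal_union_sdf(df_1, df_2)
    df_concat.printSchema()  # id, uniform, normal, normal_2 (nulls where a column was missing)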
    
  • 2020-12-02 17:11

    Maybe you can try creating the missing columns and calling union (unionAll for Spark 1.6 or lower):

    from pyspark.sql.functions import lit

    cols = ['id', 'uniform', 'normal', 'normal_2']

    # add each missing column as a typed null so both schemas line up
    df_1_new = df_1.withColumn("normal_2", lit(None).cast("double")).select(cols)
    df_2_new = df_2.withColumn("normal", lit(None).cast("double")).select(cols)

    result = df_1_new.union(df_2_new)
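
    On Spark 3.1 and later the manual padding can be skipped entirely; a minimal sketch, assuming a version where unionByName accepts allowMissingColumns:

    # Spark 3.1+: columns present on only one side are filled with nulls automatically
    result = df_1.unionByName(df_2, allowMissingColumns=True)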
    
  • 2020-12-02 17:12

    This should do it for you ...

    from pyspark.sql.functions import lit, coalesce, col

    df_1 = sqlContext.range(0, 6)
    df_2 = sqlContext.range(3, 10)
    df_1 = df_1.select("id", lit("old").alias("source"))
    df_2 = df_2.select("id")

    df_1.show()
    df_2.show()

    # outer join on id, coalesce the two id columns into one, and keep every
    # non-id column from df_1 (null for the ids that only exist in df_2)
    df_3 = df_1.alias("df_1").join(df_2.alias("df_2"), df_1.id == df_2.id, "outer") \
        .select([coalesce(df_1.id, df_2.id).alias("id")] +
                [col("df_1." + c) for c in df_1.columns if c != "id"]) \
        .sort("id")
    df_3.show()
    
  • 2020-12-02 17:13

    I'm a DWH developer turned PySpark developer. Below is what I would do:

        from pyspark.sql import SparkSession

        df_1.createOrReplaceTempView("tab_1")
        df_2.createOrReplaceTempView("tab_2")
        # since no uniform value appears in both tables, each left join finds no match;
        # the union then stacks the rows, padding the missing column with null on each side
        df_concat = spark.sql("""
            select tab_1.id, tab_1.uniform, tab_1.normal, tab_2.normal_2
              from tab_1 tab_1 left join tab_2 tab_2 on tab_1.uniform = tab_2.uniform
            union
            select tab_2.id, tab_2.uniform, tab_1.normal, tab_2.normal_2
              from tab_2 tab_2 left join tab_1 tab_1 on tab_1.uniform = tab_2.uniform""")
        df_concat.show()
    

    Please let me know if this worked for you or met your need.

  • 2020-12-02 17:14

    Maybe you want to concatenate more than two DataFrames. I found an approach that goes through a pandas DataFrame conversion.

    Suppose you have three Spark DataFrames that you want to concatenate.

    The code is the following:

    import pandas as pd

    list_dfs = []
    list_dfs_ = []

    df = spark.read.json('path_to_your_jsonfile.json', multiLine=True)
    df2 = spark.read.json('path_to_your_jsonfile2.json', multiLine=True)
    df3 = spark.read.json('path_to_your_jsonfile3.json', multiLine=True)

    list_dfs.extend([df, df2, df3])

    # convert each Spark DataFrame to pandas; note that toPandas() collects
    # all rows to the driver, so the data has to fit in memory
    for df in list_dfs:
        list_dfs_.append(df.toPandas())

    list_dfs.clear()

    # concatenate with pandas, then convert back to a Spark DataFrame
    df_ = spark.createDataFrame(pd.concat(list_dfs_))
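
    If collecting everything to pandas is not an option, the concatenation can also stay in Spark; a minimal sketch, assuming Spark 3.1+ (for allowMissingColumns) and the same three JSON paths as above:

    from functools import reduce

    # pure-Spark alternative: fold a union over the DataFrames instead of going through pandas
    spark_dfs = [spark.read.json(p, multiLine=True) for p in
                 ['path_to_your_jsonfile.json',
                  'path_to_your_jsonfile2.json',
                  'path_to_your_jsonfile3.json']]
    df_all = reduce(lambda a, b: a.unionByName(b, allowMissingColumns=True), spark_dfs)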
    