How to pivot Spark DataFrame?

闹比i · 2020-11-21 06:43

I am starting to use Spark DataFrames and I need to be able to pivot the data to create multiple columns out of one column with multiple rows. There is built-in functionality …
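Since Spark 1.6 the DataFrame API also offers a built-in pivot on grouped data. A minimal sketch, assuming a DataFrame df with the id/tag/value columns used in the answer below:

    import org.apache.spark.sql.functions.first

    // groupBy + pivot: one output column per distinct tag, filled by an
    // aggregate -- here the first value seen for each (id, tag) pair.
    val pivoted = df.groupBy("id").pivot("tag").agg(first("value"))
    pivoted.show()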

10 Answers
  •  心在旅途
    2020-11-21 07:15

    Initially I adopted Al M's solution. Later I took the same idea and rewrote the function as a generic transpose.

    This method transposes the rows of any DataFrame into columns, whatever the data types, using a key column and a value column.

    For the input CSV:

    id,tag,value
    1,US,50a
    1,UK,100
    1,Can,125
    2,US,75
    2,UK,150
    2,Can,175
    

    output:

    +--+---+---+---+
    |id| UK| US|Can|
    +--+---+---+---+
    | 2|150| 75|175|
    | 1|100|50a|125|
    +--+---+---+---+
    

    The transpose method:

    def transpose(hc: HiveContext, df: DataFrame, compositeId: List[String], key: String, value: String) = {

      // Every distinct key becomes a column in the output.
      val distinctCols = df.select(key).distinct.map { r => r(0) }.collect().toList

      // Pair each row's composite id with a one-entry map of key -> value.
      val rdd = df.map { row =>
        (compositeId.map { id => row.getAs[Any](id) },
          scala.collection.mutable.Map(row.getAs[Any](key) -> row.getAs[Any](value)))
      }

      // Merge the per-row maps, so each composite id ends up with all of its key/value pairs.
      val pairRdd = rdd.reduceByKey(_ ++ _)

      // Lay each merged map out as a Row in distinctCols order.
      val rowRdd = pairRdd.map(r => dynamicRow(r, distinctCols))
      hc.createDataFrame(rowRdd, getSchema(df.schema, compositeId, (key, distinctCols)))
    }

    private def dynamicRow(r: (List[Any], scala.collection.mutable.Map[Any, Any]), colNames: List[Any]) = {
      // Look up each pivoted column; ids missing a key get null.
      val cols = colNames.map { col => r._2.getOrElse(col.toString(), null) }
      val array = r._1 ++ cols
      Row(array: _*)
    }

    private def getSchema(srcSchema: StructType, idCols: List[String], distinctCols: (String, List[Any])): StructType = {
      // Id columns keep their original schema.
      val idSchema = idCols.map { idCol => srcSchema.apply(idCol) }
      // Pivoted columns reuse the key column's type and nullability; that works here
      // because key and value are both strings -- use the value column's schema if they differ.
      val colSchema = srcSchema.apply(distinctCols._1)
      val colsSchema = distinctCols._2.map { col => StructField(col.asInstanceOf[String], colSchema.dataType, colSchema.nullable) }
      StructType(idSchema ++ colsSchema)
    }
    
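    The heavy lifting happens in reduceByKey(_ ++ _): each row contributes a one-entry mutable map, and ++ merges those maps per composite id. A minimal plain-Scala illustration of that merge, outside Spark:

    import scala.collection.mutable

    // Two one-entry maps, as produced for two rows with id 1.
    val m1 = mutable.Map[Any, Any]("US" -> "50a")
    val m2 = mutable.Map[Any, Any]("UK" -> 100)
    val merged = m1 ++ m2   // Map(US -> 50a, UK -> 100)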

    Main snippet:

    import java.util.Date
    import org.apache.spark.SparkConf
    import org.apache.spark.SparkContext
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.types.StructType
    import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.sql.types.StructField
    
    
    ...
    ...
    def main(args: Array[String]): Unit = {

      // conf was undefined in the original snippet; "transpose-example" is a placeholder app name.
      val conf = new SparkConf().setAppName("transpose-example")
      val sc = new SparkContext(conf)
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      // spark-csv reads the sample file with a header row and inferred column types.
      val dfdata1 = sqlContext.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load("data.csv")
      dfdata1.show()

      // Pivot on "tag" with values from "value", grouping rows by "id".
      val dfOutput = transpose(new HiveContext(sc), dfdata1, List("id"), "tag", "value")
      dfOutput.show()
    }
    
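    Since compositeId is a list, grouping by several columns needs no code change; with a hypothetical extra date column in the input, the call would become:

    // "date" is a hypothetical second id column, not part of data.csv above
    val dfOutput = transpose(new HiveContext(sc), dfdata1, List("id", "date"), "tag", "value")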
