How can I create a Spark DataFrame from a nested array of struct element?

前端 未结 3 1072
北恋
北恋 2020-12-24 09:21

I have read a JSON file into Spark. This file has the following structure:

scala> tweetBlob.printSchema
root
 |-- related: struct (nullable = true)
 |             


        
相关标签:
3条回答
  • 2020-12-24 10:02
    scala> import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.DataFrame
    
    scala> import org.apache.spark.sql.types._
    import org.apache.spark.sql.types._
    
    scala> case class Bar(x: Int, y: String)
    defined class Bar
    
    scala> case class Foo(bar: Bar)
    defined class Foo
    
    scala> val df = sc.parallelize(Seq(Foo(Bar(1, "first")), Foo(Bar(2, "second")))).toDF
    df: org.apache.spark.sql.DataFrame = [bar: struct<x: int, y: string>]
    
    
    scala> df.printSchema
    root
     |-- bar: struct (nullable = true)
     |    |-- x: integer (nullable = false)
     |    |-- y: string (nullable = true)
    
    
    scala> df.select("bar.*").printSchema
    root
     |-- x: integer (nullable = true)
     |-- y: string (nullable = true)
    
    
    scala> 
    
    0 讨论(0)
  • 2020-12-24 10:04

    One possible way to handle this is to extract the required information from the schema. Let's start with some dummy data:

    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.types._
    
    
    case class Bar(x: Int, y: String)
    case class Foo(bar: Bar)
    
    val df = sc.parallelize(Seq(Foo(Bar(1, "first")), Foo(Bar(2, "second")))).toDF
    
    df.printSchema
    
    // root
    //  |-- bar: struct (nullable = true)
    //  |    |-- x: integer (nullable = false)
    //  |    |-- y: string (nullable = true)
    

    and a helper function:

    /** Expands a struct column into one `Column` per nested field,
      * e.g. children("bar", df) => Array(col("bar.x"), col("bar.y")),
      * suitable for `df.select(children("bar", df): _*)`.
      *
      * Returns an empty array when `colname` exists but is not a struct.
      *
      * @param colname name of the (struct) column to expand
      * @param df      DataFrame whose schema is inspected
      * @throws NoSuchElementException if `colname` is not a column of `df`
      */
    def children(colname: String, df: DataFrame) = {
      // find instead of filter(...).head: same NoSuchElementException on a
      // missing column, but with a message that names the offending column
      val parent = df.schema.fields
        .find(_.name == colname)
        .getOrElse(throw new NoSuchElementException(s"Column '$colname' not found in schema"))
      val fields = parent.dataType match {
        case st: StructType => st.fields
        case _              => Array.empty[StructField] // not a struct: nothing to expand
      }
      fields.map(f => col(s"$colname.${f.name}"))
    }
    

    Finally the results:

    df.select(children("bar", df): _*).printSchema
    
    // root
    // |-- x: integer (nullable = true)
    // |-- y: string (nullable = true)
    
    0 讨论(0)
  • 2020-12-24 10:26

    You can use

    df
      .select(explode(col("path_to_collection")).as("collection"))
      .select(col("collection.*"))
    

    Example:

    scala> val json = """{"name":"Michael", "schools":[{"sname":"stanford", "year":2010}, {"sname":"berkeley", "year":2012}]}"""
    
    scala> val inline = sqlContext.read.json(sc.parallelize(json :: Nil)).select(explode(col("schools")).as("collection")).select(col("collection.*"))
    
    scala> inline.printSchema
    root
     |-- sname: string (nullable = true)
     |-- year: long (nullable = true)
    
    scala> inline.show
    +--------+----+
    |   sname|year|
    +--------+----+
    |stanford|2010|
    |berkeley|2012|
    +--------+----+
    

    Or, you can also use SQL function inline:

    scala> val json = """{"name":"Michael", "schools":[{"sname":"stanford", "year":2010}, {"sname":"berkeley", "year":2012}]}"""
    
    scala> sqlContext.read.json(sc.parallelize(json :: Nil)).registerTempTable("tmp")
    
    scala> val inline = sqlContext.sql("SELECT inline(schools) FROM tmp")
    
    scala> inline.printSchema
    root
     |-- sname: string (nullable = true)
     |-- year: long (nullable = true)
    
    scala> inline.show
    +--------+----+
    |   sname|year|
    +--------+----+
    |stanford|2010|
    |berkeley|2012|
    +--------+----+
    
    0 讨论(0)
提交回复
热议问题