I'm looking for a generic solution to extract all JSON fields as columns from a JSON string column.
val df = spark.read.load(path)
df.show()
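For illustration, assume df contains a plain column plus a JSON string column named address (the column names and values here are hypothetical); an equivalent DataFrame could be built inline like this:

import spark.implicits._  // assumes a SparkSession named spark (e.g. in spark-shell)

val df = Seq(
  ("1", """{"city": "Seattle", "geo": {"lat": 47.6, "lon": -122.3}}"""),
  ("2", """{"city": "Austin", "geo": {"lat": 30.3, "lon": -97.7}}""")
).toDF("id", "address")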
Based on @Gaurang Shah's answer, I have implemented a solution that handles nested JSON structures and fixes the issue with monotonically_increasing_id (its values are not sequential, so they cannot pair rows across two DataFrames).
In this approach, the populateColumnName function recursively checks for StructType columns and builds the fully qualified column names. The renameColumns function renames the nested columns by replacing '.' with '_', so the flattened JSON fields get valid, unambiguous names. The addIndex function adds a sequential index to a DataFrame so that the parsed JSON columns can be joined back to the original rows.
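To see why monotonically_increasing_id cannot serve as the join key, here is a quick sketch (the repartition count and the resulting IDs are illustrative): the function packs the partition ID into the upper bits of the Long, so values are consecutive only within a partition, and two DataFrames indexed independently generally do not line up.

import org.apache.spark.sql.functions.monotonically_increasing_id

df.repartition(2).withColumn("rowid", monotonically_increasing_id()).show()
// rowid may come out as 0, 1, 8589934592, 8589934593, ... — gaps appear at
// partition boundaries, so it cannot pair rows across two DataFrames.
// zipWithIndex, used in addIndex below, yields a true 0, 1, 2, ... sequence.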
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import spark.implicits._  // assumes a SparkSession named spark

def flattenJSON(df: DataFrame, columnName: String): DataFrame = {
  val indexCol = "internal_temp_id"

  // Recursively descend into StructType fields and build dot-separated paths,
  // e.g. a struct "geo" with fields "lat"/"lon" yields "geo.lat" and "geo.lon"
  def populateColumnName(col: StructField): Array[String] = {
    col.dataType match {
      case struct: StructType => struct.fields.flatMap(populateColumnName).map(col.name + "." + _)
      case _ => Array(col.name)
    }
  }

  // Alias nested fields so that "geo.lat" becomes the flat column name "geo_lat"
  def renameColumns(name: String): String = {
    if (name contains ".")
      name + " as " + name.replaceAll("\\.", "_")
    else
      name
  }

  // Append a sequential Long index column (indexCol) using RDD zipWithIndex
  def addIndex(df: DataFrame): DataFrame = {
    val newSchema = StructType(df.schema.fields :+ StructField(indexCol, LongType, nullable = false))
    val rddWithId = df.rdd.zipWithIndex
    spark.createDataFrame(
      rddWithId.map { case (row, index) => Row.fromSeq(row.toSeq :+ index) },
      newSchema)
  }

  val dfWithID = addIndex(df)

  // Re-parse the JSON string column as its own DataFrame, letting Spark infer the schema
  val jsonDF = df.select(columnName)
  val ds = jsonDF.rdd.map(_.getString(0)).toDS
  val parseDF = spark.read.option("inferSchema", true).json(ds)

  // Flatten the inferred schema and alias the nested fields
  val columnNames = parseDF.schema.fields.flatMap(populateColumnName).map(renameColumns)
  val resultDF = parseDF.selectExpr(columnNames: _*)

  // Index both sides, join on the index, then drop the helper column
  val jsonDFWithID = addIndex(resultDF)
  dfWithID.join(jsonDFWithID, indexCol).drop(indexCol)
}
val res = flattenJSON(df, "address")
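Against the hypothetical input above, the result carries the original columns plus one flat column per JSON field (output sketched, not verbatim):

res.show()
// +---+--------------------+-------+-------+-------+
// | id|             address|   city|geo_lat|geo_lon|
// +---+--------------------+-------+-------+-------+
// |  1|{"city": "Seattle...|Seattle|   47.6| -122.3|
// |  2|{"city": "Austin"...| Austin|   30.3|  -97.7|
// +---+--------------------+-------+-------+-------+

One caveat on the design: joining on zipWithIndex indices assumes both DataFrames enumerate rows in the same order, which holds here because parseDF is derived from df without a shuffle; repartitioning between the two addIndex calls would break the pairing.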