Split JSON string column to multiple columns

Asked by 一生所求 · 2021-01-06 18:32

I'm looking for a generic solution that extracts every field of a JSON string column into its own column, without hard-coding the schema.

    df = spark.read.load(path)
    df.show()
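For concreteness, the input is assumed to look something like this (the column name address is taken from the accepted answer's usage below; the sample rows are purely illustrative):

    // Hypothetical input: one ordinary column plus one JSON string column.
    import spark.implicits._

    val df = Seq(
      (1, """{"city": "Pune", "state": "MH"}"""),
      (2, """{"city": "Oslo", "state": "NO"}""")
    ).toDF("id", "address")

    // Goal: a result with columns id, city, state (nested fields flattened
    // to names like a_b), without hard-coding the JSON schema.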
3 Answers
  •  隐瞒了意图╮
    2021-01-06 19:30

    Based on @Gaurang Shah's answer, I implemented a solution that handles nested JSON structures and fixes the problem with monotonically_increasing_id, whose IDs are unique but not sequential (demonstrated below).

    In this approach, the populateColumnName function recursively descends into StructType columns and builds the fully qualified column names.

    The renameColumns function rewrites those names, replacing '.' with '_', so that nested JSON fields get flat, unambiguous column names.

    The addIndex function appends a sequential index to a DataFrame, so that the original DataFrame and the parsed JSON DataFrame can be joined back together row by row.
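    To see why monotonically_increasing_id makes a poor join key, note that it encodes the partition index in the upper bits of each ID, so the values are unique and increasing but not consecutive across partitions. A minimal sketch (repartition(3) is only there to force multiple partitions):

    import org.apache.spark.sql.functions.monotonically_increasing_id

    spark.range(6).repartition(3)
        .withColumn("mid", monotonically_increasing_id())
        .show()
    // mid jumps between partitions, e.g. 0, 1, 8589934592, 8589934593, ...
    // Two DataFrames indexed this way need not line up row-by-row, hence
    // the zipWithIndex-based addIndex below.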

    import org.apache.spark.sql.{DataFrame, Row}
    import org.apache.spark.sql.types.{LongType, StructField, StructType}
    import spark.implicits._ // for .toDS; assumes a SparkSession named `spark` is in scope

    def flattenJSON(df: DataFrame, columnName: String): DataFrame = {

        val indexCol = "internal_temp_id"

        // Recursively walk StructType fields, building dot-separated paths:
        // a struct column `address` with field `city` yields "address.city".
        def populateColumnName(col: StructField): Array[String] = {
            col.dataType match {
                case struct: StructType => struct.fields.flatMap(populateColumnName).map(col.name + "." + _)
                case _                  => Array(col.name)
            }
        }

        // Turn "a.b" into the SQL expression "a.b as a_b" so that nested
        // fields become flat, unambiguous column names after selectExpr.
        def renameColumns(name: String): String = {
            if (name.contains(".")) name + " as " + name.replaceAll("\\.", "_")
            else name
        }

        // Append a sequential row index. zipWithIndex yields consecutive IDs
        // (unlike monotonically_increasing_id), so the two halves join cleanly.
        def addIndex(df: DataFrame): DataFrame = {
            val newSchema = StructType(df.schema.fields :+ StructField(indexCol, LongType, nullable = false))
            val rddWithId = df.rdd.zipWithIndex
            spark.createDataFrame(
                rddWithId.map { case (row, index) => Row.fromSeq(row.toSeq :+ index) },
                newSchema)
        }

        val dfWithID = addIndex(df)

        // Re-read the JSON string column as a Dataset[String]; spark.read.json
        // infers the schema automatically.
        val jsonDF = df.select(columnName)
        val ds = jsonDF.rdd.map(_.getString(0)).toDS
        val parseDF = spark.read.json(ds)

        // Flatten nested structs and rename "a.b" columns to "a_b".
        val columnNames = parseDF.schema.fields.flatMap(populateColumnName).map(renameColumns)
        val resultDF = parseDF.selectExpr(columnNames: _*)

        val jsonDFWithID = addIndex(resultDF)

        // Stitch the original rows back together with their parsed JSON fields.
        dfWithID.join(jsonDFWithID, indexCol).drop(indexCol)
    }
    
    val res = flattenJSON(df, "address")
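    With the hypothetical df from the question (columns id and address), res would keep the original columns and gain one column per top-level JSON field, with nested fields appearing under underscore-joined names. Illustrative output:

    res.show(truncate = false)
    // +---+--------------------------------+----+-----+
    // |id |address                         |city|state|
    // +---+--------------------------------+----+-----+
    // |1  |{"city": "Pune", "state": "MH"} |Pune|MH   |
    // |2  |{"city": "Oslo", "state": "NO"} |Oslo|NO   |
    // +---+--------------------------------+----+-----+

    Note that zipWithIndex forces a round trip through the RDD API and triggers an extra Spark job per call; that is the price paid for IDs that are guaranteed consecutive.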
    
