I'm looking for a generic solution to extract all JSON fields as columns from a JSON string column.
val df = spark.read.load(path)
df.show()
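For illustration, assume df contains a plain column plus a JSON string column named address (the column names and values here are hypothetical); an equivalent DataFrame could be built inline like this:

import spark.implicits._  // assumes a SparkSession named spark (e.g. in spark-shell)

val df = Seq(
  ("1", """{"city": "Seattle", "geo": {"lat": 47.6, "lon": -122.3}}"""),
  ("2", """{"city": "Austin", "geo": {"lat": 30.3, "lon": -97.7}}""")
).toDF("id", "address")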
Based on @Gaurang Shah's answer, I have implemented a solution that handles nested JSON structures and fixes the issue with monotonically_increasing_id (its values are not sequential, so they cannot pair rows across two DataFrames).
In this approach, the populateColumnName function recursively checks for StructType columns and builds the fully qualified column names. The renameColumns function renames the nested columns by replacing '.' with '_', so the flattened JSON fields get valid, unambiguous names. The addIndex function adds a sequential index to a DataFrame so that the parsed JSON columns can be joined back to the original rows.
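To see why monotonically_increasing_id cannot serve as the join key, here is a quick sketch (the repartition count and the resulting IDs are illustrative): the function packs the partition ID into the upper bits of the Long, so values are consecutive only within a partition, and two DataFrames indexed independently generally do not line up.

import org.apache.spark.sql.functions.monotonically_increasing_id

df.repartition(2).withColumn("rowid", monotonically_increasing_id()).show()
// rowid may come out as 0, 1, 8589934592, 8589934593, ... — gaps appear at
// partition boundaries, so it cannot pair rows across two DataFrames.
// zipWithIndex, used in addIndex below, yields a true 0, 1, 2, ... sequence.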
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import spark.implicits._  // assumes a SparkSession named spark

def flattenJSON(df: DataFrame, columnName: String): DataFrame = {
  val indexCol = "internal_temp_id"

  // Recursively descend into StructType fields and build dot-separated paths,
  // e.g. a struct "geo" with fields "lat"/"lon" yields "geo.lat" and "geo.lon"
  def populateColumnName(col: StructField): Array[String] = {
    col.dataType match {
      case struct: StructType => struct.fields.flatMap(populateColumnName).map(col.name + "." + _)
      case _ => Array(col.name)
    }
  }

  // Alias nested fields so that "geo.lat" becomes the flat column name "geo_lat"
  def renameColumns(name: String): String = {
    if (name contains ".")
      name + " as " + name.replaceAll("\\.", "_")
    else
      name
  }

  // Append a sequential Long index column (indexCol) using RDD zipWithIndex
  def addIndex(df: DataFrame): DataFrame = {
    val newSchema = StructType(df.schema.fields :+ StructField(indexCol, LongType, nullable = false))
    val rddWithId = df.rdd.zipWithIndex
    spark.createDataFrame(
      rddWithId.map { case (row, index) => Row.fromSeq(row.toSeq :+ index) },
      newSchema)
  }

  val dfWithID = addIndex(df)

  // Re-parse the JSON string column as its own DataFrame, letting Spark infer the schema
  val jsonDF = df.select(columnName)
  val ds = jsonDF.rdd.map(_.getString(0)).toDS
  val parseDF = spark.read.option("inferSchema", true).json(ds)

  // Flatten the inferred schema and alias the nested fields
  val columnNames = parseDF.schema.fields.flatMap(populateColumnName).map(renameColumns)
  val resultDF = parseDF.selectExpr(columnNames: _*)

  // Index both sides, join on the index, then drop the helper column
  val jsonDFWithID = addIndex(resultDF)
  dfWithID.join(jsonDFWithID, indexCol).drop(indexCol)
}
val res = flattenJSON(df, "address")
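Against the hypothetical input above, the result carries the original columns plus one flat column per JSON field (output sketched, not verbatim):

res.show()
// +---+--------------------+-------+-------+-------+
// | id|             address|   city|geo_lat|geo_lon|
// +---+--------------------+-------+-------+-------+
// |  1|{"city": "Seattle...|Seattle|   47.6| -122.3|
// |  2|{"city": "Austin"...| Austin|   30.3|  -97.7|
// +---+--------------------+-------+-------+-------+

One caveat on the design: joining on zipWithIndex indices assumes both DataFrames enumerate rows in the same order, which holds here because parseDF is derived from df without a shuffle; repartitioning between the two addIndex calls would break the pairing.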