I am trying to read multiple JSON files into a single DataFrame. Both files have a "Value" node, but the type of this node alternates between an integer and a struct:
File 1:
See if this helps:
/**
* test/File1.json
* -----
* {
* "Value": 123
* }
*/
/**
* test/File2.json
* ---------
* {
* "Value": {
* "Value": "On",
* "ValueType": "State",
* "IsSystemValue": true
* }
* }
*/
// Resolve the "/test" resource (the directory with the sample JSON files) to a path.
val path = getClass.getResource("/test").getPath

// Each sample file holds a single JSON object spread over several lines, so
// the reader is put into multi-line mode before the directory is loaded.
val multiLineReader = spark.read.option("multiLine", true)
val df = multiLineReader.json(path)

// Print the rows without truncation, then the schema Spark inferred.
df.show(truncate = false)
df.printSchema()
/**
* +-------------------------------------------------------+
* |Value |
* +-------------------------------------------------------+
* |{"Value":"On","ValueType":"State","IsSystemValue":true}|
* |123 |
* +-------------------------------------------------------+
*
* root
* |-- Value: string (nullable = true)
*/
// All extractions below read the original "Value" column, which holds either
// a bare scalar (e.g. 123) or a JSON object rendered as a string.
val rawValue = col("Value")

// Last "/"-separated segment of the source file path, i.e. the file name.
val sourceFileName = substring_index(input_file_name(), "/", -1)

// Nested fields; get_json_object yields null when the path is not present.
val nestedValueType = get_json_object(rawValue, "$.ValueType")
val nestedIsSystemValue = get_json_object(rawValue, "$.IsSystemValue")

// Prefer the nested "$.Value"; fall back to the raw column for scalar rows.
val flattenedValue = coalesce(get_json_object(rawValue, "$.Value"), rawValue)

// "Value" is replaced last so the earlier columns still see the raw content.
df.withColumn("File", sourceFileName)
  .withColumn("ValueType", nestedValueType)
  .withColumn("IsSystemValue", nestedIsSystemValue)
  .withColumn("Value", flattenedValue)
  .show(false)
/**
* +-----+----------+---------+-------------+
* |Value|File |ValueType|IsSystemValue|
* +-----+----------+---------+-------------+
* |On |File2.json|State |true |
* |123 |File1.json|null |null |
* +-----+----------+---------+-------------+
*/