问题
Please check the code below. It produces a DataFrame with ambiguous column names when the JSON columns contain duplicate keys. How should the code be modified to add the parent column name as a prefix to each extracted column?
I added another column containing JSON data.
scala> val df = Seq(
(77, "email1", """{"key1":38,"key3":39}""","""{"name":"aaa","age":10}"""),
(78, "email2", """{"key1":38,"key4":39}""","""{"name":"bbb","age":20}"""),
(178, "email21", """{"key1":"when string","key4":36, "key6":"test", "key10":false }""","""{"name":"ccc","age":30}"""),
(179, "email8", """{"sub1":"qwerty","sub2":["42"]}""","""{"name":"ddd","age":40}"""),
(180, "email8", """{"sub1":"qwerty","sub2":["42", "56", "test"]}""","""{"name":"eee","age":50}""")
).toDF("id", "name", "colJson","personInfo")
scala> df.printSchema
root
|-- id: integer (nullable = false)
|-- name: string (nullable = true)
|-- colJson: string (nullable = true)
|-- personInfo: string (nullable = true)
scala> df.show(false)
+---+-------+---------------------------------------------------------------+-----------------------+
|id |name |colJson |personInfo |
+---+-------+---------------------------------------------------------------+-----------------------+
|77 |email1 |{"key1":38,"key3":39} |{"name":"aaa","age":10}|
|78 |email2 |{"key1":38,"key4":39} |{"name":"bbb","age":20}|
|178|email21|{"key1":"when string","key4":36, "key6":"test", "key10":false }|{"name":"ccc","age":30}|
|179|email8 |{"sub1":"qwerty","sub2":["42"]} |{"name":"ddd","age":40}|
|180|email8 |{"sub1":"qwerty","sub2":["42", "56", "test"]} |{"name":"eee","age":50}|
+---+-------+---------------------------------------------------------------+-----------------------+
I created an implicit `fromJson` function. You can pass multiple columns to it, and it will parse the JSON and extract its fields as columns.
scala> :paste
// Entering paste mode (ctrl-D to finish)
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.functions.from_json
implicit class DFHelper(inDF: DataFrame) {
  import inDF.sparkSession.implicits._

  /**
   * Parses each given JSON string column with a schema inferred from that column's own
   * values, then expands every struct-typed column of the result into its child fields.
   *
   * Child fields keep their bare names, so equal field names coming from different
   * parents (or matching an existing top-level column) appear more than once in the result.
   *
   * @param columns the JSON string columns to parse
   * @return the DataFrame with struct columns expanded into top-level columns
   */
  def fromJson(columns: Column*): DataFrame = {
    val spark = inDF.sparkSession

    // Infer one schema per JSON column by reading that column's strings as a JSON dataset.
    val inferred = columns.map { jsonCol =>
      jsonCol -> spark.read.json(inDF.select(jsonCol).as[String]).schema
    }

    // Store each parsed struct under the column's string form
    // (this replaces the original column when the names match).
    val parsed = inferred.foldLeft(inDF) { case (acc, (jsonCol, jsonSchema)) =>
      acc.withColumn(jsonCol.toString(), from_json(jsonCol, jsonSchema))
    }

    // Struct columns are expanded with `<name>.*`; every other column passes through as-is.
    val projections = parsed.schema.map { field =>
      if (field.dataType.typeName == "struct") s"${field.name}.*" else field.name
    }
    parsed.selectExpr(projections: _*)
  }
}
// Exiting paste mode, now interpreting.
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.functions.from_json
defined class DFHelper
scala> df.fromJson($"colJson",$"personInfo").show(false)
+---+-------+-----------+-----+----+----+----+------+--------------+---+----+
|id |name |key1 |key10|key3|key4|key6|sub1 |sub2 |age|name|
+---+-------+-----------+-----+----+----+----+------+--------------+---+----+
|77 |email1 |38 |null |39 |null|null|null |null |10 |aaa |
|78 |email2 |38 |null |null|39 |null|null |null |20 |bbb |
|178|email21|when string|false|null|36 |test|null |null |30 |ccc |
|179|email8 |null |null |null|null|null|qwerty|[42] |40 |ddd |
|180|email8 |null |null |null|null|null|qwerty|[42, 56, test]|50 |eee |
+---+-------+-----------+-----+----+----+----+------+--------------+---+----+
scala> df.fromJson($"colJson",$"personInfo").printSchema()
root
|-- id: integer (nullable = false)
|-- name: string (nullable = true)
|-- key1: string (nullable = true)
|-- key10: boolean (nullable = true)
|-- key3: long (nullable = true)
|-- key4: long (nullable = true)
|-- key6: string (nullable = true)
|-- sub1: string (nullable = true)
|-- sub2: array (nullable = true)
| |-- element: string (containsNull = true)
|-- age: long (nullable = true)
|-- name: string (nullable = true)
回答1:
Try this-
// Display the input rows and schema; the resulting output is reproduced in the comment that follows.
df.show(false)
df.printSchema()
/**
* +---+-------+---------------------------------------------------------------+-----------------------+
* |id |name |colJson |personInfo |
* +---+-------+---------------------------------------------------------------+-----------------------+
* |77 |email1 |{"key1":38,"key3":39} |{"name":"aaa","age":10}|
* |78 |email2 |{"key1":38,"key4":39} |{"name":"bbb","age":20}|
* |178|email21|{"key1":"when string","key4":36, "key6":"test", "key10":false }|{"name":"ccc","age":30}|
* |179|email8 |{"sub1":"qwerty","sub2":["42"]} |{"name":"ddd","age":40}|
* |180|email8 |{"sub1":"qwerty","sub2":["42", "56", "test"]} |{"name":"eee","age":50}|
* +---+-------+---------------------------------------------------------------+-----------------------+
*
* root
* |-- id: integer (nullable = false)
* |-- name: string (nullable = true)
* |-- colJson: string (nullable = true)
* |-- personInfo: string (nullable = true)
*
* @param inDF
*/
implicit class DFHelper(inDF: DataFrame) {
  import inDF.sparkSession.implicits._

  /**
   * Parses each given JSON string column into a struct column, using a schema inferred
   * from that column's own values.
   *
   * The parsed struct is stored under the column's string form, so a plain column
   * reference such as `$"colJson"` is replaced in place and keeps its parent name.
   * Nested fields are then addressed as `<parent_col>.<child_col>`.
   *
   * @param columns the JSON string columns to parse
   * @return the DataFrame with the given columns converted to structs
   */
  def fromJson(columns: Column*): DataFrame = {
    // Infer one schema per JSON column by reading that column's strings as a JSON dataset.
    val schemas = columns.map(column => (column, inDF.sparkSession.read.json(inDF.select(column).as[String]).schema))
    schemas.foldLeft(inDF) { case (df, (column, schema)) =>
      df.withColumn(column.toString(), from_json(column, schema))
    }
  }

  /**
   * Like [[fromJson]], but additionally flattens the parsed JSON columns into top-level
   * columns named `<parent_col>_<child_col>` (for example `colJson_key1`, `personInfo_name`).
   *
   * Prefixing with the parent column name keeps child fields that share a name with
   * another column (such as `name`) unambiguous. Only the columns passed in are
   * flattened; any other column, struct or not, passes through unchanged.
   *
   * @param columns the JSON string columns to parse and flatten
   * @return the DataFrame with one prefixed top-level column per parsed JSON field
   */
  def fromJsonPrefixed(columns: Column*): DataFrame = {
    import org.apache.spark.sql.types.StructType

    val parsed = fromJson(columns: _*)
    // Names under which fromJson stored the parsed structs.
    val parsedNames = columns.map(_.toString()).toSet

    val projections = parsed.schema.fields.toSeq.flatMap { field =>
      // Backticks make the lookup treat the name literally, even if it contains a dot.
      val parent = parsed.col(s"`${field.name}`")
      field.dataType match {
        case struct: StructType if parsedNames.contains(field.name) =>
          struct.fieldNames.toSeq.map { child =>
            parent.getField(child).as(s"${field.name}_$child")
          }
        case _ => Seq(parent)
      }
    }
    parsed.select(projections: _*)
  }
}
// Parse both JSON string columns; each one becomes a struct column that keeps its parent name.
val p = df.fromJson($"colJson", $"personInfo")
// Print the parsed rows and the nested schema.
p.show(false)
p.printSchema()
/**
* +---+-------+---------------------------------+----------+
* |id |name |colJson |personInfo|
* +---+-------+---------------------------------+----------+
* |77 |email1 |[38,, 39,,,,] |[10, aaa] |
* |78 |email2 |[38,,, 39,,,] |[20, bbb] |
* |178|email21|[when string, false,, 36, test,,]|[30, ccc] |
* |179|email8 |[,,,,, qwerty, [42]] |[40, ddd] |
* |180|email8 |[,,,,, qwerty, [42, 56, test]] |[50, eee] |
* +---+-------+---------------------------------+----------+
*
* root
* |-- id: integer (nullable = false)
* |-- name: string (nullable = true)
* |-- colJson: struct (nullable = true)
* | |-- key1: string (nullable = true)
* | |-- key10: boolean (nullable = true)
* | |-- key3: long (nullable = true)
* | |-- key4: long (nullable = true)
* | |-- key6: string (nullable = true)
* | |-- sub1: string (nullable = true)
* | |-- sub2: array (nullable = true)
* | | |-- element: string (containsNull = true)
* |-- personInfo: struct (nullable = true)
* | |-- age: long (nullable = true)
* | |-- name: string (nullable = true)
*/
// Address a nested field as <parent_col>.<child_col>; qualifying with the parent struct
// keeps e.g. personInfo.name distinct from the top-level name column.
p.select($"colJson.key1", $"personInfo.age").show(false)
/**
* +-----------+---+
* |key1 |age|
* +-----------+---+
* |38 |10 |
* |38 |20 |
* |when string|30 |
* |null |40 |
* |null |50 |
* +-----------+---+
*/
来源:https://stackoverflow.com/questions/63513495/add-parent-column-name-as-prefix-to-avoid-ambiguity