Question
I am new to Spark and working on a simple application to convert XML streams received from Kafka into JSON format.
Using:
- Spark 2.4.5
- Scala 2.11.12
In my use case, the Kafka stream is in XML format. The following is the code that I tried:
val spark: SparkSession = SparkSession.builder()
.master("local")
.appName("Spark Demo")
.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
val inputStream = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "demo_topic_xml")
.option("startingOffsets", "earliest") // From starting
.load()
inputStream.printSchema()
val records = inputStream.selectExpr("CAST(value AS STRING)")
// How to drop the "value" column wrapper here while converting XML into JSON?
val jsons = records.toJSON
jsons.writeStream
.format("console")
.option("truncate", false)
.outputMode("append")
.start()
.awaitTermination()
However, the above code gives the "value" column header as the field name in the JSON output, as shown below:
{"value":"<?xml version=\"1.0\" encoding=\"utf-16\"?><employees><employee id=\"be129\"><firstname>Jane</firstname><lastname>Doe</lastname><title>Engineer</title><division>Materials</division><building>327</building><room>19</room><supervisor>be131</supervisor></employee><employees>"}
What I really need is only the XML payload converted to JSON, without the "value" column wrapper. Looks like I am missing something obvious here. Can someone please help? Thanks for your time.
Answer 1:
Use the org.json.XML library to convert the XML data to JSON.
Check the code below.
Creating UDF
scala> import org.json.XML
import org.json.XML
scala> import org.apache.spark.sql.functions.{udf, from_json}
import org.apache.spark.sql.functions.{udf, from_json}
scala> import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.types.{DataType, StructType}
scala> val parse = udf((value: String) => XML.toJSONObject(value).toString) // Defined UDF to parse xml to json
parse: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))
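For reference, a minimal standalone sketch (not from the original answer) showing what the parse UDF's logic produces for a trimmed sample record; the exact field ordering in the output string may vary:
import org.json.XML

// Hypothetical sample record, trimmed from the question's payload.
val sampleXml = """<employees><employee id="be129"><firstname>Jane</firstname><lastname>Doe</lastname></employee></employees>"""

// Elements become nested JSON objects; attributes such as id become plain fields.
println(XML.toJSONObject(sampleXml).toString)
// Expected shape (key order may differ):
// {"employees":{"employee":{"id":"be129","firstname":"Jane","lastname":"Doe"}}}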
Defining the schema based on the XML data.
scala> val schema_json = """{"type":"struct","fields":[{"name":"employees","type":{"type":"struct","fields":[{"name":"employee","type":{"type":"struct","fields":[{"name":"building","type":"long","nullable":true,"metadata":{}},{"name":"division","type":"string","nullable":true,"metadata":{}},{"name":"firstname","type":"string","nullable":true,"metadata":{}},{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"lastname","type":"string","nullable":true,"metadata":{}},{"name":"room","type":"long","nullable":true,"metadata":{}},{"name":"supervisor","type":"string","nullable":true,"metadata":{}},{"name":"title","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}""" // Define Schema of your xml data in json.
schema_json: String = {"type":"struct","fields":[{"name":"employees","type":{"type":"struct","fields":[{"name":"employee","type":{"type":"struct","fields":[{"name":"building","type":"long","nullable":true,"metadata":{}},{"name":"division","type":"string","nullable":true,"metadata":{}},{"name":"firstname","type":"string","nullable":true,"metadata":{}},{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"lastname","type":"string","nullable":true,"metadata":{}},{"name":"room","type":"long","nullable":true,"metadata":{}},{"name":"supervisor","type":"string","nullable":true,"metadata":{}},{"name":"title","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}
scala> val schema = DataType.fromJson(schema_json).asInstanceOf[StructType] // Convert Json schema data to schema.
schema: org.apache.spark.sql.types.StructType = StructType(StructField(employees,StructType(StructField(employee,StructType(StructField(building,LongType,true), StructField(division,StringType,true), StructField(firstname,StringType,true), StructField(id,StringType,true), StructField(lastname,StringType,true), StructField(room,LongType,true), StructField(supervisor,StringType,true), StructField(title,StringType,true)),true)),true))
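If you would rather not maintain the schema as a JSON string, the same StructType can also be built programmatically. A minimal equivalent sketch (my own addition, mirroring the schema above):
import org.apache.spark.sql.types._

// Leaf fields of a single employee record.
val employeeSchema = StructType(Seq(
  StructField("building", LongType, nullable = true),
  StructField("division", StringType, nullable = true),
  StructField("firstname", StringType, nullable = true),
  StructField("id", StringType, nullable = true),
  StructField("lastname", StringType, nullable = true),
  StructField("room", LongType, nullable = true),
  StructField("supervisor", StringType, nullable = true),
  StructField("title", StringType, nullable = true)
))

// Nest employee under employees, matching the JSON-defined schema above.
val schema = StructType(Seq(
  StructField("employees", StructType(Seq(
    StructField("employee", employeeSchema, nullable = true)
  )), nullable = true)
))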
Final Schema
scala>
inputStream
.selectExpr("CAST(value AS STRING)")
.select(from_json(parse($"data"),schema).as("emp_data"))
.select($"emp_data.employees.employee.*")
.printSchema
root
|-- building: long (nullable = true)
|-- division: string (nullable = true)
|-- firstname: string (nullable = true)
|-- id: string (nullable = true)
|-- lastname: string (nullable = true)
|-- room: long (nullable = true)
|-- supervisor: string (nullable = true)
|-- title: string (nullable = true)
Writing the converted JSON data to the console.
scala>
inputStream
.selectExpr("CAST(value AS STRING)")
.select(from_json(parse($"data"),schema).as("emp_data"))
.select($"emp_data.employees.employee.*")
.writeStream
.format("console")
.option("truncate", false)
.outputMode("append")
.start()
.awaitTermination()
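If you want each record written out as a single JSON string (closer to what the question asked for) rather than as separate columns, one option is to wrap the expanded columns with to_json(struct(...)). A minimal sketch, reusing the parse UDF and schema defined above:
import org.apache.spark.sql.functions.{from_json, struct, to_json}

inputStream
  .selectExpr("CAST(value AS STRING)")
  .select(from_json(parse($"value"), schema).as("emp_data"))
  .select($"emp_data.employees.employee.*")
  .select(to_json(struct($"*")).as("value")) // pack all columns back into one JSON string per record
  .writeStream
  .format("console")
  .option("truncate", false)
  .outputMode("append")
  .start()
  .awaitTermination()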
Source: https://stackoverflow.com/questions/62379533/convert-streaming-xml-into-json-in-spark