Question
I have the following Avro schema:
{
  "type":"record",
  "name":"test",
  "namespace":"test.name",
  "fields":[
    {"name":"items","type":
      {"type":"array",
       "items":
         {"type":"record","name":"items",
          "fields":[
            {"name":"name","type":"string"},
            {"name":"state","type":"string"}
          ]
         }
      }
    },
    {"name":"firstname","type":"string"}
  ]
}
I am using a JSON decoder and an Avro encoder to encode the JSON data:
// writer and reader are both bound to the same schema
val writer = new GenericDatumWriter[GenericRecord](schema)
val reader = new GenericDatumReader[GenericRecord](schema)
val baos = new ByteArrayOutputStream
// decode the json input against the schema, then re-encode the datum as avro binary
val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(schema, json)
val encoder = EncoderFactory.get.binaryEncoder(baos, null)
val datum = reader.read(null, decoder)
writer.write(datum, encoder)
encoder.flush()
val avroByteArray = baos.toByteArray
Scenario 1: when I pass the following JSON, it encodes fine:
{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"arun"
}
Scenario 2: when I pass an additional attribute at the root level of the JSON (lastname), it is still able to encode and works fine:
{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}
Scenario 3: when I add an additional attribute inside the array record (country), as in the JSON below, it throws the following exception:
org.apache.avro.AvroTypeException: Expected record-end. Got FIELD_NAME
  at org.apache.avro.io.JsonDecoder.error(JsonDecoder.java:698)
{
  "items": [
    {
      "name": "dallas",
      "state": "TX",
      "country":"USA"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}
I need to make scenario 3 work; any help would be great.
Answer 1:
Converting the JSON data into the shape of the Avro schema using the Spark DataFrame approach would help you:
- create a struct type from the Avro schema using SchemaConverters
- create a data frame from the struct type and the JSON RDD of strings
- convert the data frame rows back into JSON using df.toJSON
Sample test cases:
import java.io.ByteArrayOutputStream
import com.databricks.spark.avro.SchemaConverters
import org.apache.avro.Schema
import org.apache.avro.Schema.Parser
import org.apache.avro.generic._
import org.apache.avro.io._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession.Builder
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.scalatest.{Matchers, WordSpecLike}
class Test extends WordSpecLike with Matchers {

  val schemaString: String =
    """{
      |  "type":"record",
      |  "name":"test",
      |  "namespace":"test.name",
      |  "fields":[
      |    {"name":"items","type":
      |      {"type":"array",
      |       "items":
      |         {"type":"record","name":"items",
      |          "fields":[
      |            {"name":"name","type":"string"},
      |            {"name":"state","type":"string"}
      |          ]
      |         }
      |      }
      |    },
      |    {"name":"firstname","type":"string"}
      |  ]
      |}""".stripMargin

  // create spark session and sql context
  val builder: Builder = SparkSession.builder.appName("testAvroSpark")
  val sparkSession: SparkSession = builder.master("local[1]").getOrCreate()
  val sc: SparkContext = sparkSession.sparkContext
  val sqlContext: SQLContext = sparkSession.sqlContext

  // avro schema from json type schema string
  val schema: Schema = new Parser().parse(schemaString)

  // get spark struct type from avro schema
  val requiredType: StructType =
    SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
"scenario one json data with given schema" in {
val scenarioOneJson: String =
"""{
| "items": [
| {
| "name": "dallas",
| "state": "TX"
| }
| ],
| "firstname":"rumesh"
|}""".stripMargin
val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioOneJson))
val outputJsonExpected: String =
"""{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
assert(resultJson === outputJsonExpected)
assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
}
"scenario two json data with given schema" in {
val scenarioTwoJson: String =
"""{
| "items": [
| {
| "name": "dallas",
| "state": "TX"
| }
| ],
| "firstname":"rumesh",
| "lastname":"krish"
|}""".stripMargin
val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioTwoJson))
val outputJsonExpected: String =
"""{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
assert(resultJson === outputJsonExpected)
assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
}
"scenario three json data with given schema" in {
val scenarioThreeJson: String =
"""{
| "items": [
| {
| "name": "dallas",
| "state": "TX",
| "country":"USA"
| }
| ],
| "firstname":"rumesh",
| "lastname":"krish"
|}""".stripMargin
val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioThreeJson))
val outputJsonExpected: String =
"""{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
assert(resultJson === outputJsonExpected)
assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
}
  /**
   * convert the json using the data frame json parser with the given schema struct type
   *
   * @param customType   given data frame struct type
   * @param jsonInputRdd json rdd of strings
   * @return list of json strings built from the data frame rows
   */
  private def customJsonConverter(customType: StructType,
                                  jsonInputRdd: RDD[String]): List[String] = {
    // create data frame from the rdd of strings with the struct type schema
    val df: DataFrame = sqlContext.read.schema(customType).json(jsonInputRdd)
    // collect the data frame rows back as json strings
    df.toJSON.rdd.toLocalIterator.toList
  }
  /**
   * avro binary serialization
   *
   * @param avroSchema avro schema
   * @param jsonData   json data
   * @return avro binary encoded byte array
   */
  private def binaryEncoder(avroSchema: Schema, jsonData: String): Array[Byte] = {
    val writer = new GenericDatumWriter[GenericRecord](avroSchema)
    val reader = new GenericDatumReader[GenericRecord](avroSchema)
    val baos = new ByteArrayOutputStream
    val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(avroSchema, jsonData)
    val encoder = EncoderFactory.get.binaryEncoder(baos, null)
    val datum = reader.read(null, decoder)
    writer.write(datum, encoder)
    encoder.flush()
    baos.toByteArray
  }
}
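Note that all three scenarios assert the same expected output JSON: the DataFrame parser simply drops any attribute that is not declared in the struct type (lastname, country), which is what makes the scenario 3 payload encodable against the original schema.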
Answer 2:
Your schema does not represent the structure in scenario 3: the 'country' field is missing:

{"name":"country", "type":"string"}

You only declare the fields 'name' and 'state'. The decoder therefore correctly expects the (sub)record to end after those and, as the error message states, gets a(nother) field name instead ('country').
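For illustration, this is the nested items record from the question's schema with that field added; with this schema the scenario 3 payload decodes cleanly:

{"type":"record","name":"items",
 "fields":[
   {"name":"name","type":"string"},
   {"name":"state","type":"string"},
   {"name":"country","type":"string"}
 ]
}

If country should be optional, declare it as a union with a default instead: {"name":"country","type":["null","string"],"default":null}.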
Btw: you could use a generator to always derive a matching Avro schema from your JSON; there are a couple available on the net.
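As a rough sketch of that idea, reusing the spark-avro library from answer 1: let Spark infer a struct type from the JSON sample itself, then convert it back into an Avro schema. The convertStructToAvro helper and its parameters are assumptions based on the databricks spark-avro API, so verify them against your version; also note that Spark infers every field as nullable, so the generated schema will use unions with null.

import com.databricks.spark.avro.SchemaConverters
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.sql.types.StructType

// let spark infer a struct type from the json sample itself
// (sc, sqlContext and scenarioThreeJson as defined in answer 1)
val inferredType: StructType =
  sqlContext.read.json(sc.parallelize(Seq(scenarioThreeJson))).schema

// convert the inferred struct type back into an avro record schema
// (assumed API: SchemaConverters.convertStructToAvro from databricks spark-avro)
val inferredAvroSchema: Schema = SchemaConverters.convertStructToAvro(
  inferredType, SchemaBuilder.record("test"), "test.name")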
Source: https://stackoverflow.com/questions/48432732/avro-json-additional-field