Question
I have the following Avro schema:
{
  "type":"record",
  "name":"test",
  "namespace":"test.name",
  "fields":[
    {"name":"items","type":
      {"type":"array",
       "items":
         {"type":"record","name":"items",
          "fields":[
            {"name":"name","type":"string"},
            {"name":"state","type":"string"}
          ]
         }
      }
    },
    {"name":"firstname","type":"string"}
  ]
}
I am using a JSON decoder and an Avro encoder to encode the JSON data:
// writer and reader are both bound to the same schema
val writer = new GenericDatumWriter[GenericRecord](schema)
val reader = new GenericDatumReader[GenericRecord](schema)
val baos = new ByteArrayOutputStream
// decode the json input against the schema, then re-encode the datum as avro binary
val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(schema, json)
val encoder = EncoderFactory.get.binaryEncoder(baos, null)
val datum = reader.read(null, decoder)
writer.write(datum, encoder)
encoder.flush()
val avroByteArray = baos.toByteArray
Scenario 1: when I pass the following JSON, it encodes fine:
{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"arun"
}
Scenario 2: when I pass an additional attribute at the root level of the JSON (lastname), it is still able to encode and works fine:
{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}
Scenario 3: when I add an additional attribute inside the array record (country), as in the JSON below, it throws the following exception:
org.apache.avro.AvroTypeException: Expected record-end. Got FIELD_NAME
  at org.apache.avro.io.JsonDecoder.error(JsonDecoder.java:698)
{
  "items": [
    {
      "name": "dallas",
      "state": "TX",
      "country":"USA"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}
I need to make scenario 3 work; any help would be great.
Answer 1:
Converting the JSON data into the shape of the Avro schema using the Spark DataFrame approach would help you:
- create a struct type from the Avro schema using SchemaConverters
- create a data frame from the struct type and the JSON RDD of strings
- convert the data frame rows back into JSON using df.toJSON
Sample test cases:
import java.io.ByteArrayOutputStream
import com.databricks.spark.avro.SchemaConverters
import org.apache.avro.Schema
import org.apache.avro.Schema.Parser
import org.apache.avro.generic._
import org.apache.avro.io._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession.Builder
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.scalatest.{Matchers, WordSpecLike}
class Test extends WordSpecLike with Matchers {

  val schemaString: String =
    """{
      |  "type":"record",
      |  "name":"test",
      |  "namespace":"test.name",
      |  "fields":[
      |    {"name":"items","type":
      |      {"type":"array",
      |       "items":
      |         {"type":"record","name":"items",
      |          "fields":[
      |            {"name":"name","type":"string"},
      |            {"name":"state","type":"string"}
      |          ]
      |         }
      |      }
      |    },
      |    {"name":"firstname","type":"string"}
      |  ]
      |}""".stripMargin

  // create spark session and sql context
  val builder: Builder = SparkSession.builder.appName("testAvroSpark")
  val sparkSession: SparkSession = builder.master("local[1]").getOrCreate()
  val sc: SparkContext = sparkSession.sparkContext
  val sqlContext: SQLContext = sparkSession.sqlContext

  // avro schema from json type schema string
  val schema: Schema = new Parser().parse(schemaString)

  // get spark struct type from avro schema
  val requiredType: StructType =
    SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
"scenario one json data with given schema" in {
val scenarioOneJson: String =
"""{
| "items": [
| {
| "name": "dallas",
| "state": "TX"
| }
| ],
| "firstname":"rumesh"
|}""".stripMargin
val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioOneJson))
val outputJsonExpected: String =
"""{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
assert(resultJson === outputJsonExpected)
assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
}
"scenario two json data with given schema" in {
val scenarioTwoJson: String =
"""{
| "items": [
| {
| "name": "dallas",
| "state": "TX"
| }
| ],
| "firstname":"rumesh",
| "lastname":"krish"
|}""".stripMargin
val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioTwoJson))
val outputJsonExpected: String =
"""{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
assert(resultJson === outputJsonExpected)
assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
}
"scenario three json data with given schema" in {
val scenarioThreeJson: String =
"""{
| "items": [
| {
| "name": "dallas",
| "state": "TX",
| "country":"USA"
| }
| ],
| "firstname":"rumesh",
| "lastname":"krish"
|}""".stripMargin
val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioThreeJson))
val outputJsonExpected: String =
"""{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
assert(resultJson === outputJsonExpected)
assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
}
  /**
   * convert the json using the data frame json parser with the given schema struct type
   *
   * @param customType   given data frame struct type
   * @param jsonInputRdd json rdd of strings
   * @return list of json strings built from the data frame rows
   */
  private def customJsonConverter(customType: StructType,
                                  jsonInputRdd: RDD[String]): List[String] = {
    // create data frame from the rdd of strings with the struct type schema
    val df: DataFrame = sqlContext.read.schema(customType).json(jsonInputRdd)
    // collect the data frame rows back as json strings
    df.toJSON.rdd.toLocalIterator.toList
  }
  /**
   * avro binary serialization
   *
   * @param avroSchema avro schema
   * @param jsonData   json data
   * @return avro binary encoded byte array
   */
  private def binaryEncoder(avroSchema: Schema, jsonData: String): Array[Byte] = {
    val writer = new GenericDatumWriter[GenericRecord](avroSchema)
    val reader = new GenericDatumReader[GenericRecord](avroSchema)
    val baos = new ByteArrayOutputStream
    val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(avroSchema, jsonData)
    val encoder = EncoderFactory.get.binaryEncoder(baos, null)
    val datum = reader.read(null, decoder)
    writer.write(datum, encoder)
    encoder.flush()
    baos.toByteArray
  }
}
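Note that all three scenarios assert the same expected output JSON: the DataFrame parser simply drops any attribute that is not declared in the struct type (lastname, country), which is what makes the scenario 3 payload encodable against the original schema.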
Answer 2:
Your schema does not represent the structure in scenario 3: the 'country' field is missing:

{"name":"country", "type":"string"}

You only declare the fields 'name' and 'state'. The decoder therefore correctly expects the (sub)record to end after those and, as the error message states, gets a(nother) field name instead ('country').
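For illustration, this is the nested items record from the question's schema with that field added; with this schema the scenario 3 payload decodes cleanly:

{"type":"record","name":"items",
 "fields":[
   {"name":"name","type":"string"},
   {"name":"state","type":"string"},
   {"name":"country","type":"string"}
 ]
}

If country should be optional, declare it as a union with a default instead: {"name":"country","type":["null","string"],"default":null}.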
Btw: you could use a generator to always derive a matching Avro schema from your JSON; there are a couple available on the net.
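As a rough sketch of that idea, reusing the spark-avro library from answer 1: let Spark infer a struct type from the JSON sample itself, then convert it back into an Avro schema. The convertStructToAvro helper and its parameters are assumptions based on the databricks spark-avro API, so verify them against your version; also note that Spark infers every field as nullable, so the generated schema will use unions with null.

import com.databricks.spark.avro.SchemaConverters
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.sql.types.StructType

// let spark infer a struct type from the json sample itself
// (sc, sqlContext and scenarioThreeJson as defined in answer 1)
val inferredType: StructType =
  sqlContext.read.json(sc.parallelize(Seq(scenarioThreeJson))).schema

// convert the inferred struct type back into an avro record schema
// (assumed API: SchemaConverters.convertStructToAvro from databricks spark-avro)
val inferredAvroSchema: Schema = SchemaConverters.convertStructToAvro(
  inferredType, SchemaBuilder.record("test"), "test.name")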
Source: https://stackoverflow.com/questions/48432732/avro-json-additional-field