Question
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

object fixedLength {
  def main(args: Array[String]) {
    def getRow(x: String): Row = {
      val columnArray = new Array[String](4)
      columnArray(0) = x.substring(0, 3)
      columnArray(1) = x.substring(3, 13)
      columnArray(2) = x.substring(13, 18)
      columnArray(3) = x.substring(18, 22)
      Row.fromSeq(columnArray)
    }
    Logger.getLogger("org").setLevel(Level.ERROR)
    val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()
    val conf = new SparkConf().setAppName("FixedLength").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true")
    val sc = new SparkContext(conf)
    val fruits = sc.textFile("in/fruits.txt")
    val schemaString = "id,fruitName,isAvailable,unitPrice"
    val fields = schemaString.split(",").map(field => StructField(field, StringType, nullable = true))
    val schema = StructType(fields)
    val df = spark.createDataFrame(fruits.map { x => getRow(x) }, schema)
    df.show() // Error
    println("End of the program")
  }
}
I'm getting an error on the df.show() call. My file content is (fixed-width, one record per line):

56 apple     TRUE 0.56
45 pear      FALSE1.34
34 raspberry TRUE 2.43
34 plum      TRUE 1.31
53 cherry    TRUE 1.4
23 orange    FALSE2.34
56 persimmon FALSE23.2
ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to [B
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:81)
Can you please help?
Answer 1:
You are creating the RDD the old way, via SparkContext(conf):

val conf = new SparkConf().setAppName("FixedLength").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
val fruits = sc.textFile("in/fruits.txt")
whereas you are creating the DataFrame the new way, via SparkSession:

val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()
val df = spark.createDataFrame(fruits.map { x => getRow(x) }, schema)
Ultimately, you are mixing an RDD created with the old SparkContext into a DataFrame created by the new SparkSession. (Setting spark.driver.allowMultipleContexts=true only disables Spark's guard against starting a second context; it does not make the two contexts interoperate.) I would suggest you stick to one of the two. I suspect that is the reason for the issue.
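
As a quick sanity check (a sketch based on the snippets above), you can confirm that the hand-built context and the one owned by the session are two distinct objects:

// Both contexts coexist only because allowMultipleContexts is set;
// reference equality shows they are separate SparkContexts:
println(sc eq spark.sparkContext) // prints false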
Update

Doing the following should work for you:
def getRow(x: String): Row = {
  val columnArray = new Array[String](4)
  columnArray(0) = x.substring(0, 3)
  columnArray(1) = x.substring(3, 13)
  columnArray(2) = x.substring(13, 18)
  columnArray(3) = x.substring(18, 22)
  Row.fromSeq(columnArray)
}

Logger.getLogger("org").setLevel(Level.ERROR)
val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()
// Reuse the SparkContext owned by the SparkSession instead of building a second one
val fruits = spark.sparkContext.textFile("in/fruits.txt")
val schemaString = "id,fruitName,isAvailable,unitPrice"
val fields = schemaString.split(",").map(field => StructField(field, StringType, nullable = true))
val schema = StructType(fields)
val df = spark.createDataFrame(fruits.map { x => getRow(x) }, schema)
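
For what it's worth, on Spark 2.x you could also skip the RDD and getRow entirely and slice the fixed-width fields with DataFrame column functions (a sketch using the same column offsets; not tested against your exact file):

import org.apache.spark.sql.functions.{col, substring}

// Spark SQL's substring is 1-based and takes (position, length), so the
// (0,3)/(3,13)/(13,18)/(18,22) slices become (1,3)/(4,10)/(14,5)/(19,4).
val df2 = spark.read.text("in/fruits.txt")
  .select(
    substring(col("value"), 1, 3).as("id"),
    substring(col("value"), 4, 10).as("fruitName"),
    substring(col("value"), 14, 5).as("isAvailable"),
    substring(col("value"), 19, 4).as("unitPrice")
  )
df2.show()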
Source: https://stackoverflow.com/questions/49069720/error-with-spark-row-fromseq-for-a-text-file