Question
I need to consume a message from a socket using Spark Streaming, read the file from the filePath specified in the message, and write it to the destination given as fileDst.
Message from socket: {"fileName" : "sampleFile.dat","filePath":"/Users/Desktop/test/abc1.dat","fileDst":"/Users/Desktop/git/spark-streaming-poc/src/main/resourcs/samplefile2"}
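getSchema() is not shown in the code below; presumably it returns a StructType matching the three fields of this message. A minimal sketch, assuming all three fields are strings:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical sketch of getSchema(), which the question does not include.
// Assumes the socket message carries exactly the three string fields shown above.
private static StructType getSchema() {
    return new StructType()
            .add("fileName", DataTypes.StringType)
            .add("filePath", DataTypes.StringType)
            .add("fileDst", DataTypes.StringType);
}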
Error:
java.lang.NullPointerException
at org.apache.spark.sql.execution.SparkPlan.sparkContext(SparkPlan.scala:56)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.metrics$lzycompute(ShuffleExchangeExec.scala:51)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.metrics(ShuffleExchangeExec.scala:50)
at org.apache.spark.sql.execution.SparkPlan.longMetric(SparkPlan.scala:91)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.<init>(ShuffleExchangeExec.scala:67)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.apply(ShuffleExchangeExec.scala:138)
at org.apache.spark.sql.execution.SparkStrategies$BasicOperators$.apply(SparkStrategies.scala:598)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:63)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:63)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:78)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:75)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:75)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:67)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:78)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:75)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:75)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:67)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:72)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:68)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:77)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:77)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$toString$3.apply(QueryExecution.scala:207)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$toString$3.apply(QueryExecution.scala:207)
at org.apache.spark.sql.execution.QueryExecution.stringOrError(QueryExecution.scala:99)
at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:207)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:75)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:656)
at StreamDriver.processRecord(StreamDriver.java:101)
at StreamDriver$1.process(StreamDriver.java:157)
at StreamDriver$1.process(StreamDriver.java:148)
at org.apache.spark.sql.execution.streaming.sources.ForeachDataWriter.write(ForeachWriterProvider.scala:120)
at org.apache.spark.sql.execution.streaming.sources.ForeachDataWriter.write(ForeachWriterProvider.scala:106)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$$anonfun$run$3.apply(WriteToDataSourceV2Exec.scala:118)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$$anonfun$run$3.apply(WriteToDataSourceV2Exec.scala:116)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1394)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:146)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec$$anonfun$doExecute$2.apply(WriteToDataSourceV2Exec.scala:67)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec$$anonfun$doExecute$2.apply(WriteToDataSourceV2Exec.scala:66)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Code:
public static void processRecord(Row value) {
    // Fields parsed from the socket message: 0 = fileName, 1 = filePath, 2 = fileDst
    System.out.println(value.getString(0));
    System.out.println(value.getString(1));
    System.out.println(value.getString(2));

    Dataset<Row> ds1 = null;
    try {
        // Read the pipe-delimited file referenced by the message
        ds1 = spark.read().schema(createSchema(convertJsonToMap()))
                //.option("inferschema", "true")
                .option("header", "false")
                .option("delimiter", "|")
                .csv(value.getString(1));
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Write the file to the destination from the message;
    // the stack trace above points at this write (StreamDriver.java:101)
    ds1.write().mode(SaveMode.Overwrite).option("header", "true").csv(value.getString(2));

    // Earlier attempt using readStream/writeStream, left commented out:
    // try {
    //     ds1 = spark.readStream()
    //             .format("csv")
    //             .schema(createSchema(convertJsonToMap()))
    //             .option("inferSchema", "true")
    //             .option("header", "true")
    //             .option("delimiter", "|")
    //             .load(value.getString(1));
    // } catch (IOException e) {
    //     e.printStackTrace();
    // }
    //
    // StreamingQuery query1 = ds1.writeStream()
    //         .format("csv")
    //         // .mode(SaveMode.Append)
    //         .outputMode(OutputMode.Append())
    //         .option("checkpointLocation", "/Users/kotireddy/git/spark-streaming-poc/src/main/resources")
    //         .option("path", value.getString(2))
    //         .start();
    // try {
    //     query1.awaitTermination();
    // } catch (StreamingQueryException e) {
    //     e.printStackTrace();
    // }
}
public static void main(String[] args) throws InterruptedException, StreamingQueryException {
    StreamingQuery query = null;
    spark.sparkContext().setCheckpointDir("/Users/Desktop/git/spark-streaming-poc/src/main/resources");

    // Socket source: each line received on localhost:12345 becomes a row
    Dataset<Row> ds = spark.readStream()
            .format("socket")
            .option("host", "localhost")
            .option("port", "12345")
            .load();

    // Parse the JSON message into fileName, filePath and fileDst columns
    Dataset<Row> ds2 = ds.select(from_json(col("value").cast("string"), getSchema()).as("data"))
            .select(col("data.fileName"), col("data.filePath"), col("data.fileDst"));

    // For each parsed message, read the referenced file and copy it to the destination
    query = ds2.writeStream().foreach(new ForeachWriter<Row>() {
        @Override
        public boolean open(long partitionId, long version) {
            return true;
        }

        @Override
        public void process(Row value) {
            processRecord(value);
        }

        @Override
        public void close(Throwable errorOrNull) {
            //errorOrNull.getSuppressed();
        }
    }).start();

    query.awaitTermination();
}
}
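createSchema(...) and convertJsonToMap() are also not shown in the question. A minimal sketch of what they might look like, assuming convertJsonToMap() loads a column-name-to-type mapping from a JSON resource (the resource name here is hypothetical) and createSchema(...) turns that map into a StructType:

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical sketch: load a column name -> type name map from a JSON file on the classpath.
// The real resource name and format used in the question are unknown; it throws IOException,
// which matches the try/catch around createSchema(convertJsonToMap()) above.
private static Map<String, String> convertJsonToMap() throws IOException {
    ObjectMapper mapper = new ObjectMapper();
    return mapper.readValue(
            StreamDriver.class.getResourceAsStream("/schema.json"),
            mapper.getTypeFactory().constructMapType(LinkedHashMap.class, String.class, String.class));
}

// Hypothetical sketch: build a StructType from the column name -> type name map.
private static StructType createSchema(Map<String, String> columns) {
    StructType schema = new StructType();
    for (Map.Entry<String, String> column : columns.entrySet()) {
        // Only two types are handled here; extend as needed for the real data file.
        schema = schema.add(column.getKey(),
                "int".equalsIgnoreCase(column.getValue()) ? DataTypes.IntegerType : DataTypes.StringType);
    }
    return schema;
}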
Source: https://stackoverflow.com/questions/64863847/spark-streaming-consuming-message-from-socket-and-processing-null-pointer-exc