问题
I tried the given process (https://azure.microsoft.com/en-in/documentation/articles/hdinsight-apache-spark-eventhub-streaming/) step by step. I have just modified the spark receiver code according to my requirement. The spark streaming consumer api when I am spark-submitting its fetching the data from EventHub as DStream[Array[Bytes]] which I am doing a foreachRDD and converting into an RDD[String] . The issue I am facing here is that the statements below the streaming line are not getting executed until I stop the program execution by pressing ctrl+c.
package com.onerm.spark
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark._
import org.apache.spark.sql.hive.HiveContext
import java.util.concurrent.{Executors, ExecutorService}
object HiveEvents {
def b2s(a: Array[Byte]): String = new String(a)
def main(args: Array[String]): Unit = {
val ehParams = Map[String, String](
"eventhubs.policyname" -> "myreceivepolicy",
"eventhubs.policykey" -> "jgrH/5yjdMjajQ1WUAQsKAVGTu34=",
"eventhubs.namespace" -> "SparkeventHubTest-ns",
"eventhubs.name" -> "SparkeventHubTest",
"eventhubs.partition.count" -> "4",
"eventhubs.consumergroup" -> "$default",
"eventhubs.checkpoint.dir" -> "/EventCheckpoint_0.1",
"eventhubs.checkpoint.interval" -> "10"
)
val conf = new SparkConf().setAppName("Eventhubs Onerm")
val sc= new SparkContext(conf)
val hiveContext = new HiveContext(sc)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val pool:ExecutorService=Executors.newFixedThreadPool(5)
val ssc = new StreamingContext(sc, Seconds(120))
var dataString :RDD[String] =sc.emptyRDD
val stream=EventHubsUtils.createUnionStream(ssc, ehParams)
**//lines below are not getting executed until I stop the execution**
stream.print()
stream.foreachRDD {
rdd =>
if(rdd.isEmpty())
{
println("RDD IS EMPTY ")
}
else
{
dataString=rdd.map(line=>b2s(line))
println("COUNT" +dataString.count())
sqlContext.read.json(dataString).registerTempTable("jsoneventdata")
val filterData=sqlContext.sql("SELECT id,ClientProperties.PID,ClientProperties.Program,ClientProperties.Platform,ClientProperties.Version,ClientProperties.HWType,ClientProperties.OffVer,ContentID,Data,Locale,MappedSources,MarketingMessageContext.ActivityInstanceID,MarketingMessageContext.CampaignID,MarketingMessageContext.SegmentName,MarketingMessageContext.OneRMInstanceID,MarketingMessageContext.DateTimeSegmented,Source,Timestamp.Date,Timestamp.Epoch,TransactionID,UserAction,EventProcessedUtcTime,PartitionId,EventEnqueuedUtcTime from jsoneventdata")
filterData.show(10)
filterData.saveAsParquetFile("EventCheckpoint_0.1/ParquetEvent")
} }
ssc.start()
ssc.awaitTermination()
}
}
来源:https://stackoverflow.com/questions/37416080/spark-streaming-for-azure-event-hubs