Differences between Kafka 0.8 and Kafka 1.0
1. The Kafka 1.0 integration no longer supports the receiver-based connection.
2. The Kafka 1.0 integration saves and updates consumer offsets in Kafka itself.
Kafka 0.8
Uses the high-level consumer API: the code is simple, offsets are fetched automatically and stored in ZooKeeper, and a WAL (write-ahead log) can be used to persist received data to HDFS.
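The WAL mentioned above is not enabled by default. Below is a minimal sketch of turning it on, reusing the checkpoint directory and ZooKeeper addresses from the example that follows; treat it as an assumption-laden illustration rather than part of the original program.
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

val conf = new SparkConf()
  .setAppName("ReceiverWithWAL")
  .setMaster("local[*]")
  // Persist every received block to the write-ahead log before it is processed
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Durations.seconds(3))
// The WAL is written under the checkpoint directory, so it must live on HDFS
ssc.checkpoint("hdfs://had01:8020/study09a")
// With the WAL on, in-memory replication is redundant; use a serialized, non-replicated level
val data = KafkaUtils.createStream(ssc, "had01:2181,had02:2181,had03:2181", "yyy",
  Map("study09a2" -> 1), StorageLevel.MEMORY_AND_DISK_SER)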
Code
package com.bw.sparkStreaming
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Durations, StreamingContext}
/**
* Kafka connected to Spark Streaming with the receiver-based approach.
* Uses the stateful updateStateByKey operation so that word counts accumulate across batches.
*/
object StreamingDemo3 {
/*
String: the key being aggregated (the word)
Seq[Int]: the counts for this key collected in the current batch
Option[Int]: the initial value, or the result accumulated so far
*/
def updateFunc=(itr:Iterator[(String,Seq[Int],Option[Int])])=>{
itr.map(t=>(t._1,t._2.sum+t._3.getOrElse(0)))
}
def main (args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
.setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setCheckpointDir("hdfs://had01:8020/study09a")
val ssc = new StreamingContext(sc,Durations.seconds(3))
val zkQuorum = "had01:2181,had02:2181,had03:2181"
val groupId = "yyy"
//String: topic name, Int: number of consumer threads used for that topic
val topics = Map[String,Int]("study09a2"->1)
//Each record read from Kafka is a tuple: key = the Kafka message key, value = the actual payload
val data: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc,zkQuorum,groupId,topics)
val lines: DStream[String] = data.map(_._2)
val flated = lines.flatMap(_.split(" "))
val wordAndOne: DStream[(String, Int)] = flated.map((_,1))
val reduce: DStream[(String, Int)] = wordAndOne.updateStateByKey(updateFunc,
new HashPartitioner(sc.defaultParallelism), //partitioner for the state RDDs
true)
reduce.print()
ssc.start()
ssc.awaitTermination()
}
}
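To make the semantics of updateFunc above concrete, here is a tiny sketch with made-up sample data: one key, its per-record counts from the current batch, and its previous state.
// Hypothetical input: the word "spark" appeared three times in the current batch
// (counts 1, 1, 1) and its previously accumulated state was 2
val batch = Iterator(("spark", Seq(1, 1, 1), Option(2)))
// The update sums the new counts and adds the previous state: 1 + 1 + 1 + 2 = 5
val updated = batch.map(t => (t._1, t._2.sum + t._3.getOrElse(0)))
println(updated.toList) // List((spark,5))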
Kafka 1.0
Note
If you use Kafka 0.10+, you must add the following dependencies to the pom file:
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
<version>1.1.6</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.2.0</version>
</dependency>
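The kafka010 imports in the code below also require the Spark Kafka 0.10 integration artifact. A sketch of that extra dependency follows; the Scala suffix and version are assumptions for a Spark 2.x / Scala 2.11 build, so align both with your cluster.
<!-- Assumed: Spark 2.x compiled for Scala 2.11; match the version to your Spark installation -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.2.0</version>
</dependency>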
Code
package xxx
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.{Seconds, StreamingContext}
object DirectStream {
def main (args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
.setMaster("local[*]")
val ssc = new StreamingContext(conf,Seconds(5))
val group = "sd"
val topic = "study09a1"
//Kafka consumer parameters
val kafkaParams = Map[String,Object](
"bootstrap.servers" -> "had01:9092,had02:9092,had03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> group,
"auto.offset.reset" -> "earliest", // lastest
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array(topic)
//Read Kafka with the direct approach; offsets are tracked in Kafka itself
val stream = KafkaUtils.createDirectStream[String,String](
ssc,
//Location strategy: PreferConsistent spreads partitions evenly across the available executors
//(use PreferBrokers only when the executors run on the same hosts as the Kafka brokers)
PreferConsistent,
//Consumer strategy (topics can also be subscribed by regex, e.g. "my-orders-.*"; see the SubscribePattern sketch after this code)
Subscribe[String,String](topics,kafkaParams)
)
stream.foreachRDD { rdd => {
//Capture the offset ranges of this batch before processing it
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//Process the data (this println runs on the executors, not the driver)
rdd.foreach(line => {
println(line.key() + "*************" + line.value())
})
//Asynchronously commit the offsets back to Kafka
stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
}
ssc.start()
ssc.awaitTermination()
}
}
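The comment in the code above mentions subscribing to topics by regular expression. A minimal sketch of that variant, reusing ssc and kafkaParams from the example; the pattern study09a.* is a hypothetical choice for illustration.
import java.util.regex.Pattern
import org.apache.spark.streaming.kafka010.ConsumerStrategies.SubscribePattern

// Subscribes to every topic whose name starts with "study09a" (pattern is an assumption)
val patternStream = KafkaUtils.createDirectStream[String, String](
  ssc,
  PreferConsistent,
  SubscribePattern[String, String](Pattern.compile("study09a.*"), kafkaParams)
)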
Source: oschina
Link: https://my.oschina.net/u/4427158/blog/3144341