1 redis的事务(pipeline)测试
Redis对数据进行操作时,单条命令是原子性的,但事务不保证原子性,且没有回滚:事务中任何命令执行失败,其余的命令仍会被执行。不过可以将Redis的多个操作放到一起执行,要成功都成功;如果在提交之前失败了,可以把整个操作放弃(discard),从而实现类似事务的功能。Redis事务包含三个阶段:开始事务、命令入队、执行事务。Redis的分片副本集集群不支持pipeline,Redis只支持单机版的事务(pipeline),Redis的主从复制也支持pipeline(目前一些公司就是这样干的)。若是想用集群,可以使用MongoDB:MongoDB集群支持事务,是一个NoSQL文档数据库,支持存储海量数据,安全、可扩容。
RedisPipelineTest
package com._51doit.spark14

import com._51doit.utils.JedisConnectionPool
import redis.clients.jedis.{Jedis, Pipeline}

/**
 * Demonstrates a Redis MULTI/EXEC transaction issued through a Jedis pipeline.
 *
 * Two `hincrBy` commands are meant to be committed together. A deliberate
 * division by zero is placed between them, so `exec()` is never reached and
 * the `catch` branch discards the queued commands instead.
 */
object RedisPipeLineTest {

  def main(args: Array[String]): Unit = {
    val jedis: Jedis = JedisConnectionPool.getConnection
    try {
      jedis.select(1)
      // Obtain a pipeline from the Jedis connection
      val pipeline: Pipeline = jedis.pipelined()
      try {
        // Open a transaction so that the following operations run as one batch
        pipeline.multi()
        try {
          pipeline.hincrBy("AAA", "a", 200)
          // Deliberate failure (ArithmeticException) to exercise the discard path
          val i = 1 / 0
          pipeline.hincrBy("BBB", "b", 20)
          // Commit the transaction
          pipeline.exec()
          pipeline.sync()
        } catch {
          case e: Exception =>
            // Throw away the commands queued so far
            pipeline.discard()
            e.printStackTrace()
        }
      } finally {
        pipeline.close()
      }
    } finally {
      // Always hand the connection back, even if select/pipelined/multi failed
      jedis.close()
    }
  }
}
2. 利用redis的pipeline实现数据统计的exactlyonce
ExactlyOnceWordCountOffsetStoreInRedis
package cn._51doit.spark.day14

import cn._51doit.spark.utils.{JedisConnectionPool, OffsetUtils}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import redis.clients.jedis.{Jedis, Pipeline}

/**
 * Word count over Kafka with exactly-once semantics; offsets are kept in Redis.
 *
 *  1. The aggregated result of each batch is collected to the driver.
 *  2. The result and the batch's Kafka offsets are written to Redis inside a
 *     single MULTI/EXEC transaction issued through one pipeline.
 *  3. On success the transaction is committed.
 *  4. On failure the queued commands are discarded and the streaming context
 *     is stopped so the job can be restarted.
 *
 * Expected arguments: `isLocal appName groupId topic1,topic2,...`
 * (for example `true a1 g1 ta,tb`).
 */
object ExactlyOnceWordCountOffsetStoreInRedis {

  def main(args: Array[String]): Unit = {
    // Fail with a usage message instead of a bare MatchError on a wrong argument count
    require(
      args.length == 4,
      "Usage: ExactlyOnceWordCountOffsetStoreInRedis <isLocal> <appName> <groupId> <topic1,topic2,...>"
    )
    val Array(isLocal, appName, groupId, allTopics) = args

    val conf = new SparkConf().setAppName(appName)
    if (isLocal.toBoolean) {
      conf.setMaster("local[*]")
    }

    // Create the StreamingContext with a 5 second batch interval
    val ssc = new StreamingContext(conf, Milliseconds(5000))
    // Set the log level
    ssc.sparkContext.setLogLevel("WARN")

    // Integrate Spark Streaming with Kafka through a direct stream
    val topics = allTopics.split(",")

    // Kafka consumer parameters. By default a Kafka consumer auto-commits its
    // offsets every 5 seconds to the internal topic __consumer_offsets; that is
    // switched off here because this job stores offsets in Redis itself.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "node-1.51doit.cn:9092,node-2.51doit.cn:9092,node-3.51doit.cn:9092",
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "group.id" -> groupId,
      // With no recorded offset start from the earliest record, otherwise resume
      "auto.offset.reset" -> "earliest",
      // The consumer must not commit offsets on its own
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Before creating the stream, look up previously stored offsets in Redis:
    // none stored means read from the beginning, otherwise resume from them.
    val offsets: Map[TopicPartition, Long] =
      OffsetUtils.queryHistoryOffsetFromRedis(appName, groupId)

    // Direct stream: RDD partitions correspond one-to-one to Kafka partitions.
    val kafkaDStream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        // Distribute Kafka partitions evenly across the available executors
        LocationStrategies.PreferConsistent,
        // Subscribe to the topics, starting from the stored offsets
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets)
      )

    kafkaDStream.foreachRDD(rdd => {
      // Skip batches that carry no data
      if (!rdd.isEmpty()) {
        // Offset ranges of every partition in this batch
        val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

        // Word count business logic
        val words: RDD[String] = rdd.flatMap(_.value().split(" "))
        val wordsAndOne: RDD[(String, Int)] = words.map((_, 1))
        val reduced: RDD[(String, Int)] = wordsAndOne.reduceByKey(_ + _)

        // Trigger the action and bring the result to the driver, so that the
        // data and the offsets can be written in the same transaction.
        val res: Array[(String, Int)] = reduced.collect()

        // The Redis connection is created on the driver. Both references start
        // as null so the cleanup code can tell how far the setup got.
        var jedis: Jedis = null
        var pipeline: Pipeline = null
        try {
          jedis = JedisConnectionPool.getConnection()
          pipeline = jedis.pipelined()
          pipeline.select(1)
          // Open the transaction that groups all following writes
          pipeline.multi()

          // Write the computed counts
          for ((word, count) <- res) {
            pipeline.hincrBy("WORD_COUNT", word, count)
          }

          // Write the offsets, overwriting the previously stored ones
          val offsetKey = appName + "_" + groupId
          for (offsetRange <- offsetRanges) {
            val field = offsetRange.topic + "_" + offsetRange.partition
            pipeline.hset(offsetKey, field, offsetRange.untilOffset.toString)
          }

          // Commit the transaction
          pipeline.exec()
          pipeline.sync()
        } catch {
          case e: Exception =>
            // Best-effort rollback: the pipeline may not exist yet, and a failing
            // discard must neither hide the original error nor skip ssc.stop().
            if (pipeline != null) {
              try pipeline.discard()
              catch {
                case discardError: Exception => e.addSuppressed(discardError)
              }
            }
            e.printStackTrace()
            // Stop the job so it can be restarted from the last stored offsets
            ssc.stop()
        } finally {
          // Null-safe cleanup; the connection is released even if closing the
          // pipeline throws.
          try {
            if (pipeline != null) pipeline.close()
          } finally {
            if (jedis != null) jedis.close()
          }
        }
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
查询redis的历史偏移量:OffsetUtils(queryHistoryOffsetFromRedis)
package cn._51doit.spark.utils

import java.sql.{Connection, DriverManager, ResultSet}
import java.util

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange

import scala.collection.mutable

/**
 * Helpers for loading and storing Kafka offsets in MySQL, Redis and
 * HBase (through Phoenix JDBC).
 *
 * Offsets are keyed by the string `<topic>_<partition>`.
 */
object OffsetUtils {

  /**
   * Parses a `<topic>_<partition>` key into a [[TopicPartition]].
   *
   * The key is split on its LAST underscore, so topic names that themselves
   * contain underscores are handled correctly.
   *
   * @throws IllegalArgumentException if the key has no underscore separator or
   *                                  an empty topic/partition part
   * @throws NumberFormatException    if the partition part is not an integer
   */
  private def parseTopicPartition(topicAndPartition: String): TopicPartition = {
    val separator = topicAndPartition.lastIndexOf('_')
    require(
      separator > 0 && separator < topicAndPartition.length - 1,
      s"Malformed topic_partition key: $topicAndPartition"
    )
    val topic = topicAndPartition.substring(0, separator)
    val partition = topicAndPartition.substring(separator + 1).toInt
    new TopicPartition(topic, partition)
  }

  /**
   * Runs a single-parameter query whose rows are (topic_partition, offset) and
   * collects them into a map. Closes the result set, the statement and the
   * given connection before returning, whether or not the query succeeds.
   */
  private def queryOffsets(connection: Connection, sql: String, param: String): Map[TopicPartition, Long] = {
    try {
      val ps = connection.prepareStatement(sql)
      try {
        ps.setString(1, param)
        val rs: ResultSet = ps.executeQuery()
        try {
          val offsets = new mutable.HashMap[TopicPartition, Long]()
          while (rs.next()) {
            offsets(parseTopicPartition(rs.getString(1))) = rs.getLong(2)
          }
          offsets.toMap
        } finally {
          rs.close()
        }
      } finally {
        ps.close()
      }
    } finally {
      connection.close()
    }
  }

  /**
   * Loads the stored offsets of an application / consumer group from MySQL.
   *
   * @param appName application name
   * @param groupId Kafka consumer group id
   * @return stored offset per topic partition; empty if nothing is stored
   */
  def queryHistoryOffsetFromMySQL(appName: String, groupId: String): Map[TopicPartition, Long] = {
    // NOTE(review): URL and credentials are hard-coded; consider externalizing them.
    val connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
    queryOffsets(
      connection,
      "SELECT topic_partition, offset FROM t_kafka_offset WHERE app_gid = ?",
      appName + "_" + groupId
    )
  }

  /**
   * Upserts the end offset of every given range into MySQL.
   *
   * The connection is owned by the caller and is NOT closed here, so the caller
   * can make this update part of its own transaction.
   *
   * @param appNameAndGroupId value stored in the `app_gid` column
   * @param offsetRanges      offset ranges of the processed batch
   * @param connection        open JDBC connection supplied by the caller
   */
  def updateOffsetToMySQL(appNameAndGroupId: String, offsetRanges: Array[OffsetRange], connection: Connection): Unit = {
    val ps = connection.prepareStatement(
      "INSERT INTO t_kafka_offset (app_gid, topic_partition, offset) VALUES (?, ?, ?) " +
        "ON DUPLICATE KEY UPDATE offset = ?"
    )
    try {
      for (offsetRange <- offsetRanges) {
        // End offset of this range, used for both the insert and the update part
        val untilOffset = offsetRange.untilOffset
        ps.setString(1, appNameAndGroupId)
        ps.setString(2, offsetRange.topic + "_" + offsetRange.partition)
        ps.setLong(3, untilOffset)
        ps.setLong(4, untilOffset)
        ps.executeUpdate()
      }
    } finally {
      // Release the statement even when one of the updates fails
      ps.close()
    }
  }

  /**
   * Loads the stored offsets of an application / consumer group from Redis
   * (database 1, hash `<appName>_<groupId>`).
   *
   * @param appName application name
   * @param groupId Kafka consumer group id
   * @return stored offset per topic partition; empty if nothing is stored
   */
  def queryHistoryOffsetFromRedis(appName: String, groupId: String): Map[TopicPartition, Long] = {
    // Explicit converters instead of the deprecated implicit JavaConversions
    import scala.collection.JavaConverters._

    val jedis = JedisConnectionPool.getConnection()
    try {
      jedis.select(1)
      val topicPartitionAndOffsets: util.Map[String, String] = jedis.hgetAll(appName + "_" + groupId)
      topicPartitionAndOffsets.asScala.map {
        case (topicAndPartition, offset) => parseTopicPartition(topicAndPartition) -> offset.toLong
      }.toMap
    } finally {
      // Always release the connection
      jedis.close()
    }
  }

  /**
   * Loads the stored offsets of a consumer group from HBase via Phoenix; meant
   * to be called every time the application starts.
   *
   * @param view    not used by the query; kept for interface compatibility
   * @param groupid Kafka consumer group id
   * @return highest stored offset per topic partition; empty if nothing is stored
   */
  def queryHistoryOffsetFromHbase(view: String, groupid: String): Map[TopicPartition, Long] = {
    val connection = DriverManager.getConnection("jdbc:phoenix:node-1.51doit.cn,node-2.51doit.cn,node-3.51doit.cn:2181")
    queryOffsets(
      connection,
      "select \"topic_partition\", max(\"offset\") from \"myorder\" where \"groupid\" = ? group by \"topic_partition\"",
      groupid
    )
  }
}
以上查询偏移量以及保存偏移量的逻辑,都可以写到一个工具类中,封装成方法;上述OffsetUtils中将偏移量存入MySQL的部分就是这样做的。
注意:以上的统计结果都能收集到driver端的原因是数据统计是聚合类的操作(数据量必定小),若不是聚合类的操作,则不能收集到driver端,进而达不到将数据和偏移量同时写入数据库的需求,解决办法如3
3 SparkStreaming中数据写入Hbase实现ExactlyOnce
首都
来源:https://www.cnblogs.com/jj1106/p/12383885.html