1. Example 1: built-in accumulators (LongAccumulator and CollectionAccumulator)
package sparkday05

import org.apache.spark.util.{CollectionAccumulator, LongAccumulator}
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("accumulator").setMaster("local[4]")
    val sc: SparkContext = new SparkContext(conf)

    val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8))

    // Long-typed accumulator
    def longAccumulator(name: String): LongAccumulator = {
      val acc = new LongAccumulator
      sc.register(acc, name)
      acc
    }
    val acc1 = longAccumulator("longAcc")
    // The add() calls run on the executor side
    rdd1.foreach(x => acc1.add(x))
    // Reading the value happens on the driver side
    println(acc1.value)

    // Collection-typed accumulator
    def collectionAccumulator(name: String): CollectionAccumulator[Int] = {
      val acc = new CollectionAccumulator[Int]
      sc.register(acc, name)
      acc
    }
    val acc2 = collectionAccumulator("collectionACC")
    rdd1.foreach(x => acc2.add(x))
    println(acc2.value)
  }
}
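Since Spark 2.0, SparkContext also exposes factory methods that create and register these built-in accumulators in one call, so the helper functions above are optional. A minimal sketch (the object name AccumulatorFactoryDemo is made up for this illustration):

import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorFactoryDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("accumulator-factory").setMaster("local[4]"))
    val rdd = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8))

    // longAccumulator / collectionAccumulator create and register in one step
    val sumAcc  = sc.longAccumulator("longAcc")
    val listAcc = sc.collectionAccumulator[Int]("collectionACC")

    rdd.foreach { x =>
      sumAcc.add(x)
      listAcc.add(x)
    }
    println(sumAcc.value)  // 36
    println(listAcc.value) // the eight elements, in no guaranteed order
    sc.stop()
  }
}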
2. Example 2: the accumulator trap
package sparkday05

import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorDemo2 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("accumulator").setMaster("local[4]")
    val sc: SparkContext = new SparkContext(conf)

    val rdd = sc.parallelize(1 to 100)
    val acc = new LongAccumulator
    sc.register(acc, "accDemo")

    // Use the accumulator to count the even numbers in the RDD
    val rdd1 = rdd.map(x => {
      if (x % 2 == 0) acc.add(1) else 0
    })

    // The accumulator trap: acc is updated inside a transformation, so it is
    // incremented every time rdd1 is recomputed. Caching rdd1 avoids the
    // recomputation, so the second count() no longer touches the accumulator.
    val rdd2 = rdd1.cache()

    rdd2.count()
    println(acc.value) // 50
    rdd2.count()
    println(acc.value) // still 50, because rdd2 is served from the cache
  }
}
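To see the trap itself, here is a hedged sketch of the same job without the cache() call (the object name AccumulatorTrapDemo is invented for illustration). Because nothing is cached, the second count() recomputes rdd1 from scratch, the updates inside map run a second time, and the accumulator double-counts:

import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorTrapDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("accumulator-trap").setMaster("local[4]"))
    val acc = new LongAccumulator
    sc.register(acc, "accDemo")

    // Same transformation as above, but the result is NOT cached
    val rdd1 = sc.parallelize(1 to 100).map(x => if (x % 2 == 0) acc.add(1) else 0)

    rdd1.count()
    println(acc.value) // 50
    rdd1.count()
    println(acc.value) // 100 -- the map ran again, so the accumulator double-counted
    sc.stop()
  }
}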
3. A custom accumulator (WordCount with AccumulatorV2)
package sparkRdd_practice

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

// A custom accumulator that implements WordCount
class myAccumulator extends AccumulatorV2[String, mutable.HashMap[String, Int]] {

  private val hashmapAcc = new mutable.HashMap[String, Int]()

  // Whether the accumulator is in its zero (empty) state
  override def isZero: Boolean = {
    hashmapAcc.isEmpty
  }

  // Copy the accumulator, returning a new instance
  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = {
    val newAcc = new myAccumulator()
    hashmapAcc.synchronized {
      newAcc.hashmapAcc ++= hashmapAcc
    }
    newAcc
  }

  // Reset the accumulator, clearing its data;
  // after reset, isZero must return true
  override def reset(): Unit = {
    hashmapAcc.clear()
  }

  // Called by tasks for each record; accumulates the record into the accumulator
  override def add(word: String): Unit = {
    hashmapAcc.get(word) match {
      case Some(v) => hashmapAcc += ((word, v + 1))
      case None    => hashmapAcc += ((word, 1))
    }
  }

  // Merge the results of accumulators from multiple partitions
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit = {
    other match {
      case acc: AccumulatorV2[String, mutable.HashMap[String, Int]] =>
        for ((k, v) <- acc.value) {
          hashmapAcc.get(k) match {
            case Some(newv) => hashmapAcc += ((k, v + newv))
            case None       => hashmapAcc += ((k, v))
          }
        }
    }
  }

  override def value: mutable.HashMap[String, Int] = hashmapAcc
}

object DefineAccumulatorDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("accumulator").setMaster("local[4]")
    val sc: SparkContext = new SparkContext(conf)

    val rdd = sc.parallelize(List("scala", "java", "c", "scala", "c", "scala", "scala"))

    // Create the accumulator and register it with the SparkContext
    val acc = new myAccumulator()
    sc.register(acc, "wordCountAcc")

    rdd.foreach(x => acc.add(x))
    // Read the final accumulated result on the driver through value
    println(acc.value)
  }
}
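As a quick sanity check, not part of the original article, the accumulator result can be compared with the classic reduceByKey word count over the same data (the object name WordCountCrossCheck is invented for illustration); both should report scala -> 4, c -> 2, java -> 1, with key order not guaranteed:

import org.apache.spark.{SparkConf, SparkContext}

object WordCountCrossCheck {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("wordcount-check").setMaster("local[4]"))
    val rdd = sc.parallelize(List("scala", "java", "c", "scala", "c", "scala", "scala"))

    // The standard reduceByKey word count should report the same counts
    // as the custom accumulator above.
    val counts = rdd.map(word => (word, 1)).reduceByKey(_ + _).collectAsMap()
    println(counts) // e.g. Map(scala -> 4, c -> 2, java -> 1)
    sc.stop()
  }
}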
Source: https://blog.csdn.net/weixin_43562705/article/details/91439244