Spark Accumulator Examples

Submitted anonymously (unverified) on 2019-12-02 23:40:02

Example 1: built-in accumulators

package sparkday05

import org.apache.spark.util.{CollectionAccumulator, LongAccumulator}
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorDemo {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("accumulator").setMaster("local[4]")
    val sc: SparkContext = new SparkContext(conf)

    val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8))

    // Long-typed accumulator
    def longAccumulator(name: String): LongAccumulator = {
      val acc = new LongAccumulator
      sc.register(acc, name)
      acc
    }
    val acc1 = longAccumulator("longAcc")
    // Accumulate on the executor side
    rdd1.foreach(x => acc1.add(x))
    // Read the result on the driver side
    println(acc1.value)

    // Collection-typed accumulator
    def collectionAccumulator(name: String): CollectionAccumulator[Int] = {
      val acc = new CollectionAccumulator[Int]
      sc.register(acc, name)
      acc
    }
    val acc2 = collectionAccumulator("collectionACC")
    rdd1.foreach(x => acc2.add(x))
    println(acc2.value)
  }
}
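In Spark 2.x the SparkContext also provides convenience methods that create and register these built-in accumulators in one call. A minimal sketch, reusing rdd1 and the accumulator names from the example above:

// Create and register in one step instead of new + sc.register
val accA = sc.longAccumulator("longAcc")
val accB = sc.collectionAccumulator[Int]("collectionACC")

rdd1.foreach(x => accA.add(x))
println(accA.value) // 36, the sum of 1..8

rdd1.foreach(x => accB.add(x))
println(accB.value) // the eight elements, in no particular order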

Example 2: the accumulator trap and cache()

package sparkday05

import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorDemo2 {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("accumulator").setMaster("local[4]")
    val sc: SparkContext = new SparkContext(conf)

    val rdd = sc.parallelize(1 to 100)
    val acc = new LongAccumulator
    sc.register(acc, "accDemo")

    // Use the accumulator to count the even numbers in the RDD
    val rdd1 = rdd.map(x => {
      if (x % 2 == 0) acc.add(1)
      else 0
    })
    // Work around the accumulator trap by caching the RDD,
    // so the map (and the accumulation) only runs once
    val rdd2 = rdd1.cache()
    // Accumulator trap: without cache(), each action would re-run the map
    rdd2.count()
    println(acc.value) // 50
    rdd2.count()
    println(acc.value) // still 50, because the cached data is reused
  }
}
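For contrast, a minimal sketch of the trap itself, without cache(): since the map is a lazy transformation, every action re-executes it, and the accumulator is incremented once per action.

val accTrap = sc.longAccumulator("trapDemo")
val mapped = sc.parallelize(1 to 100).map { x =>
  if (x % 2 == 0) accTrap.add(1)
  x
}
mapped.count()
println(accTrap.value) // 50
mapped.count()
println(accTrap.value) // 100: every even number was counted twice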

Example 3: a custom accumulator

package sparkRdd_practice

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable

// Use a custom accumulator to implement WordCount
class myAccumulator extends AccumulatorV2[String, mutable.HashMap[String, Int]] {

  private val hashmapAcc = new mutable.HashMap[String, Int]()

  // Whether the accumulator is in its zero (empty) state
  override def isZero: Boolean = hashmapAcc.isEmpty

  // Copy the accumulator, returning a new instance
  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = {
    val newAcc = new myAccumulator()
    hashmapAcc.synchronized {
      newAcc.hashmapAcc ++= hashmapAcc
    }
    newAcc
  }

  // Reset the accumulator, clearing its data.
  // After reset, isZero must return true.
  override def reset(): Unit = hashmapAcc.clear()

  // Called by tasks for each record: accumulate the word into the map
  override def add(word: String): Unit = {
    hashmapAcc.get(word) match {
      case Some(v) => hashmapAcc += ((word, v + 1))
      case None    => hashmapAcc += ((word, 1))
    }
  }

  // Merge the results of the accumulators from multiple partitions
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit = {
    for ((k, v) <- other.value) {
      hashmapAcc.get(k) match {
        case Some(oldV) => hashmapAcc += ((k, v + oldV))
        case None       => hashmapAcc += ((k, v))
      }
    }
  }

  override def value: mutable.HashMap[String, Int] = hashmapAcc
}

object DefineAccumulatorDemo {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("accumulator").setMaster("local[4]")
    val sc: SparkContext = new SparkContext(conf)

    val rdd = sc.parallelize(List("scala", "java", "c", "scala", "c", "scala", "scala"))

    // Create the accumulator, then register it with the SparkContext
    val acc = new myAccumulator()
    sc.register(acc, "wordCountAcc")

    rdd.foreach(x => acc.add(x))

    // Read the merged result on the driver side via value
    println(acc.value)
  }
}
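With this input, acc.value ends up holding scala -> 4, c -> 2, java -> 1 (HashMap ordering is not guaranteed). Because the words are accumulated inside foreach, which is an action, each record is processed exactly once; if you accumulate inside a transformation instead, the re-execution caveat from Example 2 applies here as well.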
Source: https://blog.csdn.net/weixin_43562705/article/details/91439244