Implementing WordCount with a custom Spark accumulator
To write your own accumulator, extend AccumulatorV2 and implement the following methods:
import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

class AccWordCount extends AccumulatorV2[String, mutable.HashMap[String, Int]] {
  override def isZero: Boolean = ???
  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = ???
  override def reset(): Unit = ???
  override def add(v: String): Unit = ???
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit = ???
  override def value: mutable.HashMap[String, Int] = ???
}
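In rough terms, the contract is: isZero reports whether the accumulator is still in its initial state (Spark verifies this after copy-and-reset when shipping the accumulator to executors), copy and reset produce the per-task working copies, add folds one record into a task's local copy, merge combines a task's result back into the driver's copy, and value exposes the final result.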
Filling in each method gives the complete accumulator:

class AccWordCount extends AccumulatorV2[String, mutable.HashMap[String, Int]] {

  private val hashAcc = new mutable.HashMap[String, Int]()

  override def isZero: Boolean = hashAcc.isEmpty

  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = {
    val newAcc = new AccWordCount
    // Several tasks may write concurrently, so guard against races.
    hashAcc.synchronized {
      newAcc.hashAcc ++= this.hashAcc
    }
    newAcc
  }

  override def reset(): Unit = hashAcc.clear()

  // Local accumulation: each task bumps the count of one word in its own copy.
  override def add(v: String): Unit = {
    hashAcc.synchronized {
      hashAcc.get(v) match {
        case None    => hashAcc += ((v, 1))
        case Some(x) => hashAcc += ((v, x + 1))
      }
    }
  }

  // Global accumulation: the driver merges each task's map into this one.
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit = {
    other match {
      case o: AccumulatorV2[String, mutable.HashMap[String, Int]] =>
        for ((k, v) <- o.value) {
          hashAcc.get(k) match {
            case None    => hashAcc += ((k, v))
            case Some(x) => hashAcc += ((k, x + v)) // sum the counts, not x + 1
          }
        }
    }
  }

  override def value: mutable.HashMap[String, Int] = hashAcc
}
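One caveat: the pattern case o: AccumulatorV2[String, mutable.HashMap[String, Int]] is unchecked because of JVM type erasure, so it matches any AccumulatorV2 at runtime. A minimal variant of merge, my sketch rather than the original post's code, matches on the concrete class instead:

  // Sketch: matching on the concrete class avoids the unchecked-erasure
  // warning and rejects incompatible accumulators explicitly.
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit =
    other match {
      case o: AccWordCount =>
        for ((k, v) <- o.value)
          hashAcc += ((k, hashAcc.getOrElse(k, 0) + v))
      case _ =>
        throw new UnsupportedOperationException(
          s"Cannot merge ${getClass.getName} with ${other.getClass.getName}")
    }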
On the driver side, register the accumulator and run the job:

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object AccWordCountDemo {
  def main(args: Array[String]): Unit = {
    // Boilerplate: a local SparkContext.
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Sample data.
    val rdd: RDD[String] = sc.makeRDD(List("hello", "world", "python", "spark", "spark", "hello", "scala", "spark", "python"))
    // Create and register the accumulator.
    val acc = new AccWordCount
    sc.register(acc, "acc")
    // Distributed accumulation: each task calls add; Spark merges the copies on the driver.
    rdd.foreach(x => acc.add(x))
    println(acc.value)
    sc.stop()
  }
}
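As a quick sanity check (a sketch, not part of the original post), comparing against Spark's built-in countByValue action inside main, before sc.stop(), should give the same counts:

    // countByValue returns Map[String, Long], computed with a regular Spark job.
    val expected = rdd.countByValue()
    println(expected) // e.g. Map(hello -> 2, world -> 1, python -> 2, spark -> 3, scala -> 1)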
Result: the driver prints the merged map, e.g. Map(world -> 1, scala -> 1, hello -> 2, python -> 2, spark -> 3) (HashMap iteration order may vary).
Source: CSDN
Author: 赵昕彧
Link: https://blog.csdn.net/qq_40579464/article/details/103586057