Setup
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf
conf = SparkConf().setAppName("lg").setMaster('local[4]')
sc = SparkContext.getOrCreate(conf)
1. aggregateByKey
In aggregateByKey, the first function (the sequence function) is applied inside each partition, and the second function merges the per-partition results across partitions.
zeroVal is the initial value of the accumulator; it is applied once for each key in each partition (not once per key overall), so the number of partitions a key spans affects the result.
rdd = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)], 5)   # 5 slices, so glom() below shows one element per partition
zeroVal = 1
mergeVal = (lambda aggregated, el: aggregated + el)   # runs within a partition; aggregated starts from zeroVal
mergeComb = (lambda agg1, agg2: agg1 + agg2)          # merges the per-partition results
result = rdd.aggregateByKey(zeroVal, mergeVal, mergeComb)
print(rdd.glom().collect())
print(result.collect())
[[('B', 1)], [('B', 2)], [('A', 3)], [('A', 4)], [('A', 5)]]
[('A', 15), ('B', 5)]
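Working through the output above (with the 5 single-element partitions shown by glom()): key 'A' appears in 3 partitions, so zeroVal is added once per partition, giving 3 + 4 + 5 + 3*1 = 15; key 'B' appears in 2 partitions, giving 1 + 2 + 2*1 = 5.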
2. reduceByKey and groupByKey
Both aggregate elements by key, but they work differently under the hood.
def add(a, b):
    c = a + b
    return c
rdd = sc.parallelize([('a',1),('b',1),('a',1),('a',1),('b',1),('b',1),('a',1),('a',1),('a',1),('b',1),('b',1),('b',1)])
print(rdd.reduceByKey(add).collect())
print(rdd.groupByKey().mapValues(len).collect())
[('b', 6), ('a', 6)]
[('b', 6), ('a', 6)]
(1) groupByKey() shuffles every record in the RDD: each (key, value) pair is sent to the partition its key hashes to, and only then are the values aggregated.
(2) aggregateByKey() first aggregates within each partition by key, then shuffles only those partial results and merges them across partitions, so far less data moves over the network than with groupByKey().
(3) reduceByKey() likewise combines values locally within each partition before the shuffle, so its I/O cost is lower than groupByKey()'s (see the sketch below).
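A quick way to see the difference with the rdd defined above: groupByKey() ships every individual value across the network and materializes it per key, while reduceByKey() pre-sums on each partition first. A small sketch (the mapValues(list) call is only there to make the grouped values visible):
grouped = rdd.groupByKey().mapValues(list)   # every value for a key is shuffled and held per key
print(grouped.collect())                     # e.g. [('b', [1, 1, 1, 1, 1, 1]), ('a', [1, 1, 1, 1, 1, 1])] (order may vary)
reduced = rdd.reduceByKey(add)               # values are pre-combined on each partition before the shuffle
print(reduced.collect())                     # e.g. [('b', 6), ('a', 6)]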
3. sortByKey and sortBy
sortByKey sorts by the elements' keys, in ascending or descending order; sortBy additionally takes a keyfunc, so you can sort by key, by value, or by anything derived from the element.
rdd1 = sc.parallelize([(1,'one'),(6,'six'),(7,'seven'),(2,'two'),(3,'three'),(4,'four'),(5,'five'),(8,'eight'),(9,'night'),(10,'ten')])
rdd2 = rdd1.sortByKey(ascending=True, numPartitions=None)
rdd3 = rdd1.sortBy(ascending=True, numPartitions=None, keyfunc = lambda x: x[1])
rdd4 = rdd1.sortBy(ascending=False, numPartitions=None, keyfunc = lambda x: x[0])
print(rdd2.collect())
print(rdd3.collect())
print(rdd4.collect())
[(1, 'one'), (2, 'two'), (3, 'three'), (4, 'four'), (5, 'five'), (6, 'six'), (7, 'seven'), (8, 'eight'), (9, 'night'), (10, 'ten')]
[(8, 'eight'), (5, 'five'), (4, 'four'), (9, 'night'), (1, 'one'), (7, 'seven'), (6, 'six'), (10, 'ten'), (3, 'three'), (2, 'two')]
[(10, 'ten'), (9, 'night'), (8, 'eight'), (7, 'seven'), (6, 'six'), (5, 'five'), (4, 'four'), (3, 'three'), (2, 'two'), (1, 'one')]
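rdd3 only looks shuffled because keyfunc = lambda x: x[1] sorts the pairs alphabetically by their value ('eight' < 'five' < 'four' < ...). When only the first few elements in sorted order are needed, takeOrdered avoids sorting and collecting the whole RDD; a small sketch (its key argument plays the same role as sortBy's keyfunc):
print(rdd1.takeOrdered(3, key=lambda x: x[0]))    # 3 smallest keys: [(1, 'one'), (2, 'two'), (3, 'three')]
print(rdd1.takeOrdered(3, key=lambda x: -x[0]))   # 3 largest keys: [(10, 'ten'), (9, 'night'), (8, 'eight')]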
4. countByKey
count() returns the number of elements in the RDD as an int. countByKey() returns how many times each key occurs, as a defaultdict on the driver; it simply uses element[0] as the key, so on an RDD of plain strings the first character acts as the key, and on numeric elements it raises an error. countByValue() returns how many times each distinct element occurs, also as a dict.
counts = rdd1.count()
print("Number of elements in RDD -> %i" % counts)
print("Number of every elements in RDD -> %s" % rdd1.countByKey())
print("Number of every elements in RDD -> %s" % rdd1.countByValue())
Number of elements in RDD -> 10
Occurrences of each key in RDD -> defaultdict(<class 'int'>, {1: 1, 6: 1, 7: 1, 2: 1, 3: 1, 4: 1, 5: 1, 8: 1, 9: 1, 10: 1})
Occurrences of each element in RDD -> defaultdict(<class 'int'>, {(1, 'one'): 1, (6, 'six'): 1, (7, 'seven'): 1, (2, 'two'): 1, (3, 'three'): 1, (4, 'four'): 1, (5, 'five'): 1, (8, 'eight'): 1, (9, 'night'): 1, (10, 'ten'): 1})
rdd1 = sc.parallelize(['hive','hbase','hadoop','spark','flink','storm'])
print("Occurrences of each key in RDD1 -> %s" % rdd1.countByKey())
Occurrences of each key in RDD1 -> defaultdict(<class 'int'>, {'h': 3, 's': 2, 'f': 1})
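Note that countByKey() and countByValue() are actions that bring the whole result dictionary back to the driver. If the number of distinct keys is large, one option (a sketch, reusing the add function from section 2) is to keep the counting distributed with reduceByKey:
counts_rdd = rdd1.map(lambda s: (s[0], 1)).reduceByKey(add)   # still an RDD; nothing is collected yet
print(counts_rdd.collect())   # e.g. [('h', 3), ('s', 2), ('f', 1)] (order may vary)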
5. combineByKey
Parameters:
createCombiner: converts a value V of the input RDD[(K,V)] into the combined type C of the result RDD[(K,C)]; V and C may be the same type or different types. It is called for the first value of a key seen in a partition.
mergeValue: merges a further V into an existing C, within a partition.
mergeCombiners: merges the C values produced by mergeValue across partitions, i.e. the reduce step.
SparkContext.defaultParallelism = 5   # hack: overrides the class property so parallelize() creates 5 partitions; passing numSlices=5 to parallelize is the cleaner way
rdd = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
print(rdd.glom().collect())
def createCombiner(el):            # called on the first value seen for a key within a partition
    c = el**2
    return c
def mergeValue(aggregated, el):    # aggregated is the running result in the partition, el the next value for the key
    c = aggregated + el
    return c
def mergeValue2(aggregated, el):   # ignores el, so each partition keeps createCombiner's result for the key
    c = aggregated
    return c
def mergeValue3(aggregated, el):   # keeps only el, so each partition ends with the key's last value
    c = el
    return c
def mergeCombiners(agg1, agg2):    # agg1 and agg2 are results from two different partitions
    c = agg1 + agg2
    return c
result1 = rdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
result2 = rdd.combineByKey(createCombiner, mergeValue2, mergeCombiners)
result3 = rdd.combineByKey(createCombiner, mergeValue3, mergeCombiners)
print(result1.collect())
print(result2.collect())
print(result3.collect())
[[('B', 1)], [('B', 2)], [('A', 3)], [('A', 4)], [('A', 5)]]
[('B', 6), ('A', 18)]
[('B', 1), ('A', 9)]
[('B', 3), ('A', 5)]
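The usual illustration of V and C being different types is a per-key mean, where C is a (sum, count) tuple. A minimal sketch along those lines (data and variable names here are illustrative, not from the original example):
data = sc.parallelize([('A', 3), ('A', 4), ('A', 5), ('B', 1), ('B', 2)])
sum_count = data.combineByKey(
    lambda v: (v, 1),                               # createCombiner: V -> C
    lambda c, v: (c[0] + v, c[1] + 1),              # mergeValue: fold another V into C
    lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1]))  # mergeCombiners: merge Cs across partitions
print(sum_count.mapValues(lambda c: c[0] / c[1]).collect())   # e.g. [('A', 4.0), ('B', 1.5)] (order may vary)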
6. sampleByKey
Samples the RDD; fractions specifies, per key, the probability of keeping each element with that key, and withReplacement controls whether sampling is done with replacement.
rdd = sc.parallelize([('A',1),('B',2),('C',3),('A',4),('D',5),('A',6),('A',3),('B',2)])
rdd2 = rdd.sampleByKey(withReplacement=False,
fractions={'A':0.3, 'B':0.5, 'C':0.2, 'D':1})
rdd2.collect()
[('A', 1), ('D', 5), ('A', 6)]
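Because the sample is random, a different subset can come back on each run (the output above is just one possible draw). sampleByKey also takes a seed argument when a reproducible sample is wanted; a small sketch:
rdd3 = rdd.sampleByKey(withReplacement=False,
                       fractions={'A':0.3, 'B':0.5, 'C':0.2, 'D':1},
                       seed=42)   # fixing the seed makes the draw repeatable
print(rdd3.collect())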
7. subtractByKey
Removes from one RDD every (key, value) pair whose key also appears in the other RDD (a set difference on keys).
rdd1 = sc.parallelize([('A',1),('B',2),('C',3),('A',4),('D',5),('A',6),('A',3),('B',2)])
rdd2 = sc.parallelize([('A',1),('B',2),('B',8),('E',10)])
rdd3 = rdd1.subtractByKey(rdd2)
rdd4 = rdd2.subtractByKey(rdd1)
print(rdd3.collect())
print(rdd4.collectAsMap())
[('D', 5), ('C', 3)]
{'E': 10}
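Related but distinct: subtract() removes only the exact (key, value) pairs that also appear in the other RDD, while subtractByKey() drops every pair whose key appears there at all. A small sketch with the same rdd1 and rdd2:
print(rdd1.subtract(rdd2).collect())
# e.g. [('C', 3), ('A', 4), ('D', 5), ('A', 6), ('A', 3)] (order may vary) -- the pairs ('A', 1) and ('B', 2) are gone, but the other 'A' pairs survive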
Source: CSDN
Author: zhuzuwei
Link: https://blog.csdn.net/zhuzuwei/article/details/104446388