PageRank
The Python version of PageRank
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""PageRank algorithm

author = PuLiming

Run:
    bin/spark-submit files/pagerank.py data/mllib/pagerank_data.txt 10
"""
from __future__ import print_function

import re
import sys
from operator import add

from pyspark import SparkConf, SparkContext


def compute_contribs(urls, rank):
    """Distribute a url's rank to its neighbors.

    Args:
        urls: the urls adjacent to the target url
        rank: the current rank of the target url

    Yields:
        (url, contribution): one neighbor url and its share of the rank
    """
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)


def split_url(url_line):
    """Split one line into a url pair.

    Args:
        url_line: one line of input, e.g. '1 2'

    Returns:
        (url, neighbor_url)
    """
    parts = re.split(r'\s+', url_line)
    return parts[0], parts[1]


def compute_pagerank(sc, url_data_file, iterations):
    """Compute the rank of each page.

    Args:
        sc: SparkContext
        url_data_file: input data file
        iterations: number of iterations

    Returns:
        status: 0 on success
    """
    # Read the url file: ['1 2', '1 3', '2 1', '3 1']
    lines = sc.textFile(url_data_file)

    # Build a pair RDD (url, neighbor_urls): [('1', ['2', '3']), ('2', ['1']), ('3', ['1'])]
    links = lines.map(split_url).distinct().groupByKey().mapValues(list).cache()

    # Initialize every url's rank to 1.0: [('1', 1.0), ('2', 1.0), ('3', 1.0)]
    # (derive ranks from the keys of links, so each url appears exactly once)
    ranks = links.map(lambda url_neighbors: (url_neighbors[0], 1.0))

    for _ in range(iterations):
        # join neighbor_urls with rank: (url, (neighbor_urls, rank)),
        # then contribute the current url's rank to each of its neighbors
        contribs = links.join(ranks).flatMap(
            lambda url_urls_rank: compute_contribs(url_urls_rank[1][0],
                                                   url_urls_rank[1][1]))
        # Sum all contributions per url, then apply the damping factor
        ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)

    for (link, rank) in ranks.collect():
        print("%s has rank %s." % (link, rank))

    return 0


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python pagerank.py <data.txt> <iterations>", file=sys.stderr)
        sys.exit(-1)

    # data file and number of iterations
    url_data_file = sys.argv[1]
    iterations = int(sys.argv[2])

    # configure the SparkContext
    conf = SparkConf().setAppName('PythonPageRank')
    conf.setMaster('local')
    sc = SparkContext(conf=conf)

    ret = compute_pagerank(sc, url_data_file, iterations)
    sys.exit(ret)
```
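For reference, the tiny link graph used in the comments above would look like this on disk (the contents match Spark's bundled `data/mllib/pagerank_data.txt`):

```
1 2
1 3
2 1
3 1
```

Running ten iterations over it with `bin/spark-submit files/pagerank.py data/mllib/pagerank_data.txt 10` prints one `... has rank ...` line per url.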
The Scala version of PageRank
```scala
val sc = new SparkContext(...)
val links = sc.objectFile[(String, Seq[String])]("links")
              .partitionBy(new HashPartitioner(100))
              .persist()
var ranks = links.mapValues(_ => 1.0)

// iterate 10 times
for (i <- 0 until 10) {
  val contributions = links.join(ranks).flatMap {
    case (pageId, (links, rank)) =>
      links.map(dest => (dest, rank / links.size))
  }
  ranks = contributions.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)
}

ranks.saveAsTextFile("ranks")
```
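This version reads an already-serialized adjacency list from an object file. A minimal sketch of how that file might be produced from the same whitespace-separated edge format as the Python example (the input path and this one-time preparation step are assumptions, not part of the original):

```scala
// One-time preparation (hypothetical): build (url, neighbors) pairs
// from an edge list and serialize them for the PageRank job above.
val edges = sc.textFile("data/mllib/pagerank_data.txt")  // assumed input path
  .map { line =>
    val parts = line.split("\\s+")
    (parts(0), parts(1))
  }
  .distinct()
  .groupByKey()          // RDD[(String, Iterable[String])]
  .mapValues(_.toSeq)    // match the (String, Seq[String]) type read above
edges.saveAsObjectFile("links")
```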
Advantages of this Scala version of PageRank:

- `links` is joined with `ranks` on every iteration, so it is partitioned up front with `partitionBy`; the joins then no longer shuffle it across the network, saving considerable network communication overhead.
- `links` is `persist`ed, so it stays in memory and is reused by every iteration instead of being recomputed.
- `ranks` is first created with `mapValues`, which preserves the parent RDD's partitioning, so even the first join is cheap.
- The output of `reduceByKey` is already hash-partitioned; applying `mapValues` to it preserves that partitioning (see the sketch after this list), so the next join with `links` is again efficient.
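The `mapValues`-preserves-partitioning point is easy to verify: a pair RDD exposes its partitioner via `rdd.partitioner`, and `mapValues` keeps it while a plain `map` drops it. A minimal sketch, assuming an existing `sc` as above:

```scala
import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
  .partitionBy(new HashPartitioner(4))

// mapValues cannot change keys, so Spark keeps the partitioner:
// a later join on this RDD needs no shuffle.
println(pairs.mapValues(_ + 1).partitioner)                   // Some(HashPartitioner@...)

// map may change keys, so Spark conservatively drops the partitioner.
println(pairs.map { case (k, v) => (k, v + 1) }.partitioner)  // None
```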