Question
This is a continuation of my earlier post, where I was advised to open a new question.
My goal is to compute connected component ids in PySpark (preferred) without using GraphFrames, as my enterprise Spark setup does not have that library. I was advised to use the code at this link:
https://mlwhiz.com/blog/2018/12/07/connected_components/#connected-components-in-pyspark
I have the graph's edges as a list of tuples and am using the code below to create the adjacency list:
from itertools import groupby
edges = [(1, 2), (2, 3), (1, 3)]
adj = {k: [v[1] for v in g] for k, g in groupby(sorted(edges), lambda e: e[0])}
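For the example edges above this yields (each edge's second node is grouped under its first):
adj == {1: [2, 3], 2: [3]}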
Now, when I run the code below:
def create_edges(line):
    a = [int(x) for x in line.split(" ")]  # nodes on one whitespace-separated line
    edges_list = []
    for i in range(0, len(a) - 1):
        for j in range(i + 1, len(a)):
            edges_list.append((a[i], a[j]))
            edges_list.append((a[j], a[i]))
    return edges_list
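# For example, create_edges("1 2 3") returns every ordered pair of distinct
# nodes on the line: [(1, 2), (2, 1), (1, 3), (3, 1), (2, 3), (3, 2)]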
# adj_list.txt is a text file containing the adjacency list of the graph.
adjacency_list = sc.parallelize([adj])
edges_rdd = adjacency_list.flatMap(lambda line: create_edges(line)).distinct()
def largeStarInit(record):
    a, b = record
    yield (a, b)
    yield (b, a)
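# largeStarInit emits each edge in both directions so that, after groupByKey,
# every node is paired with its full neighbourhood.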
def largeStar(record):
    a, b = record
    t_list = list(b)
    t_list.append(a)
    list_min = min(t_list)
    for x in b:
        if a < x:
            yield (x, list_min)
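# largeStar links every neighbour larger than a to the smallest node in a's
# neighbourhood (including a itself), e.g. largeStar((2, [1, 3])) yields (3, 1).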
def smallStarInit(record):
    a, b = record
    if b <= a:
        yield (a, b)
    else:
        yield (b, a)
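# smallStarInit re-orients each edge so the larger endpoint becomes the key,
# e.g. smallStarInit((1, 3)) yields (3, 1).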
def smallStar(record):
    a, b = record
    t_list = list(b)
    t_list.append(a)
    list_min = min(t_list)
    for x in t_list:
        if x != list_min:
            yield (x, list_min)
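# smallStar connects every node in the neighbourhood (and a itself) to the
# neighbourhood minimum, e.g. smallStar((3, [1, 2])) yields (2, 1) and (3, 1).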
# Handle the case of single (isolated) nodes
def single_vertex(line):
    a = [int(x) for x in line.split(" ")]
    edges_list = []
    if len(a) == 1:
        edges_list.append((a[0], a[0]))
    return edges_list
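# An isolated node becomes its own component id,
# e.g. single_vertex("5") returns [(5, 5)].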
iteration_num = 0
while True:
    if iteration_num == 0:
        print("iter", iteration_num)
        large_star_rdd = edges_rdd.groupByKey().flatMap(lambda x: largeStar(x))
        small_star_rdd = large_star_rdd.flatMap(lambda x: smallStarInit(x)).groupByKey().flatMap(lambda x: smallStar(x)).distinct()
        iteration_num += 1
    else:
        print("iter", iteration_num)
        large_star_rdd = small_star_rdd.flatMap(lambda x: largeStarInit(x)).groupByKey().flatMap(lambda x: largeStar(x)).distinct()
        small_star_rdd = large_star_rdd.flatMap(lambda x: smallStarInit(x)).groupByKey().flatMap(lambda x: smallStar(x)).distinct()
        iteration_num += 1
    # Check convergence: stop when large-star and small-star emit the same edge set.
    changes = (large_star_rdd.subtract(small_star_rdd).union(small_star_rdd.subtract(large_star_rdd))).collect()
    if len(changes) == 0:
        break
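# After convergence, each (node, min_node) pair maps a vertex to the smallest
# node id in its component; that minimum serves as the component id.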
single_vertex_rdd = adjacency_list.flatMap(lambda line: single_vertex(line)).distinct()
answer = single_vertex_rdd.collect() + large_star_rdd.collect()
print(answer[:10])
I get the following error:
iter 0
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-12-a6ca174b9651> in <module>
68 #check Convergence
69
---> 70 changes = (large_star_rdd.subtract(small_star_rdd).union(small_star_rdd.subtract(large_star_rdd))).collect()
71 if len(changes) == 0 :
72 break
C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py in collect(self)
814 """
815 with SCCallSiteSync(self.context) as css:
--> 816 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
817 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
818
C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 0.0 failed 1 times, most recent failure: Lost task 3.0 in stage 0.0 (TID 3, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 377, in main
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 372, in process
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 352, in func
return f(iterator)
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 1861, in combineLocally
merger.mergeValues(iterator)
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py", line 238, in mergeValues
for k, v in iterator:
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-12-a6ca174b9651>", line 15, in <lambda>
File "<ipython-input-12-a6ca174b9651>", line 2, in create_edges
AttributeError: 'dict' object has no attribute 'split'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 377, in main
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 372, in process
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 352, in func
return f(iterator)
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 1861, in combineLocally
merger.mergeValues(iterator)
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py", line 238, in mergeValues
for k, v in iterator:
File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-12-a6ca174b9651>", line 15, in <lambda>
File "<ipython-input-12-a6ca174b9651>", line 2, in create_edges
AttributeError: 'dict' object has no attribute 'split'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
... 1 more
I am unable to resolve the AttributeError: 'dict' object has no attribute 'split'.
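Based on the adj_list.txt comment in the code above, the original pipeline appears to expect each RDD element to be a whitespace-separated string such as "1 2 3" (a node followed by its neighbours), whereas my adjacency_list RDD contains a single dict. For reference, a minimal sketch of serializing the adj dict into that line format (the to_lines helper is my own, not from the blog):
def to_lines(adj_dict):
    # One line per node: the node id followed by its neighbours, space-separated.
    return ["%s %s" % (k, " ".join(str(v) for v in vs)) for k, vs in adj_dict.items()]
print(to_lines({1: [2, 3], 2: [3]}))  # ['1 2 3', '2 3']
adjacency_list = sc.parallelize(to_lines(adj))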
Source: https://stackoverflow.com/questions/57085669/how-to-get-connected-components-in-graph-analysis-without-using-graphframes-pysp