How to get connected components in graph analysis without using graphframes pyspark library?


Question


This is a continuation of my earlier post; I was advised to open a separate question.

My goal is to compute connected-component ids in PySpark (preferred) without using graphframes, because my enterprise Spark setup does not have that library. I was advised to use the code at this link:

https://mlwhiz.com/blog/2018/12/07/connected_components/#connected-components-in-pyspark

I have the graph's edges as a list of tuples, and I am using the code below to create the adjacency list:

from itertools import groupby

edges = [(1, 2), (2, 3), (1, 3)]

adj = {k: [v[1] for v in g] for k, g in groupby(sorted(edges), lambda e: e[0])}
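For reference, on the sample edges this evaluates to a one-directional adjacency dictionary (only the first element of each tuple becomes a key):

print(adj)  # {1: [2, 3], 2: [3]}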

Now, when I run the code below:

def create_edges(line):
    a = [int(x) for x in line.split(" ")]
    edges_list=[]

    for i in range(0, len(a)-1):
        for j in range(i+1 ,len(a)):
            edges_list.append((a[i],a[j]))
            edges_list.append((a[j],a[i]))
    return edges_list
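# For illustration (a sample input of my own, not from the original post):
#   create_edges("1 2 3")
#   -> [(1, 2), (2, 1), (1, 3), (3, 1), (2, 3), (3, 2)]
# i.e. every pair of ids on a space-separated line, in both directions.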

# adj_list.txt is a text file containing the adjacency list of the graph.
adjacency_list = sc.parallelize([adj])

edges_rdd = adjacency_list.flatMap(lambda line : create_edges(line)).distinct()
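# Note: every record of adjacency_list is passed straight into create_edges,
# so each record needs to be a space-separated string of node ids for
# line.split(" ") to work.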

# Emit both orientations of every edge so that groupByKey sees each node's full neighbourhood.
def largeStarInit(record):
    a, b = record
    yield (a,b)
    yield (b,a)

# Large-star step: link every neighbour larger than a to the smallest node in a's neighbourhood (including a itself).
def largeStar(record):
    a, b = record
    t_list = list(b)
    t_list.append(a)
    list_min = min(t_list)
    for x in b:
        if a < x:
            yield (x,list_min)

# Orient each edge so the larger endpoint becomes the key and the smaller one the value.
def smallStarInit(record):
    a, b = record
    if b<=a:
        yield (a,b)
    else:
        yield (b,a)

# Small-star step: link a and all of its neighbours (except the minimum itself) to the neighbourhood minimum.
def smallStar(record):
    a, b = record
    t_list = list(b)
    t_list.append(a)
    list_min = min(t_list)
    for x in t_list:
        if x!=list_min:
            yield (x,list_min)

# Handle isolated nodes: emit a self-loop (n, n) so a node with no edges still gets a component id.
def single_vertex(line):
    a = [int(x) for x in line.split(" ")]
    edges_list=[]
    if len(a)==1:
        edges_list.append((a[0],a[0]))
    return edges_list

iteration_num =0
while 1==1:
    if iteration_num==0:
        print ("iter", iteration_num)
        large_star_rdd = edges_rdd.groupByKey().flatMap(lambda x : largeStar(x))
        small_star_rdd = large_star_rdd.flatMap(lambda x : smallStarInit(x)).groupByKey().flatMap(lambda x : smallStar(x)).distinct()
        iteration_num += 1

    else:
        print ("iter", iteration_num)
        large_star_rdd = small_star_rdd.flatMap(lambda x: largeStarInit(x)).groupByKey().flatMap(lambda x : largeStar(x)).distinct()
        small_star_rdd = large_star_rdd.flatMap(lambda x : smallStarInit(x)).groupByKey().flatMap(lambda x : smallStar(x)).distinct()
        iteration_num += 1
    # Check convergence: stop once large-star and small-star emit the same set of pairs

    changes = (large_star_rdd.subtract(small_star_rdd).union(small_star_rdd.subtract(large_star_rdd))).collect()
    if len(changes) == 0 :
        break

single_vertex_rdd = adjacency_list.flatMap(lambda line : single_vertex(line)).distinct()

answer = single_vertex_rdd.collect() + large_star_rdd.collect()
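# As I understand the linked post, answer should end up holding
# (node, smallest id in that node's component) pairs, plus (n, n)
# self-loops for isolated nodes.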

print (answer[:10])
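If I am following the linked algorithm correctly, the expected output for the toy edge list above would map each node to the smallest id in its component, i.e. something like [(2, 1), (3, 1)].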

Instead, I get the following error:

iter 0
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-12-a6ca174b9651> in <module>
     68     #check Convergence
     69
---> 70     changes = (large_star_rdd.subtract(small_star_rdd).union(small_star_rdd.subtract(large_star_rdd))).collect()
     71     if len(changes) == 0 :
     72         break

C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py in collect(self)
    814         """
    815         with SCCallSiteSync(self.context) as css:
--> 816             sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
    817         return list(_load_from_socket(sock_info, self._jrdd_deserializer))
    818

C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258
   1259         for temp_arg in temp_args:

C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 0.0 failed 1 times, most recent failure: Lost task 3.0 in stage 0.0 (TID 3, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 377, in main
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 372, in process
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 352, in func
    return f(iterator)
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 1861, in combineLocally
    merger.mergeValues(iterator)
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py", line 238, in mergeValues
    for k, v in iterator:
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-12-a6ca174b9651>", line 15, in <lambda>
  File "<ipython-input-12-a6ca174b9651>", line 2, in create_edges
AttributeError: 'dict' object has no attribute 'split'

    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
    at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
    at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
    at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
    at java.lang.reflect.Method.invoke(Unknown Source)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 377, in main
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 372, in process
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 2499, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 352, in func
    return f(iterator)
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\rdd.py", line 1861, in combineLocally
    merger.mergeValues(iterator)
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py", line 238, in mergeValues
    for k, v in iterator:
  File "C:\opt\spark\spark-2.4.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-12-a6ca174b9651>", line 15, in <lambda>
  File "<ipython-input-12-a6ca174b9651>", line 2, in create_edges
AttributeError: 'dict' object has no attribute 'split'

    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
    at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    ... 1 more

I am unable to resolve the AttributeError: 'dict' object has no attribute 'split'.
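For reference, a minimal check of what the RDD actually contains (my own inspection sketch, not part of the linked code):

print(adjacency_list.take(1))
# e.g. [{1: [2, 3], 2: [3]}] -- a single dict record rather than
# space-separated strings, which is what create_edges ends up receiving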

Source: https://stackoverflow.com/questions/57085669/how-to-get-connected-components-in-graph-analysis-without-using-graphframes-pysp
