Question
I have a cluster with 4 nodes: 3 Spark nodes and 1 Solr node. The CPUs are 8-core, the memory is 32 GB, and the disks are SSDs. I use Cassandra as my database. After 6 hours I have about 22 GB of data, roughly 3.4 million rows, which should be read in under 5 minutes.
But it already cannot complete the task in that amount of time. My future goal is to read 100 million rows in under 5 minutes. I am not sure what I can increase or do better to reach the current target, let alone the future one. Is that even possible, or would it be better to use Spark for the real-time analysis and something like Hadoop for longer-tail data (older than 1 day, or a couple of hours)? Thanks a lot! Here is my Spark app code:
import sys
import json
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime, timedelta
from dateutil.parser import parse
from cassandra.cluster import Cluster
import pytz
from dateutil.tz import tzutc
tz = pytz.timezone('')
appname = str(sys.argv[1])
source = str(sys.argv[2])
cluster = Cluster(['localhost'])
session_statis = cluster.connect('keyspace')
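# Helpers to parse Kafka message payloads; they return 0 on malformed JSON so downstream filters can drop it.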
def read_json(x):
    try:
        y = json.loads(x)
    except:
        y = 0
    return y
def TransformInData(x):
    try:
        body = json.loads(x['body'])
        return (body['articles'])
    except:
        return 0
def axesTransformData(x):
    try:
        body = json.loads(x['body'])
        return (body)
    except:
        return 0
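# Subtract articles that are already stored, then append the new ones to several Cassandra tables via the Spark-Cassandra connector.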
def storeDataToCassandra(rdd):
    rdd_cassandra = rdd.map(lambda x:(x[0],(x[0],x[1]['thumbnail'], x[1]['title'], x[1]['url'], datetime.strptime(parse(x[1]['created_at']).strftime('%Y-%m-%d %H:%M:%S'), "%Y-%m-%d %H:%M:%S"),source, x[1]['category'] if x[1]['category'] else '', x[1]['channel'],x[1]['genre']))) \
                        .subtract(articles)
    rdd_article = rdd_cassandra.map(lambda x:Row(id=x[1][0],source=x[1][5],thumbnail=x[1][1],title=x[1][2],url=x[1][3],created_at=x[1][4],category=x[1][6],channel=x[1][7],genre=x[1][8]))
    rdd_schedule = rdd_cassandra.map(lambda x:Row(source=x[1][5],type='article',scheduled_for=x[1][4]+timedelta(minutes=5),id=x[1][0]))
    rdd_article_by_created_at = rdd_cassandra.map(lambda x:Row(source=x[1][5],created_at=x[1][4],article=x[1][0]))
    rdd_article_by_url = rdd_cassandra.map(lambda x:Row(url=x[1][3],article=x[1][0]))
    if rdd_article.count()>0:
        result_rdd_article = sqlContext.createDataFrame(rdd_article)
        result_rdd_article.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if rdd_schedule.count()>0:
        result_rdd_schedule = sqlContext.createDataFrame(rdd_schedule)
        result_rdd_schedule.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if rdd_article_by_created_at.count()>0:
        result_rdd_article_by_created_at = sqlContext.createDataFrame(rdd_article_by_created_at)
        result_rdd_article_by_created_at.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if rdd_article_by_url.count()>0:
        result_rdd_article_by_url = sqlContext.createDataFrame(rdd_article_by_url)
        result_rdd_article_by_url.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
#
def axesStoreToCassandra(rdd):
    axes_rdd = rdd.map(lambda x:Row(article=x[1]['id'],at=datetime.now(),comments=x[1]['comments'],likes=x[1]['attitudes'],reads=0,shares=x[1]['reposts']))
    if axes_rdd.count()>0:
        result_axes_rdd = sqlContext.createDataFrame(axes_rdd)
        result_axes_rdd.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
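# Join articles with their channels, derive a share-growth 'speed' from the two most recent axes samples per article, and refresh the 1/3/6/12/24-hour statistics tables.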
def joinstream(rdd):
    article_channels = articlestat.join(channels).map(lambda x:(x[1][0]['id'],{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'genre':x[1][0]['genre'],'category':x[1][1]['category'],'author':x[1][1]['author']}))
    speed_rdd = axes.map(lambda x:(x.article,[[x.at,x.comments,x.likes,x.reads,x.shares]])) \
        .reduceByKey(lambda x,y:x+y) \
        .map(lambda x:(x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],sorted(x[1],key=lambda y:y[0],reverse = True)[1]) if len(x[1])>=2 else (x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],[sorted(x[1],key=lambda y:y[0],reverse = True)[0][0]-timedelta(seconds=300),0,0,0,0])) \
        .filter(lambda x:(x[1][0]-x[2][0]).seconds>0) \
        .map(lambda x:(x[0],{'id':x[0],'comments':x[1][1],'likes':x[1][2],'reads':x[1][3],'shares':x[1][4],'speed':int(5*288*((x[1][4]-x[2][4])/((x[1][0]-x[2][0]).seconds/60.0)))})) \
        .filter(lambda x:x[1]['speed']>=0) \
        .filter(lambda x:x[1]['shares']>0)
    statistics = article_channels.join(speed_rdd) \
        .map(lambda x:{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][0]['author'],'genre':x[1][0]['genre'],'comments':x[1][1]['comments'],'likes':x[1][1]['likes'],'reads':x[1][1]['reads'],'shares':x[1][1]['shares'],'speed':x[1][1]['speed']})
    timeone = datetime.now()-timedelta(hours=1)
    timethree = datetime.now()-timedelta(hours=3)
    timesix = datetime.now()-timedelta(hours=6)
    timetwelve = datetime.now()-timedelta(hours=12)
    timetwentyfour = datetime.now()-timedelta(hours=24)
    result1 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timeone).map(lambda x:Row(timespan='1',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result3 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timethree and x['created_at']+timedelta(hours=8)<=timeone).map(lambda x:Row(timespan='3',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result6 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timesix and x['created_at']+timedelta(hours=8)<=timethree).map(lambda x:Row(timespan='6',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result12 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwelve and x['created_at']+timedelta(hours=8)<=timesix).map(lambda x:Row(timespan='12',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result24 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwentyfour and x['created_at']+timedelta(hours=8)<=timetwelve).map(lambda x:Row(timespan='24',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    if result1.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'1'))
        resultschema1 = sqlContext.createDataFrame(result1)
        resultschema1.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result3.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'3'))
        resultschema3 = sqlContext.createDataFrame(result3)
        resultschema3.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result6.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'6'))
        resultschema6 = sqlContext.createDataFrame(result6)
        resultschema6.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result12.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'12'))
        resultschema12 = sqlContext.createDataFrame(result12)
        resultschema12.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result24.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'24'))
        resultschema24 = sqlContext.createDataFrame(result24)
        resultschema24.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
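# Driver setup: 30-second streaming batches plus Cassandra-backed lookup RDDs used by the functions above.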
conf = SparkConf().setAppName(appname)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc,30)
sqlContext = SQLContext(sc)
channels = sc.cassandraTable("keyspace","tablename").map(lambda x:(x.id,{'author':x.name,'category':x.category}))
articles = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.id,(x.id,x.thumbnail,x.title,x.url,x.created_at+timedelta(hours=8),source,x.category,x.channel,x.genre)))
articlestat = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.channel,{'id':x.id,'thumbnail':x.thumbnail,'title':x.title,'url':x.url,'created_at':x.created_at,'source':x.source,'category':x.category,'channel':x.channel,'genre':x.genre}))
axes = sc.cassandraTable('keyspace','tablename')
topic = 'topic1'
kafkaParams = {"metadata.broker.list": "localhost:9092"}
article_stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
article_join_stream=article_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(x['id'].encode("utf-8") ,x))
article_join_stream.transform(storeDataToCassandra).pprint()
axes_topic = 'topic2'
axes_stream = KafkaUtils.createDirectStream(ssc, [axes_topic], kafkaParams)
axes_join_stream = axes_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(str(x['id']),x))
axes_join_stream.transform(axesStoreToCassandra).pprint()
statistics = article_join_stream.map(lambda x:(x[0])).window(15*60,15*60)
statistics.transform(joinstream).pprint()
ssc.start()
ssc.awaitTermination()
EDIT: This is the stage that seems to consume the most time. Any thoughts on that?
Answer 1:
At first glance it seems that you just start your application with "spark-submit <your application>".
That means you are using the default allocation of memory and CPUs for your application (about 1 CPU and 512 MB of RAM in most default cases).
This assumes you are using YARN, since you don't provide any information about that.
Start your application with appropriate resources and you'll see improvements.
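For example, here is a rough sketch of requesting resources explicitly in the asker's SparkConf. The numbers are illustrative assumptions for 8-core / 32 GB workers, not recommendations, and spark.executor.instances only takes effect on YARN:

# Illustrative values only -- tune them for your own cluster.
conf = (SparkConf()
        .setAppName(appname)
        .set("spark.executor.instances", "3")   # e.g. one executor per Spark node (YARN)
        .set("spark.executor.cores", "6")       # leave some cores for the OS and other services
        .set("spark.executor.memory", "8g")
        .set("spark.driver.memory", "4g"))
sc = SparkContext(conf=conf)

The same values can be passed on the command line instead, via --num-executors, --executor-cores, --executor-memory and --driver-memory.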
Edit:
I also see you are using a lot of lambdas, and those need to be serialized.
Be aware that when you reference an object inside a lambda, you ship the full object every time, i.e. you are serializing the whole object behind this.value and not just value.
To fix this, you could copy it into a local variable, _value = this.value, and use that instead.
This might give you a speedup.
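In PySpark terms, a minimal sketch of the difference (a made-up Scorer class, not code from the question):

class Scorer(object):
    def __init__(self, factor):
        self.factor = factor

    def slow(self, rdd):
        # Referencing self inside the lambda pulls the whole Scorer object into the
        # closure, so it gets pickled and shipped to the executors with every task.
        return rdd.map(lambda x: x * self.factor)

    def fast(self, rdd):
        # Copy the value into a local variable first; only the plain number is serialized.
        factor = self.factor
        return rdd.map(lambda x: x * factor)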
Source: https://stackoverflow.com/questions/35889361/why-is-my-spark-streaming-app-so-slow