问题
I have a function which I want to run on around 3 million data points. I am trying to parallelise the function using `mcmapply` on an Ubuntu machine with 8 cores. The function takes in a list of length 3 million as well as 3 more vectors of length 3 million and 1 constant value, `cutoffyearmon`.
The code runs perfectly fine with 100000 rows of data within 2 minutes on a single core and throws no error. However, when I try to run the code in parallel on 6 cores of my machine using `mcmapply`, it keeps on running for more than 5 hours.
UPDATE: This is a watered-down version of my function call. There are 9 more variables which I create for time durations of 1 month, 2 months and 3 months. I have kept only the variables for the 6-month and 1-year durations here.
I am using the following function call:
# Summarise the leading codes of one history for a single look-back horizon.
#
# values          numeric vector of codes, in the order they appear in the
#                 history string.
# daysdiff        days between the cutoff date and the record date.
# horizon_days    look-back horizon in days (180 or 365).
# horizon_months  the same horizon expressed in months (6 or 12).
# months_elapsed  whole months between the record month and the cutoff month.
#
# Returns a named numeric vector with `worst` (maximum code), `any` (number of
# codes > 0), `ge30` (number of codes >= 30) and `ge60` (number of codes >= 60),
# computed over the first `horizon_months - months_elapsed` codes. All four are
# 0 when `daysdiff` exceeds `horizon_days` or when the window is empty.
abc_window_stats <- function(values, daysdiff, horizon_days, horizon_months,
                             months_elapsed) {
  nothing <- c(worst = 0, any = 0, ge30 = 0, ge60 = 0)
  if (daysdiff > horizon_days) {
    return(nothing)
  }
  # Clamp the window to [0, length(values)]. The previous `1:n` indexing
  # silently selected the first code when n was 0 (1:0 is c(1, 0)) and failed
  # for negative n.
  n_keep <- min(length(values), max(horizon_months - months_elapsed, 0))
  if (n_keep < 1) {
    return(nothing)
  }
  recent <- values[seq_len(n_keep)]
  # na.rm = TRUE mirrors length(which(...)), which never counts NA.
  c(
    worst = max(recent),
    any = sum(recent > 0, na.rm = TRUE),
    ge30 = sum(recent >= 30, na.rm = TRUE),
    ge60 = sum(recent >= 60, na.rm = TRUE)
  )
}

# Compute the per-record statistics from one fixed-width history string.
#
# abcstrnew      character scalar made of consecutive 3-character numeric codes.
# sd             record date (anything as.yearmon() accepts).
# naflag         TRUE when the history is missing.
# empflag        TRUE when the history is empty.
# daysdiff       days between the cutoff date and `sd`.
# cutoffyearmon  cutoff month as a yearmon value.
#
# Returns a list of ten doubles (same names and order for every record). Every
# element is NA_real_ when `daysdiff` is NA, when either flag is TRUE or NA, or
# when the string is too short to hold a single code.
abc_history_stats <- function(abcstrnew, sd, naflag, empflag, daysdiff,
                              cutoffyearmon) {
  no_stats <- list(
    worst_abc_ever = NA_real_,
    times_abc = NA_real_,
    times_abc_last_180_days = NA_real_,
    times_abc_last_365_days = NA_real_,
    times_abc30_last_365_days = NA_real_,
    times_abc30_last_180_days = NA_real_,
    times_abc60_last_365_days = NA_real_,
    times_abc60_last_180_days = NA_real_,
    abc_last_180_days = NA_real_,
    abc_last_365_days = NA_real_
  )

  # isTRUE() makes an NA flag behave like "no usable history" instead of
  # raising an error inside a worker, which would discard that worker's whole
  # pre-scheduled chunk.
  has_history <- isTRUE(!naflag && !empflag)
  if (is.na(daysdiff) || !has_history) {
    return(no_stats)
  }

  n_chars <- nchar(abcstrnew)
  if (is.na(n_chars) || n_chars < 3) {
    # seq() below would fail; treat as no usable history.
    return(no_stats)
  }

  # Cut the string into consecutive 3-character chunks and parse them.
  codes <- as.numeric(substring(
    text = abcstrnew,
    first = seq(from = 1, to = n_chars - 2, by = 3),
    last = seq(from = 3, to = n_chars, by = 3)
  ))

  # The month offset is needed by both horizons, so compute it once and only
  # when some horizon can apply (beyond 365 days both windows are all zero).
  months_elapsed <- if (daysdiff > 365) {
    NA_real_
  } else {
    elapsed_days <- as.numeric(
      difftime(time1 = cutoffyearmon, time2 = as.yearmon(sd), units = "days")
    )
    round(round(elapsed_days) / 30)
  }

  last_365 <- abc_window_stats(codes, daysdiff, 365, 12, months_elapsed)
  last_180 <- abc_window_stats(codes, daysdiff, 180, 6, months_elapsed)

  list(
    worst_abc_ever = max(codes),
    times_abc = as.numeric(sum(codes > 0, na.rm = TRUE)),
    times_abc_last_180_days = last_180[["any"]],
    times_abc_last_365_days = last_365[["any"]],
    times_abc30_last_365_days = last_365[["ge30"]],
    times_abc30_last_180_days = last_180[["ge30"]],
    times_abc60_last_365_days = last_365[["ge60"]],
    times_abc60_last_180_days = last_180[["ge60"]],
    abc_last_180_days = last_180[["worst"]],
    abc_last_365_days = last_365[["worst"]]
  )
}

# `cutoffyearmon` is a constant, so it goes through MoreArgs instead of being
# recycled to the full input length and subset once per record.
abc_xx_last_xxx_days <- mcmapply(
  abc_history_stats,
  lst, sd, naflag, empflag, daysdiff,
  MoreArgs = list(cutoffyearmon = cutoffyearmon),
  mc.cores = 6, mc.preschedule = TRUE, mc.cleanup = TRUE
)
You can use the following set of inputs to run the function and check its output.
# Example inputs: two records, neither missing nor empty.
# as.yearmon() is not base R; it is provided by the zoo package.
lst <- list("000050000032", "000000340000000000000")
sd <- as.Date(c("2017-05-22", "2017-04-23"))
empflag <- c(FALSE, FALSE)
naflag <- c(FALSE, FALSE)
# Ask for days explicitly. With the default units = "auto", difftime() picks
# the unit from the smallest absolute difference, so a single record dated on
# the cutoff day would switch the whole vector to seconds and break the
# comparisons against 180 and 365.
daysdiff <- difftime(time1 = as.Date("2017-06-30"), time2 = sd, units = "days")
cutoffyearmon <- as.yearmon("2017-06-30")
I am assuming the code will divide the data almost equally between the 6 cores because I set `mc.preschedule=TRUE`. But I am not able to see any significant improvement in processing speed. I was expecting the processing to complete in around 1.5 hours when run on 6 cores of the machine.
Any suggestions on what I might have missed?
When using `pbmcmapply` with `mc.cores=6`, I am getting an ETA of 06:01:32:57.
来源:https://stackoverflow.com/questions/51475300/mcmapply-performance-on-multiple-cores