I love the setting .progress = \'text\'
in plyr\'s
llply
. However, it causes my much anxiety to not know how far along an mclapp
Based on the answer of @fotNelson, using a progress bar instead of line by line printing and calling an external function with mclapply.
library('utils')
library('multicore')
prog.indic <- local({ #evaluates in local environment only
f <- fifo(tempfile(), open="w+b", blocking=T) # open fifo connection
assign(x='f',value=f,envir=.GlobalEnv)
pb <- txtProgressBar(min=1, max=MC,style=3)
if (inherits(fork(), "masterProcess")) { #progress tracker
# Child
progress <- 0.0
while (progress < MC && !isIncomplete(f)){
msg <- readBin(f, "double")
progress <- progress + as.numeric(msg)
# Updating the progress bar.
setTxtProgressBar(pb,progress)
}
exit()
}
MC <- 100
result <- mclapply(1:MC, .mcfunc)
cat('\n')
assign(x='result',value=result,envir=.GlobalEnv)
close(f)
})
.mcfunc<-function(i,...){
writeBin(1, f)
return(i)
}
Assigning the fifo connection to the .GlobalEnv is necessary to use it from a function outside of the mclapply call. Thanks for the questions and the previous replies, I had been wondering how to do this for a while.
Essentially adding another version of @fotNelson's solution but with some modifications:
parallel
rather than multicore
which has now been removed from CRANHope this helps someone...
library(parallel)
#-------------------------------------------------------------------------------
#' Wrapper around mclapply to track progress
#'
#' Based on http://stackoverflow.com/questions/10984556
#'
#' @param X a vector (atomic or list) or an expressions vector. Other
#' objects (including classed objects) will be coerced by
#' ‘as.list’
#' @param FUN the function to be applied to
#' @param ... optional arguments to ‘FUN’
#' @param mc.preschedule see mclapply
#' @param mc.set.seed see mclapply
#' @param mc.silent see mclapply
#' @param mc.cores see mclapply
#' @param mc.cleanup see mclapply
#' @param mc.allow.recursive see mclapply
#' @param mc.progress track progress?
#' @param mc.style style of progress bar (see txtProgressBar)
#'
#' @examples
#' x <- mclapply2(1:1000, function(i, y) Sys.sleep(0.01))
#' x <- mclapply2(1:3, function(i, y) Sys.sleep(1), mc.cores=1)
#'
#' dat <- lapply(1:10, function(x) rnorm(100))
#' func <- function(x, arg1) mean(x)/arg1
#' mclapply2(dat, func, arg1=10, mc.cores=2)
#-------------------------------------------------------------------------------
mclapply2 <- function(X, FUN, ...,
mc.preschedule = TRUE, mc.set.seed = TRUE,
mc.silent = FALSE, mc.cores = getOption("mc.cores", 2L),
mc.cleanup = TRUE, mc.allow.recursive = TRUE,
mc.progress=TRUE, mc.style=3)
{
if (!is.vector(X) || is.object(X)) X <- as.list(X)
if (mc.progress) {
f <- fifo(tempfile(), open="w+b", blocking=T)
p <- parallel:::mcfork()
pb <- txtProgressBar(0, length(X), style=mc.style)
setTxtProgressBar(pb, 0)
progress <- 0
if (inherits(p, "masterProcess")) {
while (progress < length(X)) {
readBin(f, "double")
progress <- progress + 1
setTxtProgressBar(pb, progress)
}
cat("\n")
parallel:::mcexit()
}
}
tryCatch({
result <- mclapply(X, ..., function(...) {
res <- FUN(...)
if (mc.progress) writeBin(1, f)
res
},
mc.preschedule = mc.preschedule, mc.set.seed = mc.set.seed,
mc.silent = mc.silent, mc.cores = mc.cores,
mc.cleanup = mc.cleanup, mc.allow.recursive = mc.allow.recursive
)
}, finally = {
if (mc.progress) close(f)
})
result
}
The pbapply
package has implemented this for the general case (i.e. on Unix-like and Windows, also works with RStudio). Both pblapply
and pbsapply
have a cl
argument. From the documentation:
Parallel processing can be enabled through the
cl
argument.parLapply
is called whencl
is a ’cluster
’ object,mclapply
is called whencl
is an integer. Showing the progress bar increases the communication overhead between the main process and nodes / child processes compared to the parallel equivalents of the functions without the progress bar. The functions fall back to their original equivalents when the progress bar is disabled (i.e.getOption("pboptions")$type == "none"
dopb()
isFALSE
). This is the default wheninteractive()
ifFALSE
(i.e. called from command line R script).
If one does not supply cl
(or passes NULL
) the default non-parallel lapply
is used, also including a progress bar.
Here's a function based on @fotNelton's solution to apply wherever you would normally use mcapply.
mcadply <- function(X, FUN, ...) {
# Runs multicore lapply with progress indicator and transformation to
# data.table output. Arguments mirror those passed to lapply.
#
# Args:
# X: Vector.
# FUN: Function to apply to each value of X. Note this is transformed to
# a data.frame return if necessary.
# ...: Other arguments passed to mclapply.
#
# Returns:
# data.table stack of each mclapply return value
#
# Progress bar code based on https://stackoverflow.com/a/10993589
require(multicore)
require(plyr)
require(data.table)
local({
f <- fifo(tempfile(), open="w+b", blocking=T)
if (inherits(fork(), "masterProcess")) {
# Child
progress <- 0
print.progress <- 0
while (progress < 1 && !isIncomplete(f)) {
msg <- readBin(f, "double")
progress <- progress + as.numeric(msg)
# Print every 1%
if(progress >= print.progress + 0.01) {
cat(sprintf("Progress: %.0f%%\n", progress * 100))
print.progress <- floor(progress * 100) / 100
}
}
exit()
}
newFun <- function(...) {
writeBin(1 / length(X), f)
return(as.data.frame(FUN(...)))
}
result <- as.data.table(rbind.fill(mclapply(X, newFun, ...)))
close(f)
cat("Done\n")
return(result)
})
}
You can use your systems echo function to write from your workers, so simply add the following line to your function:
myfun <- function(x){
if(x %% 5 == 0) system(paste("echo 'now processing:",x,"'"))
dosomething(mydata[x])
}
result <- mclapply(1:10,myfun,mc.cores=5)
> now processing: 5
> now processing: 10
This will work if you pass an index e.g., so rather than passing a list of data, pass the index and extract the data in the worker function.
Due to the fact that mclapply
spawns multiple processes, one might want to use fifos, pipes, or even sockets. Now consider the following example:
library(multicore)
finalResult <- local({
f <- fifo(tempfile(), open="w+b", blocking=T)
if (inherits(fork(), "masterProcess")) {
# Child
progress <- 0.0
while (progress < 1 && !isIncomplete(f)) {
msg <- readBin(f, "double")
progress <- progress + as.numeric(msg)
cat(sprintf("Progress: %.2f%%\n", progress * 100))
}
exit()
}
numJobs <- 100
result <- mclapply(1:numJobs, function(...) {
# Dome something fancy here
# ...
# Send some progress update
writeBin(1/numJobs, f)
# Some arbitrary result
sample(1000, 1)
})
close(f)
result
})
cat("Done\n")
Here, a temporary file is used as fifo, and the main process forks a child whose only duty is to report the current progress. The main process continues by calling mclapply
where the expression (more precisely, the expression block) that is to be evaluated writes partial progress information to the fifo buffer by means of writeBin
.
As this is only a simple example, you'll probably have to adapt the whole output stuff to your needs. HTH!