问题
I am attempting to create an all in one parallel processing caret function for training caret models with different inputs. I want the function to be its own process independant of all other calls.
The function that I have developed so far seems to be reproducible with some models and not with others.
For example, below I train a gbm on the iris data set = fail to reproduce. Then train a rpart model = reproduce (aside from time difference). Is my function sound? Is it okay to specify the parallel processing within the function?
This question links to others in a small way: Fully reproducible parallel models using caret, Best function to compare caret model objects
library(caret)
library(recipes)
library(doParallel)
# recipe to be supplied
Recipe.Obj <- recipe(Sepal.Length ~ ., data = iris) %>%
step_zv(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_normalize(all_numeric(), -Sepal.Length)
# train control object
TC.Obj <- trainControl("cv", savePredictions = "all", summaryFunction = defaultSummary, returnResamp = "all")
# tune.grid values to be used
gbm.TG = expand.grid(n.trees = c(seq(50,200, by = 100)),
interaction.depth = c(1:2),
shrinkage = c(0.05,0.1),
n.minobsinnode = c(10,20))
# function for seed generation
generateTrainSeeds <- function(repeats, resampling_folds, nModels) {
#nmodels = nparams * ntunelength
vectorLength <- repeats * resampling_folds + 1
set.seed(0)
seeds <- vector(mode = "list", length = vectorLength)
# for resampling
for(i in 1:(vectorLength-1)) seeds[[i]]<- sample.int(n=1000, nModels)
# for final model
seeds[[vectorLength]] <- sample.int(1000, 1)
return (seeds)
}
# parallel processing model training function
SO.Parallel.Model.Train.Func <- function(Model,Recipe, TC, Training.Data.Set, Metric, Tune.Length = NULL, Tune.Grid = NULL) {
# determine number of tuning parameters
nTuneParameters <- nrow(modelLookup(Model))
resampling_folds <- TC$number
repeats <- ifelse(is.na(TC$repeats), 1, TC$repeats)
# specify number of models according to which argument (Tune.Length or Tune.Grid) is supplied
if(!missing("Tune.Length")){
nModels <- Tune.Length * nTuneParameters
} else if (!missing("Tune.Grid")){
nModels <- nrow(Tune.Grid)
}
# use function to generate seeds for reproducibility
seeds <- generateTrainSeeds(repeats, resampling_folds, nModels)
# allocate seeds to the trainControl() object
TC$seeds <- seeds
# establish parallel environment
cl <- makePSOCKcluster(detectCores()-3, autoStop = TRUE)
registerDoParallel(cl)
# run model if Tune.Length is supplied, otherwise do with Tune.Grid
if(!missing("Tune.Length")){
set.seed(0)
Model.Obj <- train(Recipe,
method = Model,
data = Training.Data.Set,
trControl = TC,
metric = Metric,
tuneLength = Tune.Length
)
} else if (!missing("Tune.Grid")){
set.seed(0)
Model.Obj <- train(Recipe,
method = Model,
data = Training.Data.Set,
trControl = TC,
metric = Metric,
tuneGrid = Tune.Grid
)
}
# stop parallel environment & return model created
stopCluster(cl)
registerDoSEQ()
return(Model.Obj)
}
# example 1
gbm.Model.1 <- SO.Parallel.Model.Train.Func(Model = "gbm",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Grid = gbm.TG)
# example 2
gbm.Model.2 <- SO.Parallel.Model.Train.Func(Model = "gbm",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Grid = gbm.TG)
# returns many differences with resampling performance
all.equal(gbm.Model.1, gbm.Model.2)
[1] "Component “results”: Component “RMSE”: Mean relative difference: 0.001501716"
[2] "Component “results”: Component “Rsquared”: Mean relative difference: 0.001225451"
[3] "Component “results”: Component “MAE”: Mean relative difference: 0.001895233"
[4] "Component “results”: Component “RMSESD”: Mean relative difference: 0.01187857"
[5] "Component “results”: Component “RsquaredSD”: Mean relative difference: 0.007913823"
[6] "Component “results”: Component “MAESD”: Mean relative difference: 0.01049994"
[7] "Component “pred”: Component “pred”: Mean relative difference: 0.003305424"
[8] "Component “resample”: Component “RMSE”: Mean relative difference: 0.00653162"
[9] "Component “resample”: Component “Rsquared”: Mean relative difference: 0.004632896"
[10] "Component “resample”: Component “MAE”: Mean relative difference: 0.00941898"
[11] "Component “times”: Component “everything”: Mean relative difference: 0.03904282"
[12] "Component “times”: Component “final”: Mean relative difference: 0.1666667"
# example 3
rpart.Model.3 <- SO.Parallel.Model.Train.Func(Model = "rpart",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Length = 3)
# example 4
rpart.Model.4 <- SO.Parallel.Model.Train.Func(Model = "rpart",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Length = 3)
# gives the same result aside from the times
all.equal(rpart.Model.3, rpart.Model.4)
来源:https://stackoverflow.com/questions/61514348/parallel-processing-within-a-function-with-caret-model