Parallel processing within a function with caret model

问题

I am attempting to create an all in one parallel processing caret function for training caret models with different inputs. I want the function to be its own process independant of all other calls.

The function that I have developed so far seems to be reproducible with some models and not with others.

For example, below I train a gbm on the iris data set = fail to reproduce. Then train a rpart model = reproduce (aside from time difference). Is my function sound? Is it okay to specify the parallel processing within the function?

This question links to others in a small way: Fully reproducible parallel models using caret, Best function to compare caret model objects

library(caret)
library(recipes)
library(doParallel)

# recipe to be supplied
Recipe.Obj <- recipe(Sepal.Length ~ ., data = iris) %>% 
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>% 
  step_normalize(all_numeric(), -Sepal.Length)

# train control object
TC.Obj <- trainControl("cv", savePredictions = "all", summaryFunction = defaultSummary, returnResamp = "all")

# tune.grid values to be used
gbm.TG = expand.grid(n.trees = c(seq(50,200, by = 100)),
                     interaction.depth = c(1:2),
                     shrinkage = c(0.05,0.1),
                     n.minobsinnode = c(10,20))

# function for seed generation
generateTrainSeeds <- function(repeats, resampling_folds, nModels) {
  #nmodels = nparams * ntunelength

  vectorLength <- repeats * resampling_folds + 1

  set.seed(0)
  seeds <- vector(mode = "list", length = vectorLength)

  # for resampling
  for(i in 1:(vectorLength-1)) seeds[[i]]<- sample.int(n=1000, nModels)

  # for final model
  seeds[[vectorLength]] <- sample.int(1000, 1)

  return (seeds)
}

# parallel processing model training function
SO.Parallel.Model.Train.Func <- function(Model,Recipe, TC, Training.Data.Set, Metric, Tune.Length = NULL, Tune.Grid = NULL) {

  # determine number of tuning parameters
  nTuneParameters <- nrow(modelLookup(Model))

  resampling_folds <- TC$number
  repeats <- ifelse(is.na(TC$repeats), 1, TC$repeats)

  # specify number of models according to which argument (Tune.Length or Tune.Grid) is supplied
  if(!missing("Tune.Length")){
    nModels <- Tune.Length * nTuneParameters
  } else if (!missing("Tune.Grid")){
    nModels <- nrow(Tune.Grid)
  }

  # use function to generate seeds for reproducibility
  seeds <- generateTrainSeeds(repeats, resampling_folds, nModels)

  # allocate seeds to the trainControl() object
  TC$seeds <- seeds

  # establish parallel environment
  cl <- makePSOCKcluster(detectCores()-3, autoStop = TRUE)
  registerDoParallel(cl)

  # run model if Tune.Length is supplied, otherwise do with Tune.Grid

  if(!missing("Tune.Length")){

    set.seed(0)
    Model.Obj <- train(Recipe,
                       method = Model,
                       data = Training.Data.Set,
                       trControl = TC,
                       metric = Metric,
                       tuneLength = Tune.Length
    )


  } else if (!missing("Tune.Grid")){

    set.seed(0)
    Model.Obj <- train(Recipe,
                       method = Model,
                       data = Training.Data.Set,
                       trControl = TC,
                       metric = Metric,
                       tuneGrid = Tune.Grid
    )   

  }

  # stop parallel environment & return model created
  stopCluster(cl)
  registerDoSEQ()

  return(Model.Obj)
}



# example 1
gbm.Model.1 <- SO.Parallel.Model.Train.Func(Model = "gbm",
                                        Recipe = Recipe.Obj,
                                        TC = TC.Obj,
                                        Training.Data.Set = iris,
                                        Metric = "RMSE",
                                        Tune.Grid = gbm.TG)

# example 2
gbm.Model.2 <- SO.Parallel.Model.Train.Func(Model = "gbm",
                                        Recipe = Recipe.Obj,
                                        TC = TC.Obj,
                                        Training.Data.Set = iris,
                                        Metric = "RMSE",
                                        Tune.Grid = gbm.TG)

# returns many differences with resampling performance
all.equal(gbm.Model.1, gbm.Model.2)

[1] "Component “results”: Component “RMSE”: Mean relative difference: 0.001501716"      
 [2] "Component “results”: Component “Rsquared”: Mean relative difference: 0.001225451"  
 [3] "Component “results”: Component “MAE”: Mean relative difference: 0.001895233"       
 [4] "Component “results”: Component “RMSESD”: Mean relative difference: 0.01187857"     
 [5] "Component “results”: Component “RsquaredSD”: Mean relative difference: 0.007913823"
 [6] "Component “results”: Component “MAESD”: Mean relative difference: 0.01049994"      
 [7] "Component “pred”: Component “pred”: Mean relative difference: 0.003305424"         
 [8] "Component “resample”: Component “RMSE”: Mean relative difference: 0.00653162"      
 [9] "Component “resample”: Component “Rsquared”: Mean relative difference: 0.004632896" 
[10] "Component “resample”: Component “MAE”: Mean relative difference: 0.00941898"       
[11] "Component “times”: Component “everything”: Mean relative difference: 0.03904282"   
[12] "Component “times”: Component “final”: Mean relative difference: 0.1666667" 

# example 3
rpart.Model.3 <- SO.Parallel.Model.Train.Func(Model = "rpart",
                                            Recipe = Recipe.Obj,
                                            TC = TC.Obj,
                                            Training.Data.Set = iris,
                                            Metric = "RMSE",
                                            Tune.Length = 3)

# example 4
rpart.Model.4 <- SO.Parallel.Model.Train.Func(Model = "rpart",
                                            Recipe = Recipe.Obj,
                                            TC = TC.Obj,
                                            Training.Data.Set = iris,
                                            Metric = "RMSE",
                                            Tune.Length = 3)

# gives the same result aside from the times
all.equal(rpart.Model.3, rpart.Model.4)

来源：https://stackoverflow.com/questions/61514348/parallel-processing-within-a-function-with-caret-model

标签

parallel-processing

r-caret

doparallel