问题
I would like to ask for help, please. I use the code below to fit an XGBoost model with the caret package. However, I want to use a validation split based on time rather than cross-validation: 60% training, 20% validation, 20% testing. I have already split the data, but I do not know how to handle the validation data when it is not used for cross-validation.
Thank you,
# Resampling scheme passed to caret::train(): 5-fold cross-validation.
# returnData = FALSE asks caret not to keep a copy of the training data in
# the fitted object.
xgb_trainControl <- trainControl(
  method = "cv",
  number = 5,
  returnData = FALSE
)

# Tuning grid with a single row: every xgbTree hyperparameter is fixed to one
# value, so exactly one candidate model is evaluated.
xgb_grid <- expand.grid(
  nrounds = 1000,
  eta = 0.01,
  max_depth = 8,
  gamma = 1,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

# Seed the RNG before fitting so the resampling is reproducible.
set.seed(123)

# Fit `sale` against all other columns of trans_train. The dangling comma
# after `method` in the original call (an empty trailing argument) is removed.
xgb1 <- train(
  sale ~ .,
  data = trans_train,
  trControl = xgb_trainControl,
  tuneGrid = xgb_grid,
  method = "xgbTree"
)
xgb1

# Predict on the held-out test set with the model fitted above. The original
# line used `lm1`, which is never defined in this snippet.
pred <- predict(xgb1, trans_test)
回答1:
The validation partition should not be used when you are creating the model - it should be 'set aside' until the model is trained and tuned using the 'training' and 'tuning' partitions, then you can apply the model to predict the outcome of the validation dataset and summarise how accurate the predictions were.
For example, in my own work I create three partitions: training (75%), tuning (10%) and testing/validation (15%), using:
# First split: sample ~75% of the rows for training
trainIndex <- createDataPartition(
  y = data$response,
  p = 0.75,
  times = 1,
  list = FALSE
)

# Rows selected by the index form the training set; the rest (~25%) are
# held back and divided again below
train_data <- data[trainIndex, , drop = FALSE]
tune_plus_val_data <- data[-trainIndex, , drop = FALSE]

# Second split: sample ~60% of the held-back rows
tune_plus_val_index <- createDataPartition(
  y = tune_plus_val_data$response,
  p = 0.6,
  times = 1,
  list = FALSE
)

# The sampled ~60% become the validation set, the remaining ~40% the tuning set
val_data <- tune_plus_val_data[tune_plus_val_index, , drop = FALSE]
tune_data <- tune_plus_val_data[-tune_plus_val_index, , drop = FALSE]

# Resulting share of the full dataset (100%):
#   training   ~75%
#   tuning     ~10%
#   validation ~15%
These data partitions are converted to xgb.DMatrix matrices ("dtrain", "dtune", "dval"). I then use the 'training' partition to train models and the 'tuning' partition to tune hyperparameters (e.g. random grid search) and evaluate model training (e.g. cross validation). This is ~equivalent to the code in your question.
# Apply the tuned hyperparameters (mytune$x) to the learner `lrn`.
# `lrn` and `mytune` are created outside this snippet.
lrn_tune <- setHyperPars(lrn, par.vals = mytune$x)

# Booster parameters for xgb.train(): objective, eta, max_depth,
# min_child_weight and colsample_bytree come from the tuned learner;
# gamma and subsample are fixed by hand.
params2 <- list(
  booster = "gbtree",
  objective = lrn_tune$par.vals$objective,
  eta = lrn_tune$par.vals$eta,
  gamma = 0,
  max_depth = lrn_tune$par.vals$max_depth,
  min_child_weight = lrn_tune$par.vals$min_child_weight,
  subsample = 0.8,
  colsample_bytree = lrn_tune$par.vals$colsample_bytree
)

# Train on `dtrain` while evaluating on both the training and tuning sets.
# Early stopping monitors the LAST watchlist entry by default, so the tuning
# set (`val = dtune`) is listed last; the original order monitored the
# training set. The entry names are unchanged, so the evaluation-log column
# names stay the same.
# NOTE(review): nrounds equals early_stopping_rounds (50), so training can
# never stop early; raise nrounds if early stopping is actually wanted.
xgb2 <- xgb.train(
  params = params2,
  data = dtrain,
  nrounds = 50,
  watchlist = list(train = dtrain, val = dtune),
  print_every_n = 10,
  early_stopping_rounds = 50,
  maximize = FALSE,
  eval_metric = "error"
)
Once the model is trained, I apply it to the validation data with predict():
# Predicted values for the validation partition
xgbpred2_keep <- predict(xgb2, dval)

# One row per validation sample: prediction, row name and true response.
# The original used `rownames(val)`, but `val` is not defined in this snippet;
# the validation data frame is `val_data`.
xg2_val <- data.frame(
  "Prediction" = xgbpred2_keep,
  "Patient" = rownames(val_data),
  "Response" = val_data$response
)

# Reorder Patients according to Response so the bars are grouped by outcome
xg2_val$Patient <- factor(
  xg2_val$Patient,
  levels = xg2_val$Patient[order(xg2_val$Response)]
)

# Bar chart of the prediction per patient, filled by the true response
ggplot(xg2_val, aes(x = Patient, y = Prediction, fill = Response)) +
  geom_bar(stat = "identity") +
  theme_bw(base_size = 16) +
  labs(
    title = paste0(
      "Patient predictions (xgb2) for the validation dataset (n = ",
      nrow(val_data), ")"
    ),
    subtitle = "Above 0.5 = Non-Responder, Below 0.5 = Responder",
    caption = paste("JM", Sys.Date()),
    x = ""
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8)
  ) +
  # Distance from red line = confidence of prediction
  geom_hline(yintercept = 0.5, colour = "red")
# Convert predictions to binary outcome (responder / non-responder)
# Predictions for the validation partition, computed once and reused below
xgbpred2_prob <- predict(xgb2, dval)

# Convert predictions to binary outcome (responder / non-responder)
# using a 0.5 cut-off
xgbpred2_binary <- ifelse(xgbpred2_prob > 0.5, 1, 0)

# Results matrix (i.e. true positives/negatives & false positives/negatives).
# NOTE(review): `labels_tv` is defined outside this snippet; presumably the
# true 0/1 labels of the validation rows -- confirm.
confusionMatrix(as.factor(xgbpred2_binary), as.factor(labels_tv))

# Summary of results: one row per validation sample. The original used
# `rownames(val)`, but `val` is not defined in this snippet; the validation
# data frame is `val_data`.
Summary_of_results <- data.frame(
  Patient_ID = rownames(val_data),
  label = labels_tv,
  pred = xgbpred2_binary
)

# Flag each prediction as matching the true label or not
Summary_of_results$eval <- ifelse(
  Summary_of_results$label != Summary_of_results$pred,
  "wrong",
  "correct"
)

# Raw prediction rounded to two decimals, plus the `variants` column
Summary_of_results$conf <- round(xgbpred2_prob, 2)
Summary_of_results$CDS <- val_data$`variants`
Summary_of_results
This provides you with a summary of how well the model 'works' on your validation data.
来源:https://stackoverflow.com/questions/63256553/train-validation-test-split-model-in-caret-in-r