问题
I'm using tidymodels to fit a PLS model but I'm struggling to find the PLS variable importance scores or coefficients.
This is what I've tried so far; the example data is from AppliedPredictiveModeling package.
Modeling fitting
data(ChemicalManufacturingProcess)
split <- ChemicalManufacturingProcess %>% initial_split(prop = 0.7)
train <- training(split)
test <- testing(split)
tidy_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
boots <- bootstraps(time = 25, data = train)
tidy_model <- plsmod::pls(num_comp = tune()) %>%
set_mode("regression") %>%
set_engine("mixOmics")
tidy_grid <- expand.grid(num_comp = seq(from = 1, to = 48, by = 5))
tidy_tune <- tidy_model %>% tune_grid(
preprocessor = tidy_rec,
grid = tidy_grid,
resamples = boots,
metrics = metric_set(mae, rmse, rsq)
)
tidy_best <- tidy_tune %>% select_best("rsq")
Final_model <- tidy_model %>% finalize_model(tidy_best)
tidy_wf <- workflow() %>%
add_model(Final_model) %>%
add_recipe(tidy_rec)
Fit_PLS <- tidy_wf %>% fit(data = train)
# check the most important predictors
tidy_info <- Fit_PLS %>% pull_workflow_fit()
loadings <- tidy_info$fit$loadings$X
PLS variable importance
tidy_load <- loadings %>% as.data.frame() %>% rownames_to_column() %>%
select(rowname, comp1, comp2, comp3) %>%
pivot_longer(-rowname) %>%
rename(predictors = rowname)
tidy_load %>% mutate(Sing = if_else(value < 0, "neg", "pos")) %>%
mutate(absvalue = abs(value)) %>% group_by(predictors) %>% summarise(Importance = sum(absvalue)) %>%
mutate(predictors = fct_reorder(predictors, Importance)) %>%
slice_head(n = 15) %>%
ggplot(aes(Importance, predictors, fill = predictors)) + geom_col(show.legend = F)
Thanks! The vi()
function from the vip package is not available for this model.
回答1:
You can directly tidy()
the output of the PLS model to get the coefficients:
library(tidymodels)
library(tidyverse)
library(plsmod)
data(ChemicalManufacturingProcess, package = "AppliedPredictiveModeling")
split <- initial_split(ChemicalManufacturingProcess, prop = 0.7)
train <- training(split)
test <- testing(split)
chem_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
pls_spec <- pls(num_comp = 4) %>% ## can tune instead to find the optimal number
set_mode("regression") %>%
set_engine("mixOmics")
wf <- workflow() %>%
add_recipe(chem_rec) %>%
add_model(pls_spec)
pls_fit <- fit(wf, train)
## tidy the fitted model
tidy_pls <- pls_fit %>%
pull_workflow_fit()
tidy()
tidy_pls
#> # A tibble: 192 x 4
#> term value type component
#> <chr> <dbl> <chr> <dbl>
#> 1 BiologicalMaterial01 0.193 predictors 1
#> 2 BiologicalMaterial01 -0.247 predictors 2
#> 3 BiologicalMaterial01 0.00969 predictors 3
#> 4 BiologicalMaterial01 0.0228 predictors 4
#> 5 BiologicalMaterial03 0.249 predictors 1
#> 6 BiologicalMaterial03 -0.00118 predictors 2
#> 7 BiologicalMaterial03 0.0780 predictors 3
#> 8 BiologicalMaterial03 -0.0866 predictors 4
#> 9 BiologicalMaterial04 0.217 predictors 1
#> 10 BiologicalMaterial04 -0.192 predictors 2
#> # … with 182 more rows
tidy_pls %>%
filter(term != "Y") %>%
group_by(component) %>%
slice_max(abs(value), n = 10) %>%
ungroup() %>%
ggplot(aes(value, fct_reorder(term, value), fill = factor(component))) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, scales = "free_y") +
labs(y = NULL)
Created on 2020-10-19 by the reprex package (v0.3.0.9001)
I showed this without tuning the number of components, but it works about the same with tuning.
来源:https://stackoverflow.com/questions/64254295/predictor-importance-for-pls-model-trained-with-tidymodels