Automatically compare nested models from mice's glm.mids

问题

I have a multiply-imputed model from R's mice package in which there are lots of factor variables. For example:

library(mice)
library(Hmisc)

# turn all the variables into factors
fake = nhanes
fake$age = as.factor(nhanes$age)
fake$bmi = cut2(nhanes$bmi, g=3) 
fake$chl = cut2(nhanes$chl, g=3) 

head(fake)
  age         bmi hyp       chl
1   1        <NA>  NA      <NA>
2   2 [20.4,25.5)   1 [187,206)
3   1        <NA>   1 [187,206)
4   3        <NA>  NA      <NA>
5   1 [20.4,25.5)   1 [113,187)
6   3        <NA>  NA [113,187)

imput = mice(nhanes)

# big model
fit1 = glm.mids((hyp==2) ~ age + bmi + chl, data=imput, family = binomial)

I want to test the significance of each entire factor variable in the model (not the indicator variables for each level) by testing the full model against each possible nested model that drops one variable at a time. Manually, I can do:

# small model (no chl)
fit2 = glm.mids((hyp==2) ~ age + bmi, data=imput, family = binomial)

# extract p-value from pool.compare
pool.compare(fit1, fit2)$pvalue

How can I do this automatically for all the factor variables in my model? The very helpful function drop1 was suggested to me for a previous question -- now I want to do something exactly like that except for the mice case.

Possibly helpful note: An annoying feature of pool.compare is that it appears to want the "extra" variables in the larger model to be placed after the ones that are shared with the smaller model.

回答1:

You can use a loop to iterate through the different combinations of predictors, after arranging them in the order required for pool.compare.

So using your fake data from above - tweaked the number of categories

library(mice)
library(Hmisc)
# turn all the variables into factors
# turn all the variables into factors
fake <- nhanes
fake$age <- as.factor(nhanes$age)
fake$bmi <- cut2(nhanes$bmi, g=2) 
fake$chl <- cut2(nhanes$chl, g=2) 

# Impute
imput <- mice(fake, seed=1)

# Create models 
# - reduced models with one variable removed
# - full models with extra variables at end of expression
vars <- c("age", "bmi", "chl")

red <- combn(vars, length(vars)-1 , simplify=FALSE)
diffs <- lapply(red, function(i) setdiff(vars, i) )
(full <- lapply(1:length(red), function(i) 
                            paste(c(red[[i]], diffs[[i]]), collapse=" + ")))
#[[1]]
#[1] "age + bmi + chl"

#[[2]]
#[1] "age + chl + bmi"

#[[3]]
#[1] "bmi + chl + age"

(red <- combn(vars, length(vars)-1 , FUN=paste, collapse=" + "))
#[1] "age + bmi" "age + chl" "bmi + chl"

The models are now in the correct order to pass to the glm call. I've also replaced glm.mids method as it has been replaced by with.mids - see ?glm.mids

out <- vector("list", length(red))

for( i in 1:length(red)) {

  redMod <-  with(imput, 
               glm(formula(paste("(hyp==2) ~ ", red[[i]])), family = binomial))

  fullMod <-  with(imput, 
               glm(formula(paste("(hyp==2) ~ ", full[[i]])), family = binomial))

  out[[i]] <- list(predictors = diffs[[i]], 
                   pval = c(pool.compare(fullMod, redMod)$pvalue))
   }

do.call(rbind.data.frame, out)
#    predictors      pval
#2         chl 0.9976629
#21        bmi 0.9985028
#3         age 0.9815831

# Check manually by leaving out chl
mod1 <- with(imput, glm((hyp==2) ~ age + bmi + chl , family = binomial))
mod2 <- with(imput, glm((hyp==2) ~ age + bmi , family = binomial))
pool.compare(mod1, mod2)$pvalue
#         [,1]
#[1,] 0.9976629

You will get a lot of warnings using this dataset

EDIT

You could wrap this in a function

impGlmDrop1 <- function(vars, outcome, Data=imput,  Family="binomial") 
{

  red <- combn(vars, length(vars)-1 , simplify=FALSE)
  diffs <- lapply(red, function(i) setdiff(vars, i))
  full <- lapply(1:length(red), function(i) 
                      paste(c(red[[i]], diffs[[i]]), collapse=" + "))
  red <- combn(vars, length(vars)-1 , FUN=paste, collapse=" + ")

  out <- vector("list", length(red))
  for( i in 1:length(red)) {

  redMod <-  with(Data, 
              glm(formula(paste(outcome, red[[i]], sep="~")), family = Family))
  fullMod <-  with(Data, 
              glm(formula(paste(outcome, full[[i]], sep="~")), family = Family))
  out[[i]] <- list(predictors = diffs[[i]], 
                   pval = c(pool.compare(fullMod, redMod)$pvalue)  )
  }
  do.call(rbind.data.frame, out)
}

# Run
impGlmDrop1(c("age", "bmi", "chl"), "(hyp==2)")

来源：https://stackoverflow.com/questions/26621092/automatically-compare-nested-models-from-mices-glm-mids

标签

anova

categorical-data

r-mice