R: repeat linear regression for all variables and save results in a new data frame

前端 未结 3 973
醉话见心
醉话见心 2021-01-28 17:03

I have a data frame named “dat” with 10 numeric variables (var1, var2,var3,var4 , var5,…var 10), each with several observations…

dat

   var1 var2 var3 var4 var         


        
相关标签:
3条回答
  • 2021-01-28 17:40

    You can try the following code to have the desired output

    data <- structure(list(var1 = c(12L, 3L, 13L, 17L, 9L, 15L, 12L, 3L, 
    13L), var2 = c(5L, 2L, 15L, 11L, 13L, 6L, 5L, 2L, 15L), var3 = c(18L, 
    10L, 14L, 16L, 8L, 20L, 18L, 10L, 14L), var4 = c(19L, 6L, 13L, 
    18L, 8L, 17L, 19L, 6L, 13L), var5 = c(12L, 13L, 1L, 10L, 7L, 
    3L, 12L, 13L, 1L), var6 = c(17L, 17L, 17L, 17L, 17L, 17L, 17L, 
    17L, 17L), var7 = c(11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L
    ), var8 = c(16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), var9 = c(18L, 
    18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L), var10 = c(10L, 10L, 
    10L, 10L, 10L, 10L, 10L, 10L, 10L)), class = "data.frame", row.names = c(NA, 
    -9L))
    
    head(data,2)
    #>   var1 var2 var3 var4 var5 var6 var7 var8 var9 var10
    #> 1   12    5   18   19   12   17   11   16   18    10
    #> 2    3    2   10    6   13   17   11   16   18    10
    
    x = names(data[,-1])
    out <- unlist(lapply(1, function(n) combn(x, 1, FUN=function(row) paste0("var1 ~ ", paste0(row, collapse = "+")))))
    out
    #> [1] "var1 ~ var2"  "var1 ~ var3"  "var1 ~ var4"  "var1 ~ var5" 
    #> [5] "var1 ~ var6"  "var1 ~ var7"  "var1 ~ var8"  "var1 ~ var9" 
    #> [9] "var1 ~ var10"
    
    library(broom)
    #> Warning: package 'broom' was built under R version 3.5.3
    
    library(dplyr)
    #> Warning: package 'dplyr' was built under R version 3.5.3
    #> 
    #> Attaching package: 'dplyr'
    #> The following objects are masked from 'package:stats':
    #> 
    #>     filter, lag
    #> The following objects are masked from 'package:base':
    #> 
    #>     intersect, setdiff, setequal, union
    
    #To have the regression coefficients
    tmp1 = bind_rows(lapply(out, function(frml) {
     a = tidy(lm(frml, data=data))
     a$frml = frml
     return(a)
    }))
    head(tmp1)
    #> # A tibble: 6 x 6
    #>   term        estimate std.error statistic p.value frml       
    #>   <chr>          <dbl>     <dbl>     <dbl>   <dbl> <chr>      
    #> 1 (Intercept)    6.46      2.78      2.33  0.0529  var1 ~ var2
    #> 2 var2           0.525     0.288     1.82  0.111   var1 ~ var2
    #> 3 (Intercept)   -1.50      4.47     -0.335 0.748   var1 ~ var3
    #> 4 var3           0.863     0.303     2.85  0.0247  var1 ~ var3
    #> 5 (Intercept)    0.649     2.60      0.250 0.810   var1 ~ var4
    #> 6 var4           0.766     0.183     4.18  0.00413 var1 ~ var4
    
    #To have the regression results i.e. R2, AIC, BIC
    tmp2 = bind_rows(lapply(out, function(frml) {
     a = glance(lm(frml, data=data))
     a$frml = frml
     return(a)
    }))
    head(tmp2)
    #> # A tibble: 6 x 12
    #>   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
    #>       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
    #> 1     0.321         0.224  4.33      3.31  0.111       2  -24.8  55.7  56.3
    #> 2     0.537         0.471  3.58      8.12  0.0247      2  -23.1  52.2  52.8
    #> 3     0.714         0.673  2.81     17.5   0.00413     2  -20.9  47.9  48.5
    #> 4     0.276         0.173  4.47      2.67  0.146       2  -25.1  56.2  56.8
    #> 5     0             0      4.92     NA    NA           1  -26.6  57.2  57.6
    #> 6     0             0      4.92     NA    NA           1  -26.6  57.2  57.6
    #> # ... with 3 more variables: deviance <dbl>, df.residual <int>, frml <chr>
    
    write.csv(tmp1, "Try_lm_coefficients.csv")
    write.csv(tmp2, "Try_lm_results.csv")
    

    Created on 2019-11-20 by the reprex package (v0.3.0)

    0 讨论(0)
  • 2021-01-28 17:41

    There a several ways to do what you want in R. I suggest sapply which is a simple way to apply a function other a list of variables. Here is an example to get the coefficients of each linear regression between var1 and all other variables.

    # define a function to get coefficients from linear regression
    do_lm <- function(var){ # var is the name of the column
      res <- lm(as.formula(paste0("var1~",var)), data = dat) # compute linear regression
      coefs <- c(intercept = res$coefficient[2], slope = res$coefficient[1]) # get coefficients
      return(coefs)
    }
    
    t(
      sapply(colnames(dat)[2:10], do_lm)
     )
    # t transposes the result 
    # sapply : applies on "var2" ... "var10" the function do_lm
    

    It returns :

          intercept.var2 slope.(Intercept)
    var2       0.5251232         6.4600985
    var3       0.8630573        -1.4968153
    var4       0.7660377         0.6490566
    var5      -0.5047619        14.8158730
    var6              NA        10.7777778
    var7              NA        10.7777778
    var8              NA        10.7777778
    var9              NA        10.7777778
    var10             NA        10.7777778
    

    You can adapt the function do_lm in sapply to compute other things, like correlations ...

    0 讨论(0)
  • 2021-01-28 17:46
    dat <- structure(list(var1 = c(12L, 3L, 13L, 17L, 9L, 15L, 12L, 3L, 
    13L), var2 = c(5L, 2L, 15L, 11L, 13L, 6L, 5L, 2L, 15L), var3 = c(18L, 
    10L, 14L, 16L, 8L, 20L, 18L, 10L, 14L), var4 = c(19L, 6L, 13L, 
    18L, 8L, 17L, 19L, 6L, 13L), var5 = c(12L, 13L, 1L, 10L, 7L, 
    3L, 12L, 13L, 1L), var6 = c(17L, 17L, 17L, 17L, 17L, 17L, 17L, 
    17L, 17L), var7 = c(11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L
    ), var8 = c(16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), var9 = c(18L, 
    18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L), var10 = c(10L, 10L, 
    10L, 10L, 10L, 10L, 10L, 10L, 10L)), class = "data.frame", row.names = c("1", 
    "2", "3", "4", "5", "6", "7", "8", "9"))
    

    We first write a function to obtain all the statistics you need. Note, rsq is the square of the correlation coefficient. So you don't need the linear model. The coefficient you get from the model is the slope.

    STATS = function(x,y,DATA){
     COR = cor.test(DATA[,y],DATA[,x])
     MODEL = summary(lm(DATA[,y]~DATA[,x]))
     data.frame(
     VAR=x,
     PEARSON_COR=as.numeric(COR$estimate),
     PVAL=COR$p.value,
     RSQ=as.numeric(COR$estimate^2),
     SLOPE = MODEL$coefficients[2,1],
     stringsAsFactors=FALSE
     )
    }
    

    We test it on var2

    STATS("var2","var1",dat)
    
         VAR PEARSON_COR      PVAL      RSQ     SLOPE
    1 var2   0.5668721 0.1114741 0.321344 0.5251232
    

    We do it for example on var2,var3,var4 and combine them into a data frame. Note I did not try var 6 to 10 because it's only 1 value

    results = do.call(rbind,
    lapply(c("var2","var3","var4"),function(i)STATS(i,"var1",dat)))
    results
    
        VAR PEARSON_COR        PVAL       RSQ     SLOPE
    1 var2   0.5668721 0.111474101 0.3213440 0.5251232
    2 var3   0.7328421 0.024699805 0.5370575 0.8630573
    3 var4   0.8450726 0.004127542 0.7141477 0.7660377
    

    If you are familiar with tidyverse and purrr, you can do the following:

    library(dplyr)
    library(purrr)
    c("var2","var3","var4") %>% map_dfr(STATS,"var1",dat)
    
    0 讨论(0)
提交回复
热议问题