Tabulate coefficients from lm

后端 未结 5 1622
失恋的感觉
失恋的感觉 2021-01-25 17:32

I have 10 linear models where I only need some information, namely: r-squared, p-value, coefficients of slope and intercept. I managed to extract these values (via ridiculously

相关标签:
5条回答
  • 2021-01-25 18:00

    Consider building a matrix of lm results. First create a defined function to handle your generalized data frame model build with results extraction. Then, call by which can subset your data frame by a factor column and pass each subset into defined method. Finally, rbind all grouped matrices together for a singular output

    lm_results <- function(df) {
    
      model <- lm(Qend ~ Rainfall, data = df)
      res <- summary(model)
    
      p <- res$fstatistic
    
      c(gradient = res$coefficients[2,1],
        intercept = res$coefficients[2,2],
        r_sq = res$r.squared,
        adj_r_sq = res$adj.r.squared,
        f_stat = p[['value']],
        p_value = unname(pf(p[1], p[2], p[3], lower.tail=FALSE))
      )
    }
    
    matrix_list <- by(d, d$group, lm_results)
    
    final_matrix <- do.call(rbind, matrix_list)
    

    To demonstrate on random, seeded data

    set.seed(12262018)
    data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
    
    d <- data.frame(
      group = sample(data_tools, 500, replace=TRUE),
      int = sample(1:15, 500, replace=TRUE),
      Qend = rnorm(500) / 100,
      Rainfall = rnorm(500) * 10
    )
    

    Results

    mat_list <- by(d, d$group, lm_results)
    
    final_matrix <- do.call(rbind, mat_list)
    final_matrix
    
    #             gradient    intercept        r_sq     adj_r_sq    f_stat    p_value
    # julia  -1.407313e-04 1.203832e-04 0.017219149  0.004619395 1.3666258 0.24595273
    # python -1.438116e-04 1.125170e-04 0.018641512  0.007230367 1.6336233 0.20464162
    # r       2.031717e-04 1.168037e-04 0.041432175  0.027738349 3.0256098 0.08635510
    # sas    -1.549510e-04 9.067337e-05 0.032476668  0.021355710 2.9203121 0.09103619
    # spss    9.326656e-05 1.068516e-04 0.008583473 -0.002682623 0.7618853 0.38511469
    # stata  -7.079514e-05 1.024010e-04 0.006013841 -0.006568262 0.4779679 0.49137093
    
    0 讨论(0)
  • 2021-01-25 18:00

    Using library(data.table) you can do

    d <- fread("example.csv")
    d[, .(
      r2         = (fit <- summary(lm(Qend~Rainfall)))$r.squared,
      adj.r2     = fit$adj.r.squared,
      intercept  = fit$coefficients[1,1], 
      gradient   = fit$coefficients[2,1],
      p.value    = {p <- fit$fstatistic; pf(p[1], p[2], p[3], lower.tail=FALSE)}),
      by  = CatChro]
    
    #    CatChro         r2       adj.r2   intercept     gradient      p.value
    # 1:    A3G1 0.03627553  0.011564648 0.024432020 0.0001147645 0.2329519751
    # 2:    A3D1 0.28069553  0.254054622 0.011876543 0.0004085644 0.0031181110
    # 3:    A3G2 0.06449971  0.041112205 0.026079409 0.0004583538 0.1045970987
    # 4:    A3D2 0.03384173  0.005425311 0.023601325 0.0005431693 0.2828170556
    # 5:    A3G3 0.07587433  0.054383038 0.043537869 0.0006964512 0.0670399684
    # 6:    A3D3 0.04285322  0.002972105 0.022106960 0.0001451185 0.3102578215
    # 7:    A3G4 0.17337420  0.155404076 0.023706652 0.0006442175 0.0032431299
    # 8:    A3D4 0.37219027  0.349768492 0.009301843 0.0006614213 0.0003442445
    # 9:    A3G5 0.17227383  0.150491566 0.025994831 0.0006658466 0.0077413595
    #10:    A3D5 0.04411669 -0.008987936 0.014341399 0.0001084626 0.3741011769
    
    0 讨论(0)
  • 2021-01-25 18:02

    Here in only a couple of lines:

    library(tidyverse)
    library(broom)
    # create grouped dataframe:
    df_g <- df %>% group_by(CatChro)
    df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>% 
       select(CatChro, term, estimate) %>% spread(term, estimate) %>% 
       left_join(df_g %>% do(glance(lm(Qend ~ Rainfall, data = .))) %>%
       select(CatChro, r.squared, adj.r.squared, p.value), by = "CatChro")
    

    And the result will be:

    # A tibble: 10 x 6
    # Groups:   CatChro [?]
       CatChro `(Intercept)` Rainfall r.squared adj.r.squared  p.value
       <chr>           <dbl>    <dbl>     <dbl>         <dbl>    <dbl>
     1 A3D1          0.0119  0.000409    0.281        0.254   0.00312 
     2 A3D2          0.0236  0.000543    0.0338       0.00543 0.283   
     3 A3D3          0.0221  0.000145    0.0429       0.00297 0.310   
     4 A3D4          0.00930 0.000661    0.372        0.350   0.000344
     5 A3D5          0.0143  0.000108    0.0441      -0.00899 0.374   
     6 A3G1          0.0244  0.000115    0.0363       0.0116  0.233   
     7 A3G2          0.0261  0.000458    0.0645       0.0411  0.105   
     8 A3G3          0.0435  0.000696    0.0759       0.0544  0.0670  
     9 A3G4          0.0237  0.000644    0.173        0.155   0.00324 
    10 A3G5          0.0260  0.000666    0.172        0.150   0.00774 
    

    So, how does this work?

    The following creates a dataframe with all coefficients and the corresponding statistics (tidy turns the result of lm into a dataframe):

    df_g %>%
      do(tidy(lm(Qend ~ Rainfall, data = .)))
    A tibble: 20 x 6
    Groups:   CatChro [10]
       CatChro term        estimate std.error statistic      p.value
       <chr>   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
     1 A3D1    (Intercept) 0.0119   0.00358       3.32  0.00258     
     2 A3D1    Rainfall    0.000409 0.000126      3.25  0.00312     
     3 A3D2    (Intercept) 0.0236   0.00928       2.54  0.0157      
     4 A3D2    Rainfall    0.000543 0.000498      1.09  0.283       
    

    I understand that you want to have the intercept and the coefficient on Rainfall as individual columns, so let's "spread" them out. This is achieved by first selecting the relevant columns, and then invoking tidyr::spread, as in

    select(CatChro, term, estimate) %>% spread(term, estimate)
    

    This gives you:

    df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>% 
      select(CatChro, term, estimate) %>% spread(term, estimate)
    A tibble: 10 x 3
    Groups:   CatChro [10]
       CatChro `(Intercept)` Rainfall
       <chr>           <dbl>    <dbl>
     1 A3D1          0.0119  0.000409
     2 A3D2          0.0236  0.000543
     3 A3D3          0.0221  0.000145
     4 A3D4          0.00930 0.000661
    

    Glance gives you the summary statistics you are looking for, for each model one. The models are indexed by group, here CatChro, so it is easy to just merge them onto the previous dataframe, which is what the rest of the code is about.

    0 讨论(0)
  • 2021-01-25 18:03

    Another solution, with lme4::lmList. The summary() method for objects produced by lmList does almost everything you want (although it doesn't store p-values, that's something I had to add below).

    m <- lme4::lmList(Qend~Rainfall|CatChro,data=d)
    s <- summary(m)
    pvals <- apply(s$fstatistic,1,function(x) pf(x[1],x[2],x[3],lower.tail=FALSE))
    data.frame(intercept=coef(s)[,"Estimate","(Intercept)"],
               slope=coef(s)[,"Estimate","Rainfall"],
               r.squared=s$r.squared,
               adj.r.squared=unlist(s$adj.r.squared),
               p.value=pvals)
    
    0 讨论(0)
  • 2021-01-25 18:06

    Here is a base R solution:

    data <- read.csv("./data/so53933238.csv",header=TRUE)
    
    # split by value of CatChro into a list of datasets
    dataList <- split(data,data$CatChro)
    
    # process the list with lm(), extract results to a data frame, write to a list
    lmResults <- lapply(dataList,function(x){
         y <- summary(lm(Qend ~ Rainfall,data = x))
         Intercept <- y$coefficients[1,1]
         Slope <- y$coefficients[2,1]
         rSquared <- y$r.squared
         adjRSquared <- y$adj.r.squared
         f <- y$fstatistic[1]
         pValue <- pf(y$fstatistic[1],y$fstatistic[2],y$fstatistic[3],lower.tail = FALSE)
         data.frame(Slope,Intercept,rSquared,adjRSquared,pValue)
    })
    lmResultTable <- do.call(rbind,lmResults)
    # add CatChro indicators
    lmResultTable$catChro <- names(dataList)
    
    lmResultTable 
    

    ...and the output:

        > lmResultTable
                Slope   Intercept   rSquared  adjRSquared       pValue catChro
    A3D1 0.0004085644 0.011876543 0.28069553  0.254054622 0.0031181110    A3D1
    A3D2 0.0005431693 0.023601325 0.03384173  0.005425311 0.2828170556    A3D2
    A3D3 0.0001451185 0.022106960 0.04285322  0.002972105 0.3102578215    A3D3
    A3D4 0.0006614213 0.009301843 0.37219027  0.349768492 0.0003442445    A3D4
    A3D5 0.0001084626 0.014341399 0.04411669 -0.008987936 0.3741011769    A3D5
    A3G1 0.0001147645 0.024432020 0.03627553  0.011564648 0.2329519751    A3G1
    A3G2 0.0004583538 0.026079409 0.06449971  0.041112205 0.1045970987    A3G2
    A3G3 0.0006964512 0.043537869 0.07587433  0.054383038 0.0670399684    A3G3
    A3G4 0.0006442175 0.023706652 0.17337420  0.155404076 0.0032431299    A3G4
    A3G5 0.0006658466 0.025994831 0.17227383  0.150491566 0.0077413595    A3G5
    >
    

    To render the output in a tabular format in HTML, one can use knitr::kable().

    library(knitr)
    kable(lmResultTable[1:5],row.names=TRUE,digits=5) 
    

    ...which produces the following output after rendering the Markdown:

    0 讨论(0)
提交回复
热议问题