How to fit distribution on multiple columns using broom and dplyr

谁都会走 提交于 2021-01-05 08:03:05

问题


I have the following data:


library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
set.seed(1)
df <- data_frame(
  genes = paste("Gene_",letters[0:10],sep=""),
  X = rnorm(10, 0, 1),
  Y = rnorm(10, 0, 2),
  Z = rnorm(10, 0, 4))

df
#> # A tibble: 10 × 4
#>     genes          X           Y          Z
#>     <chr>      <dbl>       <dbl>      <dbl>
#> 1  Gene_a -0.6264538  3.02356234  3.6759095
#> 2  Gene_b  0.1836433  0.77968647  3.1285452
#> 3  Gene_c -0.8356286 -1.24248116  0.2982599
#> 4  Gene_d  1.5952808 -4.42939977 -7.9574068
#> 5  Gene_e  0.3295078  2.24986184  2.4793030
#> 6  Gene_f -0.8204684 -0.08986722 -0.2245150
#> 7  Gene_g  0.4874291 -0.03238053 -0.6231820
#> 8  Gene_h  0.7383247  1.88767242 -5.8830095
#> 9  Gene_i  0.5757814  1.64244239 -1.9126002
#> 10 Gene_j -0.3053884  1.18780264  1.6717662

I can calculate for just column X this way:


fit_X <- MASS::fitdistr(df$X,"normal")
broom::tidy(fit_X)
#>   term  estimate std.error
#> 1 mean 0.1322028 0.2341758
#> 2   sd 0.7405289 0.1655873
broom::glance(fit_X)
#>    n    logLik      AIC      BIC
#> 1 10 -11.18548 26.37096 26.97613

How can I do it for all columns (except the first one - genes) so that in the end I get:

    mean.estimate sd.estimate mean.stderror sd.stderror n loglik    AIC      BIC
X      0.1322028  0.7405289   0.2341758    0.1655873   10 -11.18548 26.37096 26.97613
Y      0.4976899  2.0292617   0.6417089    0.4537      10 -21.26611 46.53221 47.13738
Z     -0.534693   3.626276    1.1467291    0.8108599   10 -27.07145 58.14289 58.74806

回答1:


You can use tidyr::gather, purrr::map & broom::glance as follows:

df_new <- gather(df,var, val,-genes) %>% 
        group_by(var) %>% 
        summarise(vec = val %>% list) %>% 
        mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>% 
        mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>% 
        mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>% 
        mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>% 
        mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd")) %>% 
        mutate(mod_glance = map(mod, glance)) %>% 
        select(-vec, -mod) %>% 
        unnest()

The breakdown of this code chunk is as follows:

Get the vector of each group in a column

gather(df,var, val,-genes) %>% 
        group_by(var) %>% 
        summarise(vec = val %>% list)

# A tibble: 3 × 2
    var        vec
  <chr>     <list>
1     X <dbl [10]>
2     Y <dbl [10]>
3     Z <dbl [10]>

Add the model in a new column

df_new <- gather(df,var, val,-genes) %>% 
        group_by(var) %>% 
        summarise(vec = val %>% list) %>% 
        mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) 
# A tibble: 3 × 3
    var        vec            mod
  <chr>     <list>         <list>
1     X <dbl [10]> <S3: fitdistr>
2     Y <dbl [10]> <S3: fitdistr>
3     Z <dbl [10]> <S3: fitdistr>

Add columns for estimates

df_new <- gather(df,var, val,-genes) %>% 
        group_by(var) %>% 
        summarise(vec = val %>% list) %>% 
        mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>% 
        mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>% 
        mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>% 
        mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>% 
        mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd"))

# A tibble: 3 × 7
    var        vec            mod mod_est_mean mod_est_sd mod_std_mean mod_std_error
  <chr>     <list>         <list>        <dbl>      <dbl>        <dbl>         <dbl>
1     X <dbl [10]> <S3: fitdistr>    0.1322028  0.7405289    0.2341758     0.1655873
2     Y <dbl [10]> <S3: fitdistr>    0.4976899  2.0292617    0.6417089     0.4537567
3     Z <dbl [10]> <S3: fitdistr>   -0.5346930  3.6262759    1.1467291     0.8108599

glance the model and unnest

gather(df,var, val,-genes) %>% 
        group_by(var) %>% 
        summarise(vec = val %>% list) %>% 
        mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>% 
        mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>% 
        mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>% 
        mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>% 
        mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd")) %>% 
        mutate(mod_glance = map(mod, glance)) %>% 
        select(-vec, -mod) %>% 
        unnest()

# A tibble: 3 × 9
    var mod_est_mean mod_est_sd mod_std_mean mod_std_error     n    logLik      AIC      BIC
  <chr>        <dbl>      <dbl>        <dbl>         <dbl> <int>     <dbl>    <dbl>    <dbl>
1     X    0.1322028  0.7405289    0.2341758     0.1655873    10 -11.18548 26.37096 26.97613
2     Y    0.4976899  2.0292617    0.6417089     0.4537567    10 -21.26611 46.53221 47.13738
3     Z   -0.5346930  3.6262759    1.1467291     0.8108599    10 -27.07145 58.14289 58.74806



回答2:


Try:

library(reshape2)
df <- melt(df)

df2 <- df %>% group_by(variable) %>% do(broom::tidy(MASS::fitdistr(.$value, "normal")))
df2 <- merge(df2 %>% dcast(variable ~ term, value.var = "estimate"), 
             df2 %>% dcast(variable ~ term, value.var = "std.error"), 
             by="variable", suffixes = c("_estimates", "_stderror")
)
df2 <- merge(df2, df %>% group_by(variable) %>% do(broom::glance(MASS::fitdistr(.$value, "normal"))))
df2
  variable mean_estimates sd_estimates mean_stderror sd_stderror  n    logLik      AIC      BIC
1        X      0.1322028    0.7405289     0.2341758   0.1655873 10 -11.18548 26.37096 26.97613
2        Y      0.4976899    2.0292617     0.6417089   0.4537567 10 -21.26611 46.53221 47.13738
3        Z     -0.5346930    3.6262759     1.1467291   0.8108599 10 -27.07145 58.14289 58.74806


来源:https://stackoverflow.com/questions/43849952/how-to-fit-distribution-on-multiple-columns-using-broom-and-dplyr

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!