问题
I have the following data:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
set.seed(1)
df <- data_frame(
genes = paste("Gene_",letters[0:10],sep=""),
X = rnorm(10, 0, 1),
Y = rnorm(10, 0, 2),
Z = rnorm(10, 0, 4))
df
#> # A tibble: 10 × 4
#> genes X Y Z
#> <chr> <dbl> <dbl> <dbl>
#> 1 Gene_a -0.6264538 3.02356234 3.6759095
#> 2 Gene_b 0.1836433 0.77968647 3.1285452
#> 3 Gene_c -0.8356286 -1.24248116 0.2982599
#> 4 Gene_d 1.5952808 -4.42939977 -7.9574068
#> 5 Gene_e 0.3295078 2.24986184 2.4793030
#> 6 Gene_f -0.8204684 -0.08986722 -0.2245150
#> 7 Gene_g 0.4874291 -0.03238053 -0.6231820
#> 8 Gene_h 0.7383247 1.88767242 -5.8830095
#> 9 Gene_i 0.5757814 1.64244239 -1.9126002
#> 10 Gene_j -0.3053884 1.18780264 1.6717662
I can calculate for just column X this way:
fit_X <- MASS::fitdistr(df$X,"normal")
broom::tidy(fit_X)
#> term estimate std.error
#> 1 mean 0.1322028 0.2341758
#> 2 sd 0.7405289 0.1655873
broom::glance(fit_X)
#> n logLik AIC BIC
#> 1 10 -11.18548 26.37096 26.97613
How can I do it for all columns (except the first one - genes
) so that in the end I get:
mean.estimate sd.estimate mean.stderror sd.stderror n loglik AIC BIC
X 0.1322028 0.7405289 0.2341758 0.1655873 10 -11.18548 26.37096 26.97613
Y 0.4976899 2.0292617 0.6417089 0.4537 10 -21.26611 46.53221 47.13738
Z -0.534693 3.626276 1.1467291 0.8108599 10 -27.07145 58.14289 58.74806
回答1:
You can use tidyr::gather
, purrr::map
& broom::glance
as follows:
df_new <- gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>%
mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>%
mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>%
mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>%
mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd")) %>%
mutate(mod_glance = map(mod, glance)) %>%
select(-vec, -mod) %>%
unnest()
The breakdown of this code chunk is as follows:
Get the vector of each group in a column
gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list)
# A tibble: 3 × 2
var vec
<chr> <list>
1 X <dbl [10]>
2 Y <dbl [10]>
3 Z <dbl [10]>
Add the model in a new column
df_new <- gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") ))
# A tibble: 3 × 3
var vec mod
<chr> <list> <list>
1 X <dbl [10]> <S3: fitdistr>
2 Y <dbl [10]> <S3: fitdistr>
3 Z <dbl [10]> <S3: fitdistr>
Add columns for estimates
df_new <- gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>%
mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>%
mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>%
mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>%
mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd"))
# A tibble: 3 × 7
var vec mod mod_est_mean mod_est_sd mod_std_mean mod_std_error
<chr> <list> <list> <dbl> <dbl> <dbl> <dbl>
1 X <dbl [10]> <S3: fitdistr> 0.1322028 0.7405289 0.2341758 0.1655873
2 Y <dbl [10]> <S3: fitdistr> 0.4976899 2.0292617 0.6417089 0.4537567
3 Z <dbl [10]> <S3: fitdistr> -0.5346930 3.6262759 1.1467291 0.8108599
glance the model and unnest
gather(df,var, val,-genes) %>%
group_by(var) %>%
summarise(vec = val %>% list) %>%
mutate(mod = map(vec, ~MASS::fitdistr(.x, "normal") )) %>%
mutate(mod_est_mean = map(mod, ("estimate")) %>% map_dbl("mean")) %>%
mutate(mod_est_sd = map(mod, ("estimate")) %>% map_dbl("sd")) %>%
mutate(mod_std_mean = map(mod, ("sd")) %>% map_dbl("mean")) %>%
mutate(mod_std_error = map(mod, ("sd")) %>% map_dbl("sd")) %>%
mutate(mod_glance = map(mod, glance)) %>%
select(-vec, -mod) %>%
unnest()
# A tibble: 3 × 9
var mod_est_mean mod_est_sd mod_std_mean mod_std_error n logLik AIC BIC
<chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 X 0.1322028 0.7405289 0.2341758 0.1655873 10 -11.18548 26.37096 26.97613
2 Y 0.4976899 2.0292617 0.6417089 0.4537567 10 -21.26611 46.53221 47.13738
3 Z -0.5346930 3.6262759 1.1467291 0.8108599 10 -27.07145 58.14289 58.74806
回答2:
Try:
library(reshape2)
df <- melt(df)
df2 <- df %>% group_by(variable) %>% do(broom::tidy(MASS::fitdistr(.$value, "normal")))
df2 <- merge(df2 %>% dcast(variable ~ term, value.var = "estimate"),
df2 %>% dcast(variable ~ term, value.var = "std.error"),
by="variable", suffixes = c("_estimates", "_stderror")
)
df2 <- merge(df2, df %>% group_by(variable) %>% do(broom::glance(MASS::fitdistr(.$value, "normal"))))
df2
variable mean_estimates sd_estimates mean_stderror sd_stderror n logLik AIC BIC
1 X 0.1322028 0.7405289 0.2341758 0.1655873 10 -11.18548 26.37096 26.97613
2 Y 0.4976899 2.0292617 0.6417089 0.4537567 10 -21.26611 46.53221 47.13738
3 Z -0.5346930 3.6262759 1.1467291 0.8108599 10 -27.07145 58.14289 58.74806
来源:https://stackoverflow.com/questions/43849952/how-to-fit-distribution-on-multiple-columns-using-broom-and-dplyr