问题
One of the great things about pivot tables in excel is that they provide subtotals automatically. First, I would like to know if there is anything already created within dplyr that can accomplish this. If not, what is the easiest way to achieve it?
In the example below, I show the mean displacement by number of cylinders and carburetors. For each group of cylinders (4,6,8), I'd like to see the mean displacement for the group (or total displacement, or any other summary statistic).
library(dplyr)
mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl carb mean(disp)
1 4 1 91.38
2 4 2 116.60
3 6 1 241.50
4 6 4 163.80
5 6 6 145.00
6 8 2 345.50
7 8 3 275.80
8 8 4 405.50
9 8 8 301.00
回答1:
data.table It's very clunky, but this is one way:
library(data.table)
DT <- data.table(mtcars)
rbind(
DT[,.(mean(disp)), by=.(cyl,carb)],
DT[,.(mean(disp), carb=NA), by=.(cyl) ],
DT[,.(mean(disp), cyl=NA), by=.(carb)]
)[order(cyl,carb)]
This gives
cyl carb V1
1: 4 1 91.3800
2: 4 2 116.6000
3: 4 NA 105.1364
4: 6 1 241.5000
5: 6 4 163.8000
6: 6 6 145.0000
7: 6 NA 183.3143
8: 8 2 345.5000
9: 8 3 275.8000
10: 8 4 405.5000
11: 8 8 301.0000
12: 8 NA 353.1000
13: NA 1 134.2714
14: NA 2 208.1600
15: NA 3 275.8000
16: NA 4 308.8200
17: NA 6 145.0000
18: NA 8 301.0000
I'd rather see results in something like an R table
, but don't know of any functions for that.
dplyr @akrun found this analogous code
bind_rows(
mtcars %>%
group_by(cyl, carb) %>%
summarise(Mean= mean(disp)),
mtcars %>%
group_by(cyl) %>%
summarise(carb=NA, Mean=mean(disp)),
mtcars %>%
group_by(carb) %>%
summarise(cyl=NA, Mean=mean(disp))
) %>% arrange(cyl, carb)
We could wrap the repeat operations in a function
library(lazyeval)
f1 <- function(df, grp, Var, func){
FUN <- match.fun(func)
df %>%
group_by_(.dots=grp) %>%
summarise_(interp(~FUN(v), v=as.name(Var)))
}
m1 <- f1(mtcars, c('carb', 'cyl'), 'disp', 'mean')
m2 <- f1(mtcars, 'carb', 'disp', 'mean')
m3 <- f1(mtcars, 'cyl', 'disp', 'mean')
bind_rows(list(m1, m2, m3)) %>%
arrange(cyl, carb) %>%
rename(Mean=`FUN(disp)`)
carb cyl Mean
1 1 4 91.3800
2 2 4 116.6000
3 NA 4 105.1364
4 1 6 241.5000
5 4 6 163.8000
6 6 6 145.0000
7 NA 6 183.3143
8 2 8 345.5000
9 3 8 275.8000
10 4 8 405.5000
11 8 8 301.0000
12 NA 8 353.1000
13 1 NA 134.2714
14 2 NA 208.1600
15 3 NA 275.8000
16 4 NA 308.8200
17 6 NA 145.0000
18 8 NA 301.0000
Either option can be made a little less ugly with data.table's rbindlist
with fill
:
rbindlist(list(
mtcars %>% group_by(cyl) %>% summarise(mean(disp)),
mtcars %>% group_by(carb) %>% summarise(mean(disp)),
mtcars %>% group_by(cyl,carb) %>% summarise(mean(disp))
),fill=TRUE) %>% arrange(cyl,carb)
rbindlist(list(
DT[,mean(disp),by=.(cyl,carb)],
DT[,mean(disp),by=.(cyl)],
DT[,mean(disp),by=.(carb)]
),fill=TRUE)[order(cyl,carb)]
回答2:
Something similar to table
with addmargins
(although actually a data.frame
)
library(dplyr)
library(reshape2)
out <- bind_cols(
mtcars %>% group_by(cyl, carb) %>%
summarise(mu = mean(disp)) %>%
dcast(cyl ~ carb),
(mtcars %>% group_by(cyl) %>% summarise(Total=mean(disp)))[,2]
)
margin <- t((mtcars %>% group_by(carb) %>% summarise(Total=mean(disp)))[,2])
rbind(out, c(NA, margin, mean(mtcars$disp))) %>%
`rownames<-`(c(paste("cyl", c(4,6,8)), "Total")) # add some row names
# cyl 1 2 3 4 6 8 Total
# cyl 4 4 91.3800 116.60 NA NA NA NA 105.1364
# cyl 6 6 241.5000 NA NA 163.80 145 NA 183.3143
# cyl 8 8 NA 345.50 275.8 405.50 NA 301 353.1000
# Total NA 134.2714 208.16 275.8 308.82 145 301 230.7219
The bottom row is the column wise margins, columns named 1:8 are carbs, and Total is the rowwise margins.
回答3:
Also possible by simply joining the two group results:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean(disp))
joined <- full_join(cyl_carb, cyl)
result <- arrange(joined, cyl)
result
gives:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb mean(disp)
(dbl) (dbl) (dbl)
1 4 1 91.3800
2 4 2 116.6000
3 4 NA 105.1364
4 6 1 241.5000
5 6 4 163.8000
6 6 6 145.0000
7 6 NA 183.3143
8 8 2 345.5000
9 8 3 275.8000
10 8 4 405.5000
11 8 8 301.0000
12 8 NA 353.1000
or with an additional column:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean.cyl = mean(disp))
joined <- full_join(cyl_carb, cyl)
joined
gives:
Source: local data frame [9 x 4]
Groups: cyl [?]
cyl carb mean(disp) mean.cyl
(dbl) (dbl) (dbl) (dbl)
1 4 1 91.38 105.1364
2 4 2 116.60 105.1364
3 6 1 241.50 183.3143
4 6 4 163.80 183.3143
5 6 6 145.00 183.3143
6 8 2 345.50 353.1000
7 8 3 275.80 353.1000
8 8 4 405.50 353.1000
9 8 8 301.00 353.1000
回答4:
I know that this may not be a very elegant solution, but I hope it helps anyway:
p <-mtcars %>% group_by(cyl,carb)
p$cyl <- as.factor(p$cyl)
average_disp <- sapply(1:length(levels(p$cyl)), function(x)mean(subset(p,p$cyl==levels(p$cyl)[x])$disp))
df <- data.frame(levels(p$cyl),average_disp)
colnames(df)[1]<-"cyl"
#> df
# cyl average_disp
#1 4 105.1364
#2 6 183.3143
#3 8 353.1000
(Edit: After a minor modification in the definition of p
this now yields the same results as @Frank's and @akrun's solution)
回答5:
Here is a simple one-liner creating margins within a data_frame:
library(plyr)
library(dplyr)
# Margins without labels
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), Mean_Disp=sum(.$Mean_Disp, na.rm=T))))
output:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <dbl> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 NA 207.98
4 6 1 241.50
5 6 4 163.80
6 6 6 145.00
7 6 NA 550.30
8 8 2 345.50
9 8 3 275.80
10 8 4 405.50
11 8 8 301.00
12 8 NA 1327.80
You may also add labels for the summary statistics like:
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), carb=c("Total", "Mean"), Mean_Disp=c(sum(.$Mean_Disp, na.rm=T), mean(.$Mean_Disp, na.rm=T)))))
output:
Source: local data frame [15 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <chr> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 Total 207.98
4 4 Mean 103.99
5 6 1 241.50
6 6 4 163.80
7 6 6 145.00
8 6 Total 550.30
9 6 Mean 183.43
10 8 2 345.50
11 8 3 275.80
12 8 4 405.50
13 8 8 301.00
14 8 Total 1327.80
15 8 Mean 331.95
回答6:
You can use this wrapper around ddply
, which applies ddply
for each possible margin and rbinds
the results with its usual output.
To marginalize over all grouping factors:
mtcars %>% ddplym(.variables = .(cyl, carb), .fun = summarise, mean(disp))
To marginalize over carb
only:
mtcars %>% ddplym(
.variables = .(carb),
.fun = function(data) data %>% group_by(cyl) %>% summarise(mean(disp)))
Wrapper:
require(plyr)
require(dplyr)
ddplym <- function(.data, .variables, .fun, ..., .margin = TRUE, .margin_name = '(all)') {
if (.margin) {
df <- .ddplym(.data, .variables, .fun, ..., .margin_name = .margin_name)
} else {
df <- ddply(.data, .variables, .fun, ...)
if (.variables %>% length == 0) {
df$.id <- NULL
}
}
return(df)
}
.ddplym <- function(.data,
.variables,
.fun,
...,
.margin_name = '(all)'
) {
.variables <- as.quoted(.variables)
n <- length(.variables)
var_combn_idx <- lapply(0:n, function(x) {
combn(1:n, n - x) %>% alply(2, c)
}) %>%
unlist(recursive = FALSE, use.names = FALSE)
data_list <- lapply(var_combn_idx, function(x) {
data <- ddply(.data, .variables[x], .fun, ...)
# drop '.id' column created when no variables to split by specified
if (!length(.variables[x]))
data <- data[, -1, drop = FALSE]
return(data)
})
# workaround for NULL .variables
if (unlist(.variables) %>% is.null && names(.variables) %>% is.null) {
data_list <- data_list[1]
} else if (unlist(.variables) %>% is.null) {
data_list <- data_list[2]
}
if (length(data_list) > 1) {
data_list <- lapply(data_list, function(data)
rbind_pre(
data = data,
colnames = colnames(data_list[[1]]),
fill = .margin_name
))
}
Reduce(rbind, data_list)
}
rbind_pre <- function(data, colnames, fill = NA) {
colnames_fill <- setdiff(colnames, colnames(data))
data_fill <- matrix(fill,
nrow = nrow(data),
ncol = length(colnames_fill)) %>%
as.data.frame %>% setNames(colnames_fill)
cbind(data, data_fill)[, colnames]
}
来源:https://stackoverflow.com/questions/31164350/dplyr-summarize-with-subtotals