问题
Given a tbl_df
object df
containing multiple variables (i.e. Var.50, Var.100, Var.150 and Var.200), measured twice (i.e. P1 and P2), I want to mutate
a new set of the same variables from repeated measurements (for example, average P1 and P2, creating P3 for each corresponding variable).
Similar questions have been asked before, but there does not seem to have clear answers using dplyr.
Example data:
df <- structure(list(P1.Var.50 = c(134.242050170898, 52.375, 177.126017252604
), P1.Var.100 = c(395.202219645182, 161.636606852214, 538.408426920573
), P1.Var.150 = c(544.40028889974, 266.439168294271, 718.998555501302
), P1.Var.200 = c(620.076151529948, 333.218780517578, 837.109700520833
), P2.Var.50 = c(106.133892059326, 113.252154032389, 172.384114583333
), P2.Var.100 = c(355.226725260417, 277.197153727214, 502.086781819661
), P2.Var.150 = c(481.993103027344, 329.575764973958, 709.315409342448
), P2.Var.200 = c(541.859161376953, 372.05473836263, 829.299621582031
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L), .Names = c("P1.Var.50", "P1.Var.100", "P1.Var.150", "P1.Var.200",
"P2.Var.50", "P2.Var.100", "P2.Var.150", "P2.Var.200"))
回答1:
Here is an option by gather
approach
library(tidyverse)
rownames_to_column(df, 'rn') %>%
gather( key, value, -rn) %>%
separate(key, into = c('key1', 'key2'), extra = 'merge', remove = FALSE) %>%
group_by(rn, key2) %>%
summarise(key3 = 'P3', value = mean(value)) %>%
unite(key, key3, key2) %>%
spread(key, value) %>%
ungroup() %>%
select(-rn) %>%
select(order(as.numeric(sub(".*\\.(\\d+)$", "\\1", names(.))))) %>%
bind_cols(df, .)
# A tibble: 3 x 12
# P1.Var.50 P1.Var.100 P1.Var.150 P1.Var.200 P2.Var.50 P2.Var.100 P2.Var.150 P2.Var.200 P3_Var.50 P3_Var.100 P3_Var.150 P3_Var.200
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 134.2421 395.2022 544.4003 620.0762 106.1339 355.2267 481.9931 541.8592 120.18797 375.2145 513.1967 580.9677
#2 52.3750 161.6366 266.4392 333.2188 113.2522 277.1972 329.5758 372.0547 82.81358 219.4169 298.0075 352.6368
#3 177.1260 538.4084 718.9986 837.1097 172.3841 502.0868 709.3154 829.2996 174.75507 520.2476 714.1570 833.2047
回答2:
Using dplyr
:
library(dplyr)
df1 <- df %>%
rowwise() %>%
mutate(P3.Var.50 = mean(c(P1.Var.50,P2.Var.50)),
P3.Var.100 = mean(c(P1.Var.100,P2.Var.100)),
P3.Var.150 = mean(c(P1.Var.150,P2.Var.150)),
P3.Var.200 = mean(c(P1.Var.200,P2.Var.200)))
-----------programmatically--------------
newcols <- sapply(seq(50,200,50), function(i) paste0("P3.Var.",i))
[1] "P3.Var.50" "P3.Var.100" "P3.Var.150" "P3.Var.200"
df1 <- df %>%
rowwise() %>%
mutate_(.dots = setNames(paste0("mean(c(",gsub("P3","P1",newcols),",",gsub("P3","P2",newcols),"))"), newcols))
回答3:
This is less general than Akrun's solution but if you don't have missing columns and that you know your categories P and Vars it should be faster (and shorter).
It uses only base R + the pipes:
np = 2
vars <- seq(50,200,by = 50)
df %>%
unlist %>%
matrix(ncol=np) %>%
cbind(rowMeans(.)) %>%
matrix(nrow=nrow(df)) %>%
`colnames<-`(c(names(df),paste0("P",np+1,".Var.",vars))) %>%
as.data.frame(stringsAsFactors=FALSE)
# P1.Var.50 P1.Var.100 P1.Var.150 P1.Var.200 P2.Var.50 P2.Var.100 P2.Var.150 P2.Var.200 P3.Var.50 P3.Var.100 P3.Var.150 P3.Var.200
# 1 134.2421 395.2022 544.4003 620.0762 106.1339 355.2267 481.9931 541.8592 120.18797 375.2145 513.1967 580.9677
# 2 52.3750 161.6366 266.4392 333.2188 113.2522 277.1972 329.5758 372.0547 82.81358 219.4169 298.0075 352.6368
# 3 177.1260 538.4084 718.9986 837.1097 172.3841 502.0868 709.3154 829.2996 174.75507 520.2476 714.1570 833.2047
来源:https://stackoverflow.com/questions/45103007/how-can-i-mutate-multiple-variables-using-dplyr