I am looking for your help. I am trying to divide a continuous variable in two groups, I put this example about what I am trying to do:
x=data.frame(v1=c(1,1
The first step could have been more economically done with an identical result as:
g1=x[x$v1>20,]
g2=x[x$v1<=20,] # since "[" would have returned a dataframe
But why not skip that step and do this instead:
do.call(cbind, by(x$v1, list(v1GT20 = x$v1 > 20),
function(v) c(Mean=mean(v), SD=sd(v), N=length(v)) ) )
FALSE TRUE
Mean 6.21622 62.6154
SD 3.55227 26.8096
N 37.00000 13.0000
If you want to get cuts at a variety of locations then use the cut function to split and identify the groups:
do.call(cbind, by(x$v1, cut( x$v1 , breaks=c(10,30,50,70,90,110,130,150,170,190) ),
function(v) c(Mean=mean(v), SD=sd(v), N=length(v)) ) )
(10,30] (30,50] (50,70] (70,90] (90,110]
Mean 16.28571 39.66667 61.6667 85.66667 99.00000
SD 7.93125 5.50757 9.2376 6.65833 1.41421
N 7.00000 3.00000 3.0000 3.00000 2.00000
If you wanted it in long format then the melt
function in package reshape2 is useful, and I noticed that the breaks vector needed a lowest argument that picked up the items below 10:
> melt( do.call(cbind, by(x$v1,
cut( x$v1 , breaks=c(-Inf, 10,30,50,70,90,110,130,150,170,190),
include.lowest=TRUE ),
function(v) c(Mean=mean(v), SD=sd(v), N=length(v)) ) ) )
Var1 Var2 value
1 Mean [-Inf,10] 5.34375
2 SD [-Inf,10] 2.90283
3 N [-Inf,10] 32.00000
4 Mean (10,30] 16.28571
5 SD (10,30] 7.93125
6 N (10,30] 7.00000
7 Mean (30,50] 39.66667
8 SD (30,50] 5.50757
9 N (30,50] 3.00000
10 Mean (50,70] 61.66667
11 SD (50,70] 9.23760
12 N (50,70] 3.00000
13 Mean (70,90] 85.66667
14 SD (70,90] 6.65833
15 N (70,90] 3.00000
16 Mean (90,110] 99.00000
17 SD (90,110] 1.41421
18 N (90,110] 2.00000
Here's a data.table
solution:
require(data.table)
x.dt <- data.table(x)
rbindlist(lapply(v, function(i) {
lbls <- paste0(c(">", "<="), i)
x.dt[, grp := as.character(factor(v1 > i, levels=c(TRUE, FALSE), labels=lbls))]
x.dt[, as.list(c(v = i, mean = mean(v1),
sd = sd(v1), length = length(v1))), by = grp]
}))
# grp v mean sd length
# 1: <=10 10 5.343750 2.902828 32
# 2: >10 10 48.500000 32.505656 18
# 3: <=20 20 6.216216 3.552270 37
# 4: >20 20 62.615385 26.809633 13
# 5: <=30 30 7.307692 5.907862 39
# 6: >30 30 69.000000 23.870484 11
# 7: <=50 50 9.619048 10.245647 42
# 8: >50 50 80.000000 17.270950 8
# 9: <=70 70 13.088889 16.555447 45
# 10: >70 70 91.000000 8.717798 5
# 11: <=90 90 17.625000 23.951747 48
# 12: >90 90 99.000000 1.414214 2
# 13: <=110 110 20.880000 28.456655 50
# 14: <=130 130 20.880000 28.456655 50
# 15: <=150 150 20.880000 28.456655 50
# 16: <=170 170 20.880000 28.456655 50
# 17: <=190 190 20.880000 28.456655 50
I would use reshape2 and plyr,
library(plyr) ; library(reshape2)
v=c(10,20,30,50,70,90,110,130,150,170,190) # added 20 for checking
# create new dichotomy id variable
l1 = llply(v, function(.v) transform(x, test = x[["v1"]] <= .v))
names(l1) = v # name list elements for later reference
all = melt(l1, id=c("v1","test")) # merge data.frames together
# summarise the data by groups
results = ddply(all, c("L1","test"), summarise,
mean = mean(v1), sd=sd(v1), length=length(v1))
Resulting in
arrange(results, as.numeric(L1))
L1 test mean sd length
1 10 FALSE 48.500000 32.505656 18
2 10 TRUE 5.343750 2.902828 32
3 20 FALSE 62.615385 26.809633 13
4 20 TRUE 6.216216 3.552270 37
5 30 FALSE 69.000000 23.870484 11
6 30 TRUE 7.307692 5.907862 39
7 50 FALSE 80.000000 17.270950 8
8 50 TRUE 9.619048 10.245647 42
9 70 FALSE 91.000000 8.717798 5
10 70 TRUE 13.088889 16.555447 45
11 90 FALSE 99.000000 1.414214 2
12 90 TRUE 17.625000 23.951747 48
13 110 TRUE 20.880000 28.456655 50
14 130 TRUE 20.880000 28.456655 50
15 150 TRUE 20.880000 28.456655 50
16 170 TRUE 20.880000 28.456655 50
17 190 TRUE 20.880000 28.456655 50
You can simply use summary
here with lapply
do.call(rbind,lapply( v,function(x) {
v1.inf <- summary(v1[v1<=x])
v1.sup <- summary(v1[v1>x])
m <- as.matrix(rbind(v1.inf,v1.sup))
rownames(m) <- paste(x,c('inf','sup'),sep='')
m
}))
Min. 1st Qu. Median Mean 3rd Qu. Max.
10inf 1 2.75 5.0 5.344 8.00 9
10sup 11 17.50 42.5 48.500 75.25 100
20inf 1 4.00 6.0 6.216 9.00 15
20sup 25 40.00 67.0 62.620 89.00 100
30inf 1 4.00 6.0 7.308 9.00 30
30sup 34 48.00 67.0 69.000 89.50 100
50inf 1 4.25 7.0 9.619 9.00 45
50sup 51 67.00 83.5 80.000 92.00 100
70inf 1 5.00 8.0 13.090 11.00 67
70sup 78 89.00 90.0 91.000 98.00 100
90inf 1 5.00 8.0 17.620 12.00 90
90sup 98 98.50 99.0 99.000 99.50 100
110inf 1 5.00 8.5 20.880 22.50 100
110sup NA NA NA NaN NA NA
130inf 1 5.00 8.5 20.880 22.50 100
130sup NA NA NA NaN NA NA
150inf 1 5.00 8.5 20.880 22.50 100
150sup NA NA NA NaN NA NA
170inf 1 5.00 8.5 20.880 22.50 100
170sup NA NA NA NaN NA NA
190inf 1 5.00 8.5 20.880 22.50 100
190sup NA NA NA NaN NA NA