Building a table with values that break up a continuous variable into two groups

后端 未结 4 639
眼角桃花
眼角桃花 2021-01-19 10:31

I am looking for your help. I am trying to divide a continuous variable into two groups. Here is an example of what I am trying to do:

x=data.frame(v1=c(1,1         


        
相关标签:
4条回答
  • 2021-01-19 10:53

    The first step could have been done more economically, with an identical result, as:

    # Split the rows of `x` at v1 = 20. drop = FALSE keeps each result a data
    # frame even when `x` has a single column ("[" would otherwise return a
    # bare vector in that case).
    g1 <- x[x$v1 > 20, , drop = FALSE]
    g2 <- x[x$v1 <= 20, , drop = FALSE]
    

    But why not skip that step and do this instead:

    # Mean, SD and count of v1 for each side of the v1 > 20 split; binding the
    # by() result column-wise gives one column per group (FALSE / TRUE).
    do.call(
      cbind,
      by(
        x$v1,
        list(v1GT20 = x$v1 > 20),
        function(grp) c(Mean = mean(grp), SD = sd(grp), N = length(grp))
      )
    )
            FALSE    TRUE
    Mean  6.21622 62.6154
    SD    3.55227 26.8096
    N    37.00000 13.0000
    

    If you want to get cuts at a variety of locations then use the cut function to split and identify the groups:

    # Same per-group summary, but grouped by the intervals that cut() assigns;
    # the breaks run from 10 to 190 in steps of 20.
    do.call(
      cbind,
      by(
        x$v1,
        cut(x$v1, breaks = seq(10, 190, by = 20)),
        function(grp) c(Mean = mean(grp), SD = sd(grp), N = length(grp))
      )
    )
    
          (10,30]  (30,50] (50,70]  (70,90] (90,110]
    Mean 16.28571 39.66667 61.6667 85.66667 99.00000
    SD    7.93125  5.50757  9.2376  6.65833  1.41421
    N     7.00000  3.00000  3.0000  3.00000  2.00000
    

    If you want it in long format, the melt function in package reshape2 is useful. I also noticed that the breaks vector needed a lower bound (-Inf, together with include.lowest=TRUE) to pick up the items at or below 10:

    # Long-format version: one row per (statistic, interval) pair.
    # -Inf as the first break, with include.lowest = TRUE, captures the values
    # at or below 10 in a closed "[-Inf,10]" interval.
    # melt() is called through its namespace so the snippet does not depend on
    # reshape2 having been attached beforehand.
    reshape2::melt(
      do.call(
        cbind,
        by(
          x$v1,
          cut(
            x$v1,
            breaks = c(-Inf, 10, 30, 50, 70, 90, 110, 130, 150, 170, 190),
            include.lowest = TRUE
          ),
          function(v) c(Mean = mean(v), SD = sd(v), N = length(v))
        )
      )
    )
    
       Var1      Var2    value
    1  Mean [-Inf,10]  5.34375
    2    SD [-Inf,10]  2.90283
    3     N [-Inf,10] 32.00000
    4  Mean   (10,30] 16.28571
    5    SD   (10,30]  7.93125
    6     N   (10,30]  7.00000
    7  Mean   (30,50] 39.66667
    8    SD   (30,50]  5.50757
    9     N   (30,50]  3.00000
    10 Mean   (50,70] 61.66667
    11   SD   (50,70]  9.23760
    12    N   (50,70]  3.00000
    13 Mean   (70,90] 85.66667
    14   SD   (70,90]  6.65833
    15    N   (70,90]  3.00000
    16 Mean  (90,110] 99.00000
    17   SD  (90,110]  1.41421
    18    N  (90,110]  2.00000
    
    0 讨论(0)
  • 2021-01-19 11:01

    Here's a data.table solution:

    # NOTE(review): `x` (data frame with column v1) and `v` (vector of cut
    # points) are assumed to be defined already, as in the question - confirm.
    # library() stops with an error if data.table is missing, whereas require()
    # would only return FALSE and let the code fail later.
    library(data.table)
    x.dt <- data.table(x)
    # One pass per cut point: label each row, then summarise v1 per label.
    rbindlist(lapply(v, function(i) {
      lbls <- paste0(c(">", "<="), i)
      # := adds (or overwrites) the `grp` column of x.dt by reference.
      x.dt[, grp := as.character(
        factor(v1 > i, levels = c(TRUE, FALSE), labels = lbls)
      )]
      x.dt[
        ,
        as.list(c(v = i, mean = mean(v1), sd = sd(v1), length = length(v1))),
        by = grp
      ]
    }))
    #       grp   v      mean        sd length
    #  1:  <=10  10  5.343750  2.902828     32
    #  2:   >10  10 48.500000 32.505656     18
    #  3:  <=20  20  6.216216  3.552270     37
    #  4:   >20  20 62.615385 26.809633     13
    #  5:  <=30  30  7.307692  5.907862     39
    #  6:   >30  30 69.000000 23.870484     11
    #  7:  <=50  50  9.619048 10.245647     42
    #  8:   >50  50 80.000000 17.270950      8
    #  9:  <=70  70 13.088889 16.555447     45
    # 10:   >70  70 91.000000  8.717798      5
    # 11:  <=90  90 17.625000 23.951747     48
    # 12:   >90  90 99.000000  1.414214      2
    # 13: <=110 110 20.880000 28.456655     50
    # 14: <=130 130 20.880000 28.456655     50
    # 15: <=150 150 20.880000 28.456655     50
    # 16: <=170 170 20.880000 28.456655     50
    # 17: <=190 190 20.880000 28.456655     50
    
    0 讨论(0)
  • 2021-01-19 11:06

    I would use reshape2 and plyr,

    library(plyr)
    library(reshape2)
    # Cut points; 20 was added for checking.
    v <- c(10, 20, 30, 50, 70, 90, 110, 130, 150, 170, 190)
    # One copy of `x` per cut point, each flagged with test = (v1 <= cut point).
    l1 <- llply(v, function(.v) transform(x, test = x$v1 <= .v))
    # Name the list elements after their cut points for later reference.
    names(l1) <- v
    # Stack the per-cut-point data frames into a single one.
    all <- melt(l1, id.vars = c("v1", "test"))
    # Mean, SD and count of v1 for every (L1, test) combination.
    results <- ddply(
      all, c("L1", "test"), summarise,
      mean = mean(v1), sd = sd(v1), length = length(v1)
    )
    

    Resulting in

    # Order the summary rows by L1, converted with as.numeric() so that the
    # rows sort numerically.
    arrange(results, as.numeric(L1))
    
        L1  test      mean        sd length
    1   10 FALSE 48.500000 32.505656     18
    2   10  TRUE  5.343750  2.902828     32
    3   20 FALSE 62.615385 26.809633     13
    4   20  TRUE  6.216216  3.552270     37
    5   30 FALSE 69.000000 23.870484     11
    6   30  TRUE  7.307692  5.907862     39
    7   50 FALSE 80.000000 17.270950      8
    8   50  TRUE  9.619048 10.245647     42
    9   70 FALSE 91.000000  8.717798      5
    10  70  TRUE 13.088889 16.555447     45
    11  90 FALSE 99.000000  1.414214      2
    12  90  TRUE 17.625000 23.951747     48
    13 110  TRUE 20.880000 28.456655     50
    14 130  TRUE 20.880000 28.456655     50
    15 150  TRUE 20.880000 28.456655     50
    16 170  TRUE 20.880000 28.456655     50
    17 190  TRUE 20.880000 28.456655     50
    
    0 讨论(0)
  • 2021-01-19 11:10

    You can simply use summary() with lapply() here:

    # For each cut point in `v`, summarise the values at or below it ("inf")
    # and above it ("sup"), then stack the resulting two-row matrices.
    # NOTE(review): `v1` is used as a free variable - assumes the vector is in
    # scope (e.g. v1 <- x$v1); confirm against the question's setup.
    do.call(rbind, lapply(v, function(cutoff) {
      lower <- summary(v1[v1 <= cutoff])
      upper <- summary(v1[v1 > cutoff])
      both <- as.matrix(rbind(lower, upper))
      rownames(both) <- paste0(cutoff, c("inf", "sup"))
      both
    }))
    
           Min. 1st Qu. Median   Mean 3rd Qu. Max.
    10inf     1    2.75    5.0  5.344    8.00    9
    10sup    11   17.50   42.5 48.500   75.25  100
    20inf     1    4.00    6.0  6.216    9.00   15
    20sup    25   40.00   67.0 62.620   89.00  100
    30inf     1    4.00    6.0  7.308    9.00   30
    30sup    34   48.00   67.0 69.000   89.50  100
    50inf     1    4.25    7.0  9.619    9.00   45
    50sup    51   67.00   83.5 80.000   92.00  100
    70inf     1    5.00    8.0 13.090   11.00   67
    70sup    78   89.00   90.0 91.000   98.00  100
    90inf     1    5.00    8.0 17.620   12.00   90
    90sup    98   98.50   99.0 99.000   99.50  100
    110inf    1    5.00    8.5 20.880   22.50  100
    110sup   NA      NA     NA    NaN      NA   NA
    130inf    1    5.00    8.5 20.880   22.50  100
    130sup   NA      NA     NA    NaN      NA   NA
    150inf    1    5.00    8.5 20.880   22.50  100
    150sup   NA      NA     NA    NaN      NA   NA
    170inf    1    5.00    8.5 20.880   22.50  100
    170sup   NA      NA     NA    NaN      NA   NA
    190inf    1    5.00    8.5 20.880   22.50  100
    190sup   NA      NA     NA    NaN      NA   NA
    
    0 讨论(0)
提交回复
热议问题