A simpler way to achieve a frequency count with mean, sum, length and sd in R

问题

I've been tasked with creating frequency tables with statistical summaries. My goal is to create a data frame that can be easily exported to Excel. Most of this could be done in SQL using stored procedures, but I decided to do it in R. I'm still learning R, so I might be doing it the long way. This is a follow-on question to "getting-r-frequency-counts-for-all-possible-answers".

Given

    # Sample data: ten rows with an Id, two class columns (ClassA, ClassB) and
    # two questions (R and S); NA marks a missing entry.
    Id <- c(1,2,3,4,5,6,7,8,9,10)
    ClassA <- c(1,NA,3,1,1,2,1,4,5,3)
    ClassB <- c(2,1,1,3,3,2,1,1,3,3)
    R <- c(1,2,3,NA,9,2,4,5,6,7)
    S <- c(3,7,NA,9,5,8,7,NA,7,6)
    df <- data.frame(Id,ClassA,ClassB,R,S)

    # Answer categories to tabulate: 0 to 10, plus NA as a category of its own.
    ZeroTenNAScale <- c(0:10,NA);

    # Frequency table for R. factor() with explicit levels keeps answers that
    # never occur (count 0), and exclude=NULL keeps NA as a level so that missing
    # answers are counted too. The two columns are renamed to answer / value.
    R.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$R,levels=ZeroTenNAScale,exclude=NULL))));
    # The answer column comes back as a factor; turn its labels into numbers.
    R.freq[, 1] <- as.numeric(as.character( R.freq[, 1] ))
    # Tag every row with the question it belongs to.
    R.freq <- cbind(question='R',R.freq)

    # Same three steps for S.
    S.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$S,levels=ZeroTenNAScale,exclude=NULL))));
    S.freq[, 1] <- as.numeric(as.character( S.freq[, 1] ))
    S.freq <- cbind(question='S',S.freq)

    # Summary statistics for R; NA entries are ignored in every one of them
    # (length is the number of non-missing answers).
    R.mean = mean(df$R, na.rm = TRUE) 
    R.length = sum(!is.na(df$R)) 
    R.sd = sd(df$R, na.rm = TRUE) 
    R.sum = sum(df$R, na.rm = TRUE)

    # Summary statistics for S, computed the same way.
    S.mean = mean(df$S, na.rm = TRUE) 
    S.length = sum(!is.na(df$S)) 
    S.sd = sd(df$S, na.rm = TRUE) 
    S.sum = sum(df$S, na.rm = TRUE)

    # Append the S statistics to S.freq, one row per statistic, with the
    # statistic's name in the answer column. cbind() of two strings and a number
    # builds a character matrix, so each appended value is stored as text.
    S.row <- cbind('S','sum',as.numeric(S.sum))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    S.row <- cbind('S','length',as.numeric(S.length))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    S.row <- cbind('S','mean',as.numeric(S.mean))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    S.row <- cbind('S','sd',as.numeric(S.sd))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    # Append the R statistics to R.freq in the same order: sum, length, mean, sd.
    R.row <- cbind('R','sum',as.numeric(R.sum))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    R.row <- cbind('R','length',as.numeric(R.length))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    R.row <- cbind('R','mean',as.numeric(R.mean))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    R.row <- cbind('R','sd',as.numeric(R.sd))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    # Stack both questions (R first, then S) and prepend a constant filter
    # column, then print the combined table.
    result <- rbind(R.freq,S.freq)
    result <- cbind(filter='None',result)
    result

I get

   filter question answer            value
1    None        R      0                0
2    None        R      1                1
3    None        R      2                2
4    None        R      3                1
5    None        R      4                1
6    None        R      5                1
7    None        R      6                1
8    None        R      7                1
9    None        R      8                0
10   None        R      9                1
11   None        R     10                0
12   None        R   <NA>                1
13   None        R    sum               39
14   None        R length                9
15   None        R   mean 4.33333333333333
16   None        R     sd 2.64575131106459
17   None        S      0                0
18   None        S      1                0
19   None        S      2                0
20   None        S      3                1
21   None        S      4                0
22   None        S      5                1
23   None        S      6                1
24   None        S      7                3
25   None        S      8                1
26   None        S      9                1
27   None        S     10                0
28   None        S   <NA>                2
29   None        S    sum               52
30   None        S length                8
31   None        S   mean              6.5
32   None        S     sd  1.8516401995451

This is pretty much what I'm looking for. The next step, as I see it, is to start wrapping this in some functions to simplify the code before I start adding similar result sets filtered by ClassA=1, ClassA=2 ... ClassA=NA, then ClassB=1, ClassB=2 ... ClassB=NA. Is there a much simpler way of doing this?

The new code, after studying the answers from Ernest A and Imo, is:

    # https://stackoverflow.com/questions/36790376/a-simpler-way-to-achieve-a-frequency-count-with-mean-sum-length-and-sd-in-r/36794422#36794422

    # Frequency counts and summary statistics for one question scored 0 to 10.
    #
    # Argument:
    #   x  numeric vector of answers; NA marks a missing answer.
    #
    # Returns a named numeric vector with 16 entries: the counts of the answers
    # 0 to 10, the count of NA (a value outside 0:10 is not a factor level, so it
    # is counted in this bucket as well), followed by sum, length, mean and sd.
    # All four statistics are computed over the non-missing answers only.
    summaryStatistics <- function(x) {
        answered <- x[!is.na(x)]
        # Explicit levels keep answers that never occur (count 0);
        # useNA = 'always' adds the NA bucket even when nothing is missing.
        counts <- table(factor(x, levels = 0:10), useNA = 'always')
        c(counts,
          sum = sum(answered),
          # Count only the answers actually given, so that sum / length equals
          # mean; length(x) would also count the missing answers.
          length = length(answered),
          mean = mean(answered),
          sd = sd(answered))
    }

    # Test data: ten rows with an Id, two class columns and two questions
    # (R and S); NA marks a missing entry.
    Id <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    ClassA <- c(1, NA, 3, 1, 1, 2, 1, 4, 5, 3)
    ClassB <- c(2, 1, 1, 3, 3, 2, 1, 1, 3, 3)
    R <- c(1, 2, 3, NA, 9, 2, 4, 5, 6, 7)
    S <- c(3, 7, NA, 9, 5, 8, 7, NA, 7, 6)
    df <- data.frame(Id = Id, ClassA = ClassA, ClassB = ClassB, R = R, S = S)

    # Build the long-format result step by step. local() keeps the intermediate
    # objects out of the workspace, so only `result` is created.
    result <- local({
        # One named summary vector per question.
        perQuestion <- lapply(df[c('R', 'S')], summaryStatistics)

        # Bind the vectors into a matrix: one row per answer/statistic and one
        # column per question.
        statsMatrix <- simplify2array(perQuestion)

        # Unfold the matrix into one row per (answer, question) pair.
        long <- as.data.frame(as.table(statsMatrix))
        names(long) <- c('answer', 'question', 'value')

        # Reorder the columns to question, answer, value and prepend the filter.
        long <- long[, c('question', 'answer', 'value')]
        cbind(filter = 'None', long)
    })

    # Print the result.
    result

This is much simpler and makes my other task of training our team much easier. Thanks to Ernest A and Imo.

My next question in relation to my understanding of R is "Using vectors in R to change the output of a function".

回答1:

Yes, it definitely can be simplified. Typically you would use a summary function such as

# Frequency counts plus summary statistics for one vector of answers.
#
# Arguments:
#   x       vector of answers; NA marks a missing answer.
#   levels  the answer values to tabulate, e.g. 0:10.
#
# Returns a named numeric vector: one count per entry of `levels`, the count
# of NA, then sum, length, mean and sd. sum, mean and sd ignore missing
# answers; length is length(x), so it counts the missing answers as well.
smry <- function(x, levels) {
  non_missing <- na.omit(x)
  # Explicit levels keep answers that never occur; the NA bucket is always shown.
  counts <- table(factor(x, levels = levels), useNA = "always", exclude = NULL)
  stats <- c(
    sum = sum(non_missing),
    length = length(x),
    mean = mean(non_missing),
    sd = sqrt(var(non_missing))
  )
  c(counts, stats)
}

then apply it to the different subsets of the data

> lapply(df[c('R', 'S')], smry, 0:10)
$R
        0         1         2         3         4         5         6         7 
 0.000000  1.000000  2.000000  1.000000  1.000000  1.000000  1.000000  1.000000 
        8         9        10      <NA>       sum    length      mean        sd 
 0.000000  1.000000  0.000000  1.000000 39.000000 10.000000  4.333333  2.645751 

$S
       0        1        2        3        4        5        6        7 
 0.00000  0.00000  0.00000  1.00000  0.00000  1.00000  1.00000  3.00000 
       8        9       10     <NA>      sum   length     mean       sd 
 1.00000  1.00000  0.00000  2.00000 52.00000 10.00000  6.50000  1.85164

If you absolutely have to put everything in a data frame

> as.data.frame(as.table(simplify2array(lapply(df[c('R', 'S')], smry, 0:10))))
     Var1 Var2      Freq
1       0    R  0.000000
2       1    R  1.000000
3       2    R  2.000000
4       3    R  1.000000
5       4    R  1.000000
6       5    R  1.000000
7       6    R  1.000000
8       7    R  1.000000
9       8    R  0.000000
10      9    R  1.000000
11     10    R  0.000000
12   <NA>    R  1.000000
13    sum    R 39.000000
14 length    R 10.000000
15   mean    R  4.333333
16     sd    R  2.645751
17      0    S  0.000000
18      1    S  0.000000
19      2    S  0.000000
20      3    S  1.000000
21      4    S  0.000000
22      5    S  1.000000
23      6    S  1.000000
24      7    S  3.000000
25      8    S  1.000000
26      9    S  1.000000
27     10    S  0.000000
28   <NA>    S  2.000000
29    sum    S 52.000000
30 length    S 10.000000
31   mean    S  6.500000
32     sd    S  1.851640

and then simply change the column names / add columns as you need.

回答2:

One thing you can do to reduce your code size is to wrap the summary stats in a function:

# Summary statistics of a numeric vector, ignoring missing values.
#
# Argument:
#   x  numeric vector; NA marks a missing value.
#
# Returns a named numeric vector with the entries sum, length, mean and sd,
# in that order. length is the number of non-missing values.
myStats <- function(x) {
  c(
    sum = sum(x, na.rm = TRUE),
    length = sum(!is.na(x)),
    mean = mean(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE)
  )
}

This returns a named vector of summary stats ordered as you have in your output. You can then rbind the returned values as well as the names to your frequency table:

# Summarise the R answers, then append one labelled row per statistic to the
# frequency table; the names of the returned vector become the answer labels.
R.stats <- myStats(df[["R"]])
rbind(
  R.freq,
  data.frame(question = 'R', answer = names(R.stats), value = R.stats)
)

来源：https://stackoverflow.com/questions/36790376/a-simpler-way-to-achieve-a-frequency-count-with-mean-sum-length-and-sd-in-r

标签

statistics

frequency

summary