Creating a summary statistical table from a data frame

前端未结

关注

 5  882

I have the following data frame (df) of 29 observations of 5 variables:

    age   height_seca1 height_chad1 height_DL weight_alog1
1   19         1800


                      
              相关标签:


      
      
        
          5条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  孤街浪徒        
                
              
                            
                2020-11-27 15:51
              
            
            
                                                                       
Or using what you have already done, you just need to put those summaries into a list and use do.call

df <- structure(list(age = c(19L, 19L, 21L, 21L, 21L, 18L, 19L, 19L, 21L, 17L, 28L, 22L, 19L, 19L, 18L, 18L, 19L, 19L, 18L, 21L, 19L, 31L, 26L, 19L, 18L, 19L, 26L, 20L, 18L), height_seca1 = c(1800L, 1682L, 1765L, 1829L, 1706L, 1607L, 1578L, 1577L, 1666L, 1710L, 1616L, 1648L, 1569L, 1779L, 1773L, 1816L, 1766L, 1745L, 1716L, 1785L, 1850L, 1875L, 1877L, 1836L, 1825L, 1755L, 1658L, 1816L, 1755L), height_chad1 = c(1797L, 1670L, 1765L, 1833L, 1705L, 1606L, 1576L, 1575L, 1665L, 1716L, 1619L, 1644L, 1570L, 1777L, 1772L, 1809L, 1765L, 1741L, 1714L, 1783L, 1854L, 1880L, 1877L, 1837L, 1823L, 1754L, 1658L, 1818L, 1755L), height_DL = c(180L, 167L, 178L, 181L, 170L, 160L, 156L, 156L, 166L, 172L, 161L, 165L, 155L, 177L, 179L, 181L, 178L, 174L, 170L, 179L, 185L, 188L, 186L, 185L, 182L, 174L, 165L, 183L, 175L), weight_alog1 = c(70L, 69L, 80L, 74L, 103L, 76L, 50L, 61L, 52L, 65L, 66L, 58L, 55L, 55L, 70L, 81L, 77L, 76L, 71L, 64L, 71L, 95L, 106L, 100L, 85L, 79L, 69L, 84L, 67L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29"))

tmp <- do.call(data.frame, 
           list(mean = apply(df, 2, mean),
                sd = apply(df, 2, sd),
                median = apply(df, 2, median),
                min = apply(df, 2, min),
                max = apply(df, 2, max),
                n = apply(df, 2, length)))
tmp

                   mean        sd median  min  max  n
age            20.41379  3.300619     19   17   31 29
height_seca1 1737.24138 91.919474   1755 1569 1877 29
height_chad1 1736.48276 92.682492   1755 1570 1880 29
height_DL     173.37931  9.685828    175  155  188 29
weight_alog1   73.41379 14.541854     71   50  106 29


or...

data.frame(t(tmp))

             age height_seca1 height_chad1  height_DL weight_alog1
mean   20.413793   1737.24138   1736.48276 173.379310     73.41379
sd      3.300619     91.91947     92.68249   9.685828     14.54185
median 19.000000   1755.00000   1755.00000 175.000000     71.00000
min    17.000000   1569.00000   1570.00000 155.000000     50.00000
max    31.000000   1877.00000   1880.00000 188.000000    106.00000
n      29.000000     29.00000     29.00000  29.000000     29.00000

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  佛祖请我去吃肉        
                
              
                            
                2020-11-27 15:52
              
            
            
                                                                       
You can use lapply to go over each column and an anonymous function to do each of your calculations:

res <- lapply( mydf , function(x) rbind( mean = mean(x) ,
                                  sd = sd(x) ,
                                  median = median(x) ,
                                  minimum = min(x) ,
                                  maximum = max(x) ,
                                  s.size = length(x) ) )

data.frame( res )
#              age height_seca1 height_chad1  height_DL weight_alog1
#mean    20.413793   1737.24138   1736.48276 173.379310     73.41379
#sd       3.300619     91.91947     92.68249   9.685828     14.54185
#median  19.000000   1755.00000   1755.00000 175.000000     71.00000
#minimum 17.000000   1569.00000   1570.00000 155.000000     50.00000
#maximum 31.000000   1877.00000   1880.00000 188.000000    106.00000
#s.size  29.000000     29.00000     29.00000  29.000000     29.00000

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  挽巷        
                
              
                            
                2020-11-27 15:57
              
            
            
                                                                       
So far I had the same problem and I wrote ...

h <- function(x, flist){
  f <- function(f,...)f(...)
  g <- function(x, flist){vapply(flist, f , x, FUN.VALUE = numeric(1))}
  df <- as.data.frame(lapply(x, g , flist))
  row.names(df) <- names(flist)
  df

}

h(cars, flist = list(mean = mean, median = median, std_dev =  sd))


it should work with any function as specified in flist as long as the function returns a single value; i.e. it wont work with range

Note that elements of flist should be named otherwise, you'll get strange row.names for the resulting data.frame
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  轻奢々        
                
              
                            
                2020-11-27 16:00
              
            
            
                                                                       
Adding few more options for quick Exploratory Data Analysis (EDA)



1) skimr package:    

install.packages("skimr")
library(skimr)
skim(df)




2) ExPanDaR package:  

install.packages("ExPanDaR")
library(ExPanDaR)
# export data and code to a notebook
ExPanD(df, export_nb_option = TRUE)

# open a shiny app
ExPanD(df) 








3)  DescTools package:  

install.packages("DescTools")
library(DescTools)
Desc(df, plotit = TRUE)
#> ------------------------------------------------------------------------------ 
#> Describe df (data.frame):
#> 
#> data frame:  29 obs. of  5 variables
#>      29 complete cases (100.0%)
#> 
#>   Nr  ColName       Class    NAs  Levels
#>   1   age           integer  .          
#>   2   height_seca1  integer  .          
#>   3   height_chad1  integer  .          
#>   4   height_DL     integer  .          
#>   5   weight_alog1  integer  .          
#> 
#> 
#> ------------------------------------------------------------------------------ 
#> 1 - age (integer)
#> 
#>   length       n    NAs  unique     0s   mean  meanCI
#>       29      29      0       9      0  20.41   19.16
#>           100.0%   0.0%           0.0%          21.67
#>                                                      
#>      .05     .10    .25  median    .75    .90     .95
#>    18.00   18.00  19.00   19.00  21.00  26.00   27.20
#>                                                      
#>    range      sd  vcoef     mad    IQR   skew    kurt
#>    14.00    3.30   0.16    1.48   2.00   1.75    2.29
#>                                                      
#> 
#>    level  freq   perc  cumfreq  cumperc
#> 1     17     1   3.4%        1     3.4%
#> 2     18     6  20.7%        7    24.1%
#> 3     19    11  37.9%       18    62.1%
#> 4     20     1   3.4%       19    65.5%
#> 5     21     5  17.2%       24    82.8%
#> 6     22     1   3.4%       25    86.2%
#> 7     26     2   6.9%       27    93.1%
#> 8     28     1   3.4%       28    96.6%
#> 9     31     1   3.4%       29   100.0%
#> 
#> heap(?): remarkable frequency (37.9%) for the mode(s) (= 19)




Results from Desc can be saved to a Microsoft Word docx file

### RDCOMClient package is needed
install.packages("RDCOMClient", repos = "http://www.omegahat.net/R")
# or
devtools::install_github("omegahat/RDCOMClient")

# create a new word instance and insert title and contents
wrd <- GetNewWrd(header = TRUE)
DescTools::Desc(df, plotit = TRUE, wrd = wrd)


^{Created on 2020-01-17 by the reprex package (v0.3.0)}
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  时光说笑        
                
              
                            
                2020-11-27 16:07
              
            
            
                                                                       
Try with basicStats from fBasics package
> install.packages("fBasics")
> library(fBasics)
> basicStats(df)
                   age height_seca1 height_chad1   height_DL weight_alog1
nobs         29.000000    29.000000    29.000000   29.000000    29.000000
NAs           0.000000     0.000000     0.000000    0.000000     0.000000
Minimum      17.000000  1569.000000  1570.000000  155.000000    50.000000
Maximum      31.000000  1877.000000  1880.000000  188.000000   106.000000
1. Quartile  19.000000  1666.000000  1665.000000  166.000000    65.000000
3. Quartile  21.000000  1816.000000  1809.000000  181.000000    80.000000
Mean         20.413793  1737.241379  1736.482759  173.379310    73.413793
Median       19.000000  1755.000000  1755.000000  175.000000    71.000000
Sum         592.000000 50380.000000 50358.000000 5028.000000  2129.000000
SE Mean       0.612910    17.069018    17.210707    1.798613     2.700354
LCL Mean     19.158305  1702.277081  1701.228224  169.695018    67.882368
UCL Mean     21.669282  1772.205677  1771.737293  177.063602    78.945219
Variance     10.894089  8449.189655  8590.044335   93.815271   211.465517
Stdev         3.300619    91.919474    92.682492    9.685828    14.541854
Skewness      1.746597    -0.355499    -0.322915   -0.430019     0.560360
Kurtosis      2.290686    -1.077820    -1.086108   -1.040182    -0.311017

You can also subset the output to get what you want:
> basicStats(df)[c("Mean", "Stdev", "Median", "Minimum", "Maximum", "nobs"),]
              age height_seca1 height_chad1  height_DL weight_alog1
Mean    20.413793   1737.24138   1736.48276 173.379310     73.41379
Stdev    3.300619     91.91947     92.68249   9.685828     14.54185
Median  19.000000   1755.00000   1755.00000 175.000000     71.00000
Minimum 17.000000   1569.00000   1570.00000 155.000000     50.00000
Maximum 31.000000   1877.00000   1880.00000 188.000000    106.00000
nobs    29.000000     29.00000     29.00000  29.000000     29.00000

Another alternative is that you define your own function as in this post.
Update:
(I hadn't read the "My assignment requires me not use any 'extra' R packages." part)
As I said before, you can define your own function and loop over each column by using *apply family functions:
my.summary <- function(x,...){
  c(mean=mean(x, ...),
    sd=sd(x, ...),
    median=median(x, ...),
    min=min(x, ...),
    max=max(x,...), 
    n=length(x))
}

# all these calls should give you the same results.
apply(df, 2, my.summary)
sapply(df, my.summary)
do.call(cbind,lapply(df, my.summary))

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复