Use cut to create 24 categories for a time variable

前端 未结 1 463
无人共我
无人共我 2021-01-27 10:47

Here I import the data and manipulate it (this is unlikely to be where the issue, or the fix, lies).

The first two lines set my parameters for my cut.

<
相关标签:
1条回答
  • 2021-01-27 11:32

    Instead of saving parts of the data.frame as separate files and doing the same operations on them, you can just group by multiple variables. You can use lubridate::month to extract the month as a number from each date (in base R you could use strptime(df$date, '%Y-%m-%d')$mon + 1), which lets you simply use ifelse to create a new grouping variable instead of cut with repeated labels (which will cause an error in R >= 3.4.0). Once you set all the grouping variables, summarizing is simple and DRY.

    library(dplyr)
    
    # Summarise temp per canopy/understory level, season and clock hour.
    df %>%
        group_by(
            # Group by the canopy/understory factor.
            canopy_understory,
            # Extract the numeric month from date. Months before May become "s",
            # every other month becomes "w".
            season = ifelse(lubridate::month(date) < 5, "s", "w"),
            # Bin time into 24 classes with breaks at 0, 100, 200, ..., 2400
            # (the sample values look like HHMM clock readings).
            # right = FALSE makes the bins left-closed -- [0,100), [100,200), ... --
            # so a reading taken exactly on the hour (e.g. 600) falls in the bin
            # that starts at that hour instead of the previous one.
            # include.lowest = TRUE then only closes the final bin, [2300,2400].
            # dig.lab = 4 keeps four-digit breaks out of scientific notation
            # (the default of 3 prints 1000 as "1e+03").
            hour = cut(time, seq(0, 2400, 100), right = FALSE,
                       include.lowest = TRUE, dig.lab = 4)
        ) %>%
        # For each group, calculate the mean and standard deviation of temp.
        summarise(temp_mean = mean(temp),
                  temp_sd = sd(temp))
    
    #> # A tibble: 20 x 5
    #> # Groups: canopy_understory, season [?]
    #>    canopy_understory season        hour temp_mean temp_sd
    #>               <fctr>  <chr>      <fctr>     <dbl>   <dbl>
    #>  1                 c      w     [0,100)      21.5      NA
    #>  2                 c      w   [600,700)      20.1      NA
    #>  3                 c      w   [800,900)      25.5      NA
    #>  4                 c      w [1000,1100)      29.0      NA
    #>  5                 c      w [1200,1300)      28.0      NA
    #>  6                 c      w [1400,1500)      28.5      NA
    #>  7                 c      w [1600,1700)      27.5      NA
    #>  8                 c      w [1800,1900)      25.5      NA
    #>  9                 c      w [2000,2100)      23.5      NA
    #> 10                 c      w [2200,2300)      22.5      NA
    #> 11                 u      s   [100,200)      23.6      NA
    #> 12                 u      s   [300,400)      24.1      NA
    #> 13                 u      s   [500,600)      24.1      NA
    #> 14                 u      s   [700,800)      24.6      NA
    #> 15                 u      s  [900,1000)      24.6      NA
    #> 16                 u      s [1100,1200)      26.1      NA
    #> 17                 u      s [1300,1400)      26.6      NA
    #> 18                 u      s [1500,1600)      25.6      NA
    #> 19                 u      s [1700,1800)      24.1      NA
    #> 20                 u      s [1900,2000)      24.1      NA
    

    Standard deviations for the sample data are NA because there's only one observation in each group, but it should work fine on larger data.


    Data

    # Sample data: 20 rows, 10 for each of the two traps.
    # Factor levels are spelled out explicitly so the integer codes never depend
    # on locale-specific sorting of the labels.
    df <- data.frame(
        trap = factor(
            rep(c("LS_trap_10c", "LS_trap_10u"), each = 10L),
            levels = c("LS_trap_10c", "LS_trap_10u")
        ),
        serial_no = factor(
            rep(c("7C000000395C1641", "9F00000039641541"), each = 10L),
            levels = c("7C000000395C1641", "9F00000039641541")
        ),
        file_name = factor(
            rep(c("trap10c_7C000000395C1641_150809.csv",
                  "trap10u_9F00000039641541_160110.csv"), each = 10L),
            levels = c("trap10c_7C000000395C1641_150809.csv",
                       "trap10u_9F00000039641541_160110.csv")
        ),
        canopy_understory = factor(
            rep(c("c", "u"), each = 10L),
            levels = c("c", "u")
        ),
        # date is stored as a factor of ISO date strings, not as a Date.
        date = factor(
            rep(c("2015-05-28", "2015-05-29", "2016-01-01"),
                times = c(9L, 1L, 10L)),
            levels = c("2015-05-28", "2015-05-29", "2016-01-01")
        ),
        time = c(
            600L, 800L, 1000L, 1200L, 1400L, 1601L, 1803L, 2001L, 2200L, 0L,
            159L, 359L, 559L, 759L, 959L, 1159L, 1359L, 1559L, 1759L, 1959L
        ),
        temp = c(
            20.1, 25.5, 29, 28, 28.5, 27.5, 25.5, 23.5, 22.5, 21.5,
            23.6, 24.1, 24.1, 24.6, 24.6, 26.1, 26.6, 25.6, 24.1, 24.1
        ),
        # Single-level factor whose only level is the literal string "<NA>"
        # (a label, not a missing value).
        humidity = factor(rep("<NA>", 20L), levels = "<NA>"),
        # Keep the row names as the character strings "1".."20".
        row.names = as.character(1:20)
    )
    
    0 讨论(0)
提交回复
热议问题