Question
Here I import the data and do some manipulation on it (this is likely not where the issue/fix lies).
The first two lines set the parameters for my cut.
lab_var_num <- 0:24
times_var <- seq(0, 2500, by = 100)
all_files_ls <- read_csv("~/Desktop/bioinformatic_work/log_parse_files/sorted_by_habitat/all_trap/all_files_la_selva_log.csv")
# Eliminate bad data and capture it in a separate dataframe; the "bad" rows are kept in all_files_ls_bad.
all_files_ls_bad <- subset(all_files_ls, temp < 10 | temp > 50)
all_files_ls <- subset(all_files_ls, temp > 10 & temp < 50)
# Convert our character data to dates, then change to the POSIXct data type.
# all_dates <- strptime(all_files_ls$date, format = "%m/%d/%Y")
# Data needs to be put into a consistent format of %m/%d/%Y before you can coerce it
# into POSIXct (or any other date type); otherwise it will throw errors.
all_files_ls$date <- strptime(all_files_ls$date, format = "%m/%d/%Y")
all_files_ls$date <- as.POSIXct(all_files_ls$date)
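One quick sanity check here (since strptime returns NA for any string that doesn't match the format):
# Count rows whose date failed to parse; unparseable dates become NA.
sum(is.na(all_files_ls$date))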
# Create wet and dry season data sets.
all_files_ls_w <- subset(all_files_ls, (date >= "2015-05-01" & date <= "2015-12-31") | (date >= "2016-05-01" & date <= "2016-12-31"))
all_files_ls_s <- subset(all_files_ls, (date >= "2015-01-01" & date <= "2015-04-30") | (date >= "2016-01-01" & date <= "2016-04-30"))
# Subset into canopy and understory dataframes.
all_files_ls_s_c <- subset(all_files_ls_s, canopy_understory == "c" | canopy_understory == "C")
all_files_ls_s_u <- subset(all_files_ls_s, canopy_understory == "u" | canopy_understory == "U")
all_files_ls_w_c <- subset(all_files_ls_w, canopy_understory == "c" | canopy_understory == "C")
all_files_ls_w_u <- subset(all_files_ls_w, canopy_understory == "u" | canopy_understory == "U")
all_files_ls_s_c_summ <- all_files_ls_s_c %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_s_u_summ <- all_files_ls_s_u %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_w_c_summ <- all_files_ls_w_c %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_w_u_summ <- all_files_ls_w_u %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
Here are my calls to cut:
all_files_ls_s_c_summ$time <- cut(as.numeric(all_files_ls_s_c_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_s_u_summ$time <- cut(as.numeric(all_files_ls_s_u_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_w_c_summ$time <- cut(as.numeric(all_files_ls_w_c_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_w_u_summ$time <- cut(as.numeric(all_files_ls_w_u_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
When I examine the data resulting from the cut call, I get many more than the 24 categories I want.
Here is some example data:
trap serial_no file_name canopy_understory date time temp humidity
1 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 600 20.1 <NA>
2 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 800 25.5 <NA>
3 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1000 29.0 <NA>
4 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1200 28.0 <NA>
5 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1400 28.5 <NA>
6 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1601 27.5 <NA>
7 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1803 25.5 <NA>
8 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 2001 23.5 <NA>
9 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 2200 22.5 <NA>
10 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-29 000 21.5 <NA>
11 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0159 23.6 <NA>
12 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0359 24.1 <NA>
13 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0559 24.1 <NA>
14 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0759 24.6 <NA>
15 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0959 24.6 <NA>
16 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1159 26.1 <NA>
17 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1359 26.6 <NA>
18 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1559 25.6 <NA>
19 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1759 24.1 <NA>
20 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1959 24.1 <NA>
This sample data may be inadequate: the dataset is too large to share in full, and its high variability may be part of the issue.
Here is one of the dataframes resulting from the cut:
"","time","standard_deviation","mean"
"1","0",0.864956566100052,23.5574468085106
"2","0",1.14440510857225,22.81103515625
"3","0",0.984904980117555,22.2286831812256
"4","0",1.08678357585325,22.3990654205607
"5","1",1.05145037946718,22.0769704433498
"6","1",1.12960402993109,22.3836754643206
"7","2",1.03725039998279,21.7559322033898
"8","2",1.1068790873174,21.9357894736842
"9","3",1.12097157902533,21.6717980295567
"10","3",1.19621923944834,22.00751953125
"11","4",1.07458677721861,21.4380704041721
"12","4",1.13677253853809,21.6116959064328
"13","5",1.17900504899409,21.4315270935961
"14","5",1.28653071505367,21.79990234375
"15","6",1.20354620166699,21.9286831812256
"16","6",1.31676108631382,22.2322429906542
"17","7",1.86260704732764,23.7655596555966
"18","7",1.77861521566506,24.20419921875
"19","8",2.46883855937697,25.7301298701299
"20","8",2.46920498327612,26.1562427071179
"21","9",2.68395795782085,27.1479115479115
"22","0",0.949097628789142,23.3553191489362
"23","9",2.79945910162021,27.6413533834586
"24","10",2.79930128034239,27.7833981841764
"25","10",2.90435941493285,28.4350606394708
"26","11",2.79704441144441,28.2748466257669
"27","11",2.84178392019108,28.8
"28","12",2.88487423989003,28.5626131953428
"29","12",3.09977843678832,29.2737596471885
"30","13",2.78609514613334,28.6300613496933
"31","13",2.9274394403559,29.0124410933082
"32","14",2.46471466241151,28.0413748378729
"33","14",2.64014509330527,28.5502750275027
"34","15",2.24926437332819,27.1096296296296
"35","15",2.3886068967475,27.4907634307257
"36","16",1.9467999768684,26.0171875
"37","16",1.96854340222531,26.4749174917492
"38","17",1.43673026552318,24.7727385377943
"39","17",1.49178257598373,25.1431279620853
"40","18",1.23662593572858,24.0101694915254
"41","18",1.36276616154878,24.3736434108527
"42","19",1.07197213445298,23.5255266418835
"43","1",0.99431780638411,23.0787234042553
"44","19",1.13453791853054,23.854174573055
"45","20",1.01855291267246,23.1731421121252
"46","20",1.10799364301127,23.4543743078627
"47","21",0.998989468534969,22.9889714993804
"48","21",1.0452391633029,23.2751423149905
"49","22",0.993841145023006,22.6971316818774
"50","22",1.08423014353774,22.9405524861878
"51","23",1.01856406998964,22.517843866171
"52","2",1.03074836073784,22.8872340425532
"53","3",1.10188636506543,22.7382978723404
"54","4",1.11782711780932,22.5787234042553
"55","5",1.06571756649915,22.6106382978723
"56","6",1.16909794681656,23.8127659574468
"57","7",1.28653814110936,26.2702127659574
"58","8",1.39470055539637,28.0787234042553
I'm using group_by to get summary data for each time point, and then trying to use cut so that each data point near a particular time is assigned to that time; for example, a time of 1801 should be grouped together with 1800. group_by merely puts together every data point that has an identical time, whereas what I want is to group together every nearby time point.
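To illustrate the bucketing I'm after, here is a minimal sketch on a few raw times taken from the sample above, using the times_var and lab_var_num defined earlier:
# Each raw time should snap to the label of its 100-unit bucket.
cut(c(0, 159, 1601, 1803, 2001), breaks = times_var,
    labels = lab_var_num, include.lowest = TRUE)
# expected output: 0 1 16 18 20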
I can't figure out why I'm getting 58 categories when I expect to get 24.
Answer 1:
Instead of saving parts of the data.frame as separate data frames and repeating the same operations on each, you can just group by multiple variables. You can use lubridate::month to extract the month as a number from each date (in base R you could use strptime(df$date, '%Y-%m-%d')$mon + 1), which lets you simply use ifelse to create a new grouping variable instead of cut with repeated labels (which will cause an error in R >= 3.4.0). Once you set all the grouping variables, summarizing is simple and DRY.
library(dplyr)
df %>% group_by(canopy_understory, # Group by canopy/understory factor
# Extract numeric month from date. If less than 5, make `season` "s" else "w", and group by it.
season = ifelse(lubridate::month(date) < 5, 's', 'w'),
# Cut time by 0,100,200,...,2400, and group by the factor returned.
hour = cut(time, seq(0, 2400, 100), include.lowest = TRUE)) %>%
summarise(temp_mean = mean(temp), # For each group, calc mean and sd of temp.
temp_sd = sd(temp))
#> # A tibble: 20 x 5
#> # Groups: canopy_understory, season [?]
#> canopy_understory season hour temp_mean temp_sd
#> <fctr> <chr> <fctr> <dbl> <dbl>
#> 1 c w [0,100] 21.5 NA
#> 2 c w (500,600] 20.1 NA
#> 3 c w (700,800] 25.5 NA
#> 4 c w (900,1e+03] 29.0 NA
#> 5 c w (1.1e+03,1.2e+03] 28.0 NA
#> 6 c w (1.3e+03,1.4e+03] 28.5 NA
#> 7 c w (1.6e+03,1.7e+03] 27.5 NA
#> 8 c w (1.8e+03,1.9e+03] 25.5 NA
#> 9 c w (2e+03,2.1e+03] 23.5 NA
#> 10 c w (2.1e+03,2.2e+03] 22.5 NA
#> 11 u s (100,200] 23.6 NA
#> 12 u s (300,400] 24.1 NA
#> 13 u s (500,600] 24.1 NA
#> 14 u s (700,800] 24.6 NA
#> 15 u s (900,1e+03] 24.6 NA
#> 16 u s (1.1e+03,1.2e+03] 26.1 NA
#> 17 u s (1.3e+03,1.4e+03] 26.6 NA
#> 18 u s (1.5e+03,1.6e+03] 25.6 NA
#> 19 u s (1.7e+03,1.8e+03] 24.1 NA
#> 20 u s (1.9e+03,2e+03] 24.1 NA
Standard deviations for the sample data are NA because there's only one observation in each group, but it should work fine on larger data.
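If you'd rather have numeric hour labels like the question's lab_var_num than cut's default interval labels, here is a sketch along the same lines (assuming you want each interval labelled by its starting hour): seq(0, 2400, 100) defines 24 intervals, so labels = 0:23 supplies one unique label per interval and avoids the repeated-label error mentioned above.
df %>% group_by(canopy_understory,
                season = ifelse(lubridate::month(date) < 5, 's', 'w'),
                # Same breaks as before, but label each interval by its starting hour (0-23).
                hour = cut(time, seq(0, 2400, 100), labels = 0:23, include.lowest = TRUE)) %>%
  summarise(temp_mean = mean(temp),
            temp_sd = sd(temp))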
Data
df <- structure(list(trap = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("LS_trap_10c",
"LS_trap_10u"), class = "factor"), serial_no = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("7C000000395C1641", "9F00000039641541"
), class = "factor"), file_name = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("trap10c_7C000000395C1641_150809.csv", "trap10u_9F00000039641541_160110.csv"
), class = "factor"), canopy_understory = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("c", "u"), class = "factor"), date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("2015-05-28", "2015-05-29", "2016-01-01"
), class = "factor"), time = c(600L, 800L, 1000L, 1200L, 1400L,
1601L, 1803L, 2001L, 2200L, 0L, 159L, 359L, 559L, 759L, 959L,
1159L, 1359L, 1559L, 1759L, 1959L), temp = c(20.1, 25.5, 29,
28, 28.5, 27.5, 25.5, 23.5, 22.5, 21.5, 23.6, 24.1, 24.1, 24.6,
24.6, 26.1, 26.6, 25.6, 24.1, 24.1), humidity = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "<NA>", class = "factor")), .Names = c("trap",
"serial_no", "file_name", "canopy_understory", "date", "time",
"temp", "humidity"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"))
Source: https://stackoverflow.com/questions/43946949/use-cut-to-create-24-categories-for-a-time-variable