Question
Here I import the data and do some manipulation on it (this is likely not where the issue/fix lies).
The first two lines set the parameters for my cut.
lab_var_num <- 0:24
times_var <- seq(0, 2500, by = 100)
all_files_ls <- read_csv("~/Desktop/bioinformatic_work/log_parse_files/sorted_by_habitat/all_trap/all_files_la_selva_log.csv")
# Eliminate bad data and capture it in a separate dataframe; the "bad" rows are kept in all_files_ls_bad.
all_files_ls_bad <- subset(all_files_ls, temp < 10 | temp > 50)
all_files_ls <- subset(all_files_ls, temp > 10 & temp < 50)
# Convert our character data to dates, then change to the POSIXct data type.
# all_dates <- strptime(all_files_ls$date, format = "%m/%d/%Y")
# Data needs to be put into a consistent format of %m/%d/%Y before you can coerce it
# into POSIXct (or any other date type); otherwise it will throw errors.
all_files_ls$date <- strptime(all_files_ls$date, format = "%m/%d/%Y")
all_files_ls$date <- as.POSIXct(all_files_ls$date)
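One quick sanity check here (since strptime returns NA for any string that doesn't match the format):
# Count rows whose date failed to parse; unparseable dates become NA.
sum(is.na(all_files_ls$date))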
# Create wet and dry season data sets.
all_files_ls_w <- subset(all_files_ls, (date >= "2015-05-01" & date <= "2015-12-31") | (date >= "2016-05-01" & date <= "2016-12-31"))
all_files_ls_s <- subset(all_files_ls, (date >= "2015-01-01" & date <= "2015-04-30") | (date >= "2016-01-01" & date <= "2016-04-30"))
# Subset into canopy and understory dataframes.
all_files_ls_s_c <- subset(all_files_ls_s, canopy_understory == "c" | canopy_understory == "C")
all_files_ls_s_u <- subset(all_files_ls_s, canopy_understory == "u" | canopy_understory == "U")
all_files_ls_w_c <- subset(all_files_ls_w, canopy_understory == "c" | canopy_understory == "C")
all_files_ls_w_u <- subset(all_files_ls_w, canopy_understory == "u" | canopy_understory == "U")
all_files_ls_s_c_summ <- all_files_ls_s_c %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_s_u_summ <- all_files_ls_s_u %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_w_c_summ <- all_files_ls_w_c %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_w_u_summ <- all_files_ls_w_u %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
Here are my calls to cut:
all_files_ls_s_c_summ$time <- cut(as.numeric(all_files_ls_s_c_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_s_u_summ$time <- cut(as.numeric(all_files_ls_s_u_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_w_c_summ$time <- cut(as.numeric(all_files_ls_w_c_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_w_u_summ$time <- cut(as.numeric(all_files_ls_w_u_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
When I examine the data resulting from the cut call, I get many more than the 24 categories I want.
Here is some example data:
trap serial_no file_name canopy_understory date time temp humidity
1 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 600 20.1 <NA>
2 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 800 25.5 <NA>
3 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1000 29.0 <NA>
4 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1200 28.0 <NA>
5 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1400 28.5 <NA>
6 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1601 27.5 <NA>
7 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 1803 25.5 <NA>
8 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 2001 23.5 <NA>
9 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-28 2200 22.5 <NA>
10 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv c 2015-05-29 000 21.5 <NA>
11 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0159 23.6 <NA>
12 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0359 24.1 <NA>
13 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0559 24.1 <NA>
14 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0759 24.6 <NA>
15 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 0959 24.6 <NA>
16 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1159 26.1 <NA>
17 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1359 26.6 <NA>
18 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1559 25.6 <NA>
19 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1759 24.1 <NA>
20 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv u 2016-01-01 1959 24.1 <NA>
This sample data may be inadequate: the dataset is too large to share in full, and its high variability may be part of the issue.
Here is one of the dataframes resulting from the cut:
"","time","standard_deviation","mean"
"1","0",0.864956566100052,23.5574468085106
"2","0",1.14440510857225,22.81103515625
"3","0",0.984904980117555,22.2286831812256
"4","0",1.08678357585325,22.3990654205607
"5","1",1.05145037946718,22.0769704433498
"6","1",1.12960402993109,22.3836754643206
"7","2",1.03725039998279,21.7559322033898
"8","2",1.1068790873174,21.9357894736842
"9","3",1.12097157902533,21.6717980295567
"10","3",1.19621923944834,22.00751953125
"11","4",1.07458677721861,21.4380704041721
"12","4",1.13677253853809,21.6116959064328
"13","5",1.17900504899409,21.4315270935961
"14","5",1.28653071505367,21.79990234375
"15","6",1.20354620166699,21.9286831812256
"16","6",1.31676108631382,22.2322429906542
"17","7",1.86260704732764,23.7655596555966
"18","7",1.77861521566506,24.20419921875
"19","8",2.46883855937697,25.7301298701299
"20","8",2.46920498327612,26.1562427071179
"21","9",2.68395795782085,27.1479115479115
"22","0",0.949097628789142,23.3553191489362
"23","9",2.79945910162021,27.6413533834586
"24","10",2.79930128034239,27.7833981841764
"25","10",2.90435941493285,28.4350606394708
"26","11",2.79704441144441,28.2748466257669
"27","11",2.84178392019108,28.8
"28","12",2.88487423989003,28.5626131953428
"29","12",3.09977843678832,29.2737596471885
"30","13",2.78609514613334,28.6300613496933
"31","13",2.9274394403559,29.0124410933082
"32","14",2.46471466241151,28.0413748378729
"33","14",2.64014509330527,28.5502750275027
"34","15",2.24926437332819,27.1096296296296
"35","15",2.3886068967475,27.4907634307257
"36","16",1.9467999768684,26.0171875
"37","16",1.96854340222531,26.4749174917492
"38","17",1.43673026552318,24.7727385377943
"39","17",1.49178257598373,25.1431279620853
"40","18",1.23662593572858,24.0101694915254
"41","18",1.36276616154878,24.3736434108527
"42","19",1.07197213445298,23.5255266418835
"43","1",0.99431780638411,23.0787234042553
"44","19",1.13453791853054,23.854174573055
"45","20",1.01855291267246,23.1731421121252
"46","20",1.10799364301127,23.4543743078627
"47","21",0.998989468534969,22.9889714993804
"48","21",1.0452391633029,23.2751423149905
"49","22",0.993841145023006,22.6971316818774
"50","22",1.08423014353774,22.9405524861878
"51","23",1.01856406998964,22.517843866171
"52","2",1.03074836073784,22.8872340425532
"53","3",1.10188636506543,22.7382978723404
"54","4",1.11782711780932,22.5787234042553
"55","5",1.06571756649915,22.6106382978723
"56","6",1.16909794681656,23.8127659574468
"57","7",1.28653814110936,26.2702127659574
"58","8",1.39470055539637,28.0787234042553
I'm using group_by to get summary data for each time point, and then trying to use cut so that each data point near a particular time is assigned to that time; for example, a time of 1801 should be grouped together with 1800. group_by merely puts together every data point that has an identical time, whereas what I want is to group together every nearby time point.
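To illustrate the bucketing I'm after, here is a minimal sketch on a few raw times taken from the sample above, using the times_var and lab_var_num defined earlier:
# Each raw time should snap to the label of its 100-unit bucket.
cut(c(0, 159, 1601, 1803, 2001), breaks = times_var,
    labels = lab_var_num, include.lowest = TRUE)
# expected output: 0 1 16 18 20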
I can't figure out why I'm getting 58 categories when I expect to get 24.
Answer 1:
Instead of saving parts of the data.frame as separate data frames and repeating the same operations on each, you can just group by multiple variables. You can use lubridate::month to extract the month as a number from each date (in base R you could use strptime(df$date, '%Y-%m-%d')$mon + 1), which lets you simply use ifelse to create a new grouping variable instead of cut with repeated labels (which will cause an error in R >= 3.4.0). Once you set all the grouping variables, summarizing is simple and DRY.
library(dplyr)
df %>% group_by(canopy_understory, # Group by canopy/understory factor
# Extract numeric month from date. If less than 5, make `season` "s" else "w", and group by it.
season = ifelse(lubridate::month(date) < 5, 's', 'w'),
# Cut time by 0,100,200,...,2400, and group by the factor returned.
hour = cut(time, seq(0, 2400, 100), include.lowest = TRUE)) %>%
summarise(temp_mean = mean(temp), # For each group, calc mean and sd of temp.
temp_sd = sd(temp))
#> # A tibble: 20 x 5
#> # Groups: canopy_understory, season [?]
#> canopy_understory season hour temp_mean temp_sd
#> <fctr> <chr> <fctr> <dbl> <dbl>
#> 1 c w [0,100] 21.5 NA
#> 2 c w (500,600] 20.1 NA
#> 3 c w (700,800] 25.5 NA
#> 4 c w (900,1e+03] 29.0 NA
#> 5 c w (1.1e+03,1.2e+03] 28.0 NA
#> 6 c w (1.3e+03,1.4e+03] 28.5 NA
#> 7 c w (1.6e+03,1.7e+03] 27.5 NA
#> 8 c w (1.8e+03,1.9e+03] 25.5 NA
#> 9 c w (2e+03,2.1e+03] 23.5 NA
#> 10 c w (2.1e+03,2.2e+03] 22.5 NA
#> 11 u s (100,200] 23.6 NA
#> 12 u s (300,400] 24.1 NA
#> 13 u s (500,600] 24.1 NA
#> 14 u s (700,800] 24.6 NA
#> 15 u s (900,1e+03] 24.6 NA
#> 16 u s (1.1e+03,1.2e+03] 26.1 NA
#> 17 u s (1.3e+03,1.4e+03] 26.6 NA
#> 18 u s (1.5e+03,1.6e+03] 25.6 NA
#> 19 u s (1.7e+03,1.8e+03] 24.1 NA
#> 20 u s (1.9e+03,2e+03] 24.1 NA
Standard deviations for the sample data are NA because there's only one observation in each group, but it should work fine on larger data.
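If you'd rather have numeric hour labels like the question's lab_var_num than cut's default interval labels, here is a sketch along the same lines (assuming you want each interval labelled by its starting hour): seq(0, 2400, 100) defines 24 intervals, so labels = 0:23 supplies one unique label per interval and avoids the repeated-label error mentioned above.
df %>% group_by(canopy_understory,
                season = ifelse(lubridate::month(date) < 5, 's', 'w'),
                # Same breaks as before, but label each interval by its starting hour (0-23).
                hour = cut(time, seq(0, 2400, 100), labels = 0:23, include.lowest = TRUE)) %>%
  summarise(temp_mean = mean(temp),
            temp_sd = sd(temp))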
Data
df <- structure(list(trap = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("LS_trap_10c",
"LS_trap_10u"), class = "factor"), serial_no = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("7C000000395C1641", "9F00000039641541"
), class = "factor"), file_name = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("trap10c_7C000000395C1641_150809.csv", "trap10u_9F00000039641541_160110.csv"
), class = "factor"), canopy_understory = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("c", "u"), class = "factor"), date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("2015-05-28", "2015-05-29", "2016-01-01"
), class = "factor"), time = c(600L, 800L, 1000L, 1200L, 1400L,
1601L, 1803L, 2001L, 2200L, 0L, 159L, 359L, 559L, 759L, 959L,
1159L, 1359L, 1559L, 1759L, 1959L), temp = c(20.1, 25.5, 29,
28, 28.5, 27.5, 25.5, 23.5, 22.5, 21.5, 23.6, 24.1, 24.1, 24.6,
24.6, 26.1, 26.6, 25.6, 24.1, 24.1), humidity = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "<NA>", class = "factor")), .Names = c("trap",
"serial_no", "file_name", "canopy_understory", "date", "time",
"temp", "humidity"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"))
Source: https://stackoverflow.com/questions/43946949/use-cut-to-create-24-categories-for-a-time-variable