问题
I'm trying to follow some rules about when to group data to chart. How would I go from this data frame:
# A tibble: 11 x 8
assay year qtr invalid valid total_assays hfr predicted_inv
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 test_case 2016. 1. 2. 36. 38. 0.0350 1.33
2 test_case 2016. 2. 1. 34. 35. 0.0350 1.23
3 test_case 2016. 3. 0. 25. 25. 0.0350 0.875
4 test_case 2016. 4. 2. 23. 25. 0.0350 0.875
5 test_case 2017. 1. 1. 29. 30. 0.0350 1.05
6 test_case 2017. 2. 2. 24. 26. 0.0350 0.910
7 test_case 2017. 3. 0. 23. 23. 0.0350 0.805
8 test_case 2017. 4. 1. 20. 21. 0.0350 0.735
9 test_case 2018. 1. 2. 33. 35. 0.0350 1.23
10 test_case 2018. 2. 5. 28. 33. 0.0350 1.16
11 test_case 2018. 3. 4. 9. 13. 0.0350 0.455
To this one:
assay year qtr invalid valid total_assays hfr predicted_inv co_inv co_val co_prd_inv trend
1 test_case 2016 1 2 36 38 0.035 1.330 2 36 1.330 No
2 test_case 2016 2 1 34 35 0.035 1.225 3 70 2.555 No
3 test_case 2016 3 0 25 25 0.035 0.875 3 95 3.430 No
4 test_case 2016 4 2 23 25 0.035 0.875 5 118 4.305 Yes
5 test_case 2017 1 1 29 30 0.035 1.050 1 29 1.050 No
6 test_case 2017 2 2 24 26 0.035 0.910 3 53 1.960 No
7 test_case 2017 3 0 23 23 0.035 0.805 3 76 2.765 No
8 test_case 2017 4 1 20 21 0.035 0.735 4 96 3.500 No
9 test_case 2018 1 2 33 35 0.035 1.225 6 129 4.725 Yes
10 test_case 2018 2 5 28 33 0.035 1.155 5 28 1.155 Yes
11 test_case 2018 3 4 9 13 0.035 0.455 4 9 0.455 No
The rules are fairly simple. For each row, if the cumulative sum of either invalid or predicted_inv is 5 or greater, then trend is "Yes" and the cumulative sums of all three parameters (invalid, valid, predicted_inv) are reset and start again from the next row; otherwise trend is "No". In the end, the grouped totals (the co_* columns) are what would be charted as the trend.
I've tried some solutions using dplyr, but I keep getting errors when I try to create multiple interdependent variables at the same time.
Now I'm trying a custom function that takes just the 3 parameters as vectors, but I keep being forced to build loops... I would prefer an easy to read dplyr solution.
Here are the dputs:
# Example input: one row per assay/year/quarter with invalid and valid counts,
# the total, the hfr rate and the predicted number of invalids. All numeric
# columns are doubles; the object carries the tibble classes.
egdf1 <- structure(
  list(
    assay = rep("test_case", 11),
    year = rep(c(2016, 2017, 2018), times = c(4, 4, 3)),
    qtr = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3),
    invalid = c(2, 1, 0, 2, 1, 2, 0, 1, 2, 5, 4),
    valid = c(36, 34, 25, 23, 29, 24, 23, 20, 33, 28, 9),
    total_assays = c(38, 35, 25, 25, 30, 26, 23, 21, 35, 33, 13),
    hfr = rep(0.035, 11),
    predicted_inv = c(
      1.33, 1.225, 0.875, 0.875, 1.05, 0.91, 0.805, 0.735, 1.225, 1.155,
      0.455
    )
  ),
  names = c(
    "assay", "year", "qtr", "invalid", "valid", "total_assays", "hfr",
    "predicted_inv"
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -11L)
)
# Desired result: the input columns (year, qtr and the counts stored as
# integers) plus the running totals co_inv / co_val / co_prd_inv and the
# "Yes"/"No" trend flag. Plain data.frame, not a tibble.
egdf2 <- structure(
  list(
    assay = rep("test_case", 11),
    year = rep(c(2016L, 2017L, 2018L), times = c(4, 4, 3)),
    qtr = c(1:4, 1:4, 1:3),
    invalid = c(2L, 1L, 0L, 2L, 1L, 2L, 0L, 1L, 2L, 5L, 4L),
    valid = c(36L, 34L, 25L, 23L, 29L, 24L, 23L, 20L, 33L, 28L, 9L),
    total_assays = c(38L, 35L, 25L, 25L, 30L, 26L, 23L, 21L, 35L, 33L, 13L),
    hfr = rep(0.035, 11),
    predicted_inv = c(
      1.33, 1.225, 0.875, 0.875, 1.05, 0.91, 0.805, 0.735, 1.225, 1.155,
      0.455
    ),
    co_inv = c(2L, 3L, 3L, 5L, 1L, 3L, 3L, 4L, 6L, 5L, 4L),
    co_val = c(36L, 70L, 95L, 118L, 29L, 53L, 76L, 96L, 129L, 28L, 9L),
    co_prd_inv = c(
      1.33, 2.555, 3.43, 4.305, 1.05, 1.96, 2.765, 3.5, 4.725, 1.155, 0.455
    ),
    trend = c(
      "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes", "Yes", "No"
    )
  ),
  names = c(
    "assay", "year", "qtr", "invalid", "valid", "total_assays", "hfr",
    "predicted_inv", "co_inv", "co_val", "co_prd_inv", "trend"
  ),
  class = "data.frame",
  row.names = c(NA, -11L)
)
回答1:
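You can use the function cumsumbinning from the MESS package, which assigns rows to consecutive groups so that the cumulative sum within a group does not cross a given threshold (5 in your example). Please bear in mind that this differs from your desired output: in row 9, adding 2 to the running total of 4 would cross the threshold of 5, so row 9 starts a new group, whereas you want the reset to happen on the following row. Note also that the binning below looks only at invalid (not predicted_inv), and that trend marks the first row of each new group rather than the row on which the threshold is reached.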
library(dplyr)
library(MESS)

# Bin the rows so that the running total of `invalid` inside a bin does not
# cross 5, then accumulate the three counts within each bin.
egdf1 %>%
  group_by(group = cumsumbinning(invalid, 5)) %>%
  mutate(
    co_inv = cumsum(invalid),
    co_val = cumsum(valid),
    co_prd_inv = cumsum(predicted_inv),
    # `group` is constant inside a bin, so the difference is non-zero only on
    # a bin's first row (where lag() falls back to its default of 0). The
    # `> 1` test therefore flags the first row of every bin after the first.
    trend = ifelse(group - lag(group, default = 0) > 1, "yes", "no")
  ) %>%
  # Drop the grouping so later verbs do not silently operate per bin.
  ungroup()
Output
assay year qtr invalid valid total_assays hfr predicted_inv group co_inv co_val co_prd_inv trend
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>
1 test_case 2016 1 2 36 38 0.035 1.33 1 2 36 1.33 no
2 test_case 2016 2 1 34 35 0.035 1.23 1 3 70 2.56 no
3 test_case 2016 3 0 25 25 0.035 0.875 1 3 95 3.43 no
4 test_case 2016 4 2 23 25 0.035 0.875 1 5 118 4.30 no
5 test_case 2017 1 1 29 30 0.035 1.05 2 1 29 1.05 yes
6 test_case 2017 2 2 24 26 0.035 0.91 2 3 53 1.96 no
7 test_case 2017 3 0 23 23 0.035 0.805 2 3 76 2.76 no
8 test_case 2017 4 1 20 21 0.035 0.735 2 4 96 3.5 no
9 test_case 2018 1 2 33 35 0.035 1.23 3 2 33 1.23 yes
10 test_case 2018 2 5 28 33 0.035 1.16 4 5 28 1.16 yes
11 test_case 2018 3 4 9 13 0.035 0.455 5 4 9 0.455 yes
回答2:
A base R solution using Reduce:
# Walk the rows as (invalid, valid, predicted_inv) triples, keeping a running
# total that restarts from the current row once the previous total's invalid
# or predicted_inv component has reached 5.
cs <- Reduce(
  function(acc, nxt) {
    if (max(acc[1], acc[3]) >= 5) nxt else acc + nxt
  },
  Map(c, egdf1$invalid, egdf1$valid, egdf1$predicted_inv),
  accumulate = TRUE
)

# Stack the per-row totals into a data frame, one column per accumulated count.
co <- setNames(
  do.call(rbind.data.frame, cs),
  c("co_inv", "co_val", "co_prd_inv")
)

# A row is flagged when either accumulated total has reached the threshold.
co$trend <- ifelse(pmax(co$co_inv, co$co_prd_inv) < 5, "No", "Yes")

all.equal(cbind(egdf1, co), egdf2)
# [1] TRUE
来源:https://stackoverflow.com/questions/51936638/complex-cumulative-sum-with-double-resets