问题
I have a dataset containing purchases made by different households across different retailers. For eg
Example Dataset
Using dput()
structure(list(household_code = c(76, 76, 76, 76, 76, 76, 76,
76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 126,
126, 126, 126, 126, 126, 126, 126, 126), trip_code_uc = c(1032497498L,
1025776063L, 1029419047L, 1030418100L, 1029502602L, 1034153056L,
1027035051L, 1027533991L, 1033515804L, 1032998207L, 1032066227L,
1028192785L, 1033419039L, 1028730296L, 1027388499L, 1030652869L,
1025638394L, 1034032718L, 1034032718L, 1025678520L, 1029490031L,
1029898838L, 1028024134L, 1030324171L, 1031983761L, 1031983761L,
1033767148L, 1023953965L, 1030954113L, 1030954113L, 1027392968L
), purchase_date = structure(c(1L, 2L, 23L, 50L, 52L, 74L, 77L,
94L, 148L, 158L, 176L, 179L, 196L, 211L, 224L, 246L, 271L, 286L,
286L, 309L, 329L, 346L, 2L, 9L, 46L, 46L, 50L, 58L, 66L, 66L,
68L), .Label = c("2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04",
"2012-01-05", "2012-01-06", "2012-01-07", "2012-01-08", "2012-01-09",
"2012-01-10", "2012-01-11", "2012-01-12", "2012-01-13", "2012-01-14",
"2012-01-15", "2012-01-16", "2012-01-17", "2012-01-18", "2012-01-19",
"2012-01-20", "2012-01-21", "2012-01-22", "2012-01-23", "2012-01-24",
"2012-01-25", "2012-01-26", "2012-01-27", "2012-01-28", "2012-01-29",
"2012-01-30", "2012-01-31", "2012-02-01", "2012-02-02", "2012-02-03",
"2012-02-04", "2012-02-05", "2012-02-06", "2012-02-07", "2012-02-08",
"2012-02-09", "2012-02-10", "2012-02-11", "2012-02-12", "2012-02-13",
"2012-02-14", "2012-02-15", "2012-02-16", "2012-02-17", "2012-02-18",
"2012-02-19", "2012-02-20", "2012-02-21", "2012-02-22", "2012-02-23",
"2012-02-24", "2012-02-25", "2012-02-26", "2012-02-27", "2012-02-28",
"2012-02-29", "2012-03-01", "2012-03-02", "2012-03-03", "2012-03-04",
"2012-03-05", "2012-03-06", "2012-03-07", "2012-03-08", "2012-03-09",
"2012-03-10", "2012-03-11", "2012-03-12", "2012-03-13", "2012-03-14",
"2012-03-15", "2012-03-16", "2012-03-17", "2012-03-18", "2012-03-19",
"2012-03-20", "2012-03-21", "2012-03-22", "2012-03-23", "2012-03-24",
"2012-03-25", "2012-03-26", "2012-03-27", "2012-03-28", "2012-03-29",
"2012-03-30", "2012-03-31", "2012-04-01", "2012-04-02", "2012-04-03",
"2012-04-04", "2012-04-05", "2012-04-06", "2012-04-07", "2012-04-08",
"2012-04-09", "2012-04-10", "2012-04-11", "2012-04-12", "2012-04-13",
"2012-04-14", "2012-04-15", "2012-04-16", "2012-04-17", "2012-04-18",
"2012-04-19", "2012-04-20", "2012-04-21", "2012-04-22", "2012-04-23",
"2012-04-24", "2012-04-25", "2012-04-26", "2012-04-27", "2012-04-28",
"2012-04-29", "2012-04-30", "2012-05-01", "2012-05-02", "2012-05-03",
"2012-05-04", "2012-05-05", "2012-05-06", "2012-05-07", "2012-05-08",
"2012-05-09", "2012-05-10", "2012-05-11", "2012-05-12", "2012-05-13",
"2012-05-14", "2012-05-15", "2012-05-16", "2012-05-17", "2012-05-18",
"2012-05-19", "2012-05-20", "2012-05-21", "2012-05-22", "2012-05-23",
"2012-05-24", "2012-05-25", "2012-05-26", "2012-05-27", "2012-05-28",
"2012-05-29", "2012-05-30", "2012-05-31", "2012-06-01", "2012-06-02",
"2012-06-03", "2012-06-04", "2012-06-05", "2012-06-06", "2012-06-07",
"2012-06-08", "2012-06-09", "2012-06-10", "2012-06-11", "2012-06-12",
"2012-06-13", "2012-06-14", "2012-06-15", "2012-06-16", "2012-06-17",
"2012-06-18", "2012-06-19", "2012-06-20", "2012-06-21", "2012-06-22",
"2012-06-23", "2012-06-24", "2012-06-25", "2012-06-26", "2012-06-27",
"2012-06-28", "2012-06-29", "2012-06-30", "2012-07-01", "2012-07-02",
"2012-07-03", "2012-07-04", "2012-07-05", "2012-07-06", "2012-07-07",
"2012-07-08", "2012-07-09", "2012-07-10", "2012-07-11", "2012-07-12",
"2012-07-13", "2012-07-14", "2012-07-15", "2012-07-16", "2012-07-17",
"2012-07-18", "2012-07-19", "2012-07-20", "2012-07-21", "2012-07-22",
"2012-07-23", "2012-07-24", "2012-07-25", "2012-07-26", "2012-07-27",
"2012-07-28", "2012-07-29", "2012-07-30", "2012-07-31", "2012-08-01",
"2012-08-02", "2012-08-03", "2012-08-04", "2012-08-05", "2012-08-06",
"2012-08-07", "2012-08-08", "2012-08-09", "2012-08-10", "2012-08-11",
"2012-08-12", "2012-08-13", "2012-08-14", "2012-08-15", "2012-08-16",
"2012-08-17", "2012-08-18", "2012-08-19", "2012-08-20", "2012-08-21",
"2012-08-22", "2012-08-23", "2012-08-24", "2012-08-25", "2012-08-26",
"2012-08-27", "2012-08-28", "2012-08-29", "2012-08-30", "2012-08-31",
"2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04", "2012-09-05",
"2012-09-06", "2012-09-07", "2012-09-08", "2012-09-09", "2012-09-10",
"2012-09-11", "2012-09-12", "2012-09-13", "2012-09-14", "2012-09-15",
"2012-09-16", "2012-09-17", "2012-09-18", "2012-09-19", "2012-09-20",
"2012-09-21", "2012-09-22", "2012-09-23", "2012-09-24", "2012-09-25",
"2012-09-26", "2012-09-27", "2012-09-28", "2012-09-29", "2012-09-30",
"2012-10-01", "2012-10-02", "2012-10-03", "2012-10-04", "2012-10-05",
"2012-10-06", "2012-10-07", "2012-10-08", "2012-10-09", "2012-10-10",
"2012-10-11", "2012-10-12", "2012-10-13", "2012-10-14", "2012-10-15",
"2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19", "2012-10-20",
"2012-10-21", "2012-10-22", "2012-10-23", "2012-10-24", "2012-10-25",
"2012-10-26", "2012-10-27", "2012-10-28", "2012-10-29", "2012-10-30",
"2012-10-31", "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04",
"2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08", "2012-11-09",
"2012-11-10", "2012-11-11", "2012-11-12", "2012-11-13", "2012-11-14",
"2012-11-15", "2012-11-16", "2012-11-17", "2012-11-18", "2012-11-19",
"2012-11-20", "2012-11-21", "2012-11-22", "2012-11-23", "2012-11-24",
"2012-11-25", "2012-11-26", "2012-11-27", "2012-11-28", "2012-11-29",
"2012-11-30", "2012-12-01", "2012-12-02", "2012-12-03", "2012-12-04",
"2012-12-05", "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09",
"2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13", "2012-12-14",
"2012-12-15", "2012-12-16", "2012-12-17", "2012-12-18", "2012-12-19",
"2012-12-20", "2012-12-21", "2012-12-22", "2012-12-23", "2012-12-24",
"2012-12-25", "2012-12-26", "2012-12-27", "2012-12-28", "2012-12-29"
), class = "factor"), retailer_code = c(11024, 11024, 11024,
11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024,
11024, 11024, 11024, 11024, 2353, 11024, 11024, 2353, 11024,
11024, 63882, 650, 89960, 89960, 650, 89960, 89960, 89960, 650
), Overall_Brand = structure(c(19L, 74L, 19L, 48L, 19L, 48L,
19L, 19L, 19L, 48L, 48L, 31L, 46L, 31L, 31L, 48L, 74L, 31L, 74L,
19L, 31L, 19L, 48L, 48L, 31L, 31L, 48L, 31L, 31L, 48L, 48L), .Label = c("ABUNDANCE",
"ALPEN", "AMERICAN BREAKFAST ", "ANNIE'S HOMEGROWN", "ARWHD MLS",
"BARBARA'S", "BEAR NAKED", "BEAR RIVER", "BOB'S RED MILL", "BOKOMO COUNTRY",
"BREAKFAST CHOICE", "BREAKFAST ZONE", "BROOKFARM MACADAMIA ",
"BRUGGEN", "BUCKEYE HEROES", "CADIA", "CASCADIAN FARM ", "CHOCOLATE SPOONERS",
"CTL BR", "DORSET", "ENJOY LIFE PERKY'S CRUNCHY FLX", "EREWHON",
"F-FACTOR", "FAMILIA", "FIELD DAY", "FINAX", "FLEURY FLAKES MARC ANDRE",
"FOOD FOR LIFE EZEKIEL 50", "FORRELLI ", "GEFEN KING", "GENERAL MILLS",
"GERONIMO PEYTON HILLIS REDZONE", "GLUCERNA", "GLUTINO", "GOLDEN FOODS",
"GRANVITA PUFFY'S", "GREENBRIER INT INC-NBL CRN FLK", "HEALTH VALLEY",
"HODGSON MILL", "HOME FAVORITE", "HOSTESS TOASTED OATS", "HSP",
"ISABEL'S WAY ", "JASPER", "JUSTIN VERLANDER'S FASTBLL FLK",
"KASHI", "KAY'S NATURALS BETTER BALANCE", "Kellogg", "KIND",
"KOZY SHACK READY GRAINS", "KRETSCHMER", "LADY LIBERTY", "LIEBER'S",
"LIVING INTENTIONS SPRFD CRL", "LOVE GROWN FOODS", "MAIZORO",
"MANISCHEWITZ", "MILL SELECT ", "MOTHER'S", "MULTIGRAIN SPNRS",
"NASH BROTHERS", "NATURE'S PATH", "NESTLE", "NEW ENGLAND NATURALS",
"NEWMAN'S OWN SWEET ENGH WHT PF", "NUTRISYSTEM NOURISH", "NUTRITIOUS LIVING",
"PAMPA ", "PASKESZ CHOCO RIOS", "PBLC LB BR-NBL HY NT TSD OT DM",
"PEACE", "POST", "PROTEIN PLUS CORN FLAKE", "QUAKER", "RALSTON",
"RLSTN", "SALLY'S ", "SCRUMPTIOUS SPELNDID CRLS CBB", "SEITENBACHER MUESLI",
"SIMPLY FIBER", "SKINNER'S RAISIN BRAN", "SORIANA WHOLE BRAN",
"STREIT'S FRUIT & NUT MUESLI", "SUNBELT", "SWAD", "T. ABRAHAM'S",
"TAANUG CORN FLAKES", "TASTY", "TEMMY'S", "THE", "THREE SISTERS",
"TIKISS SWEETENED PUFFD WHL WHT", "TRU ROOTS", "VOTTO'S", "WEETABIX",
"WHOLESOME GOODNESS", "WILD ROOTS ANCIENT ORIGINS", "WONDER CORN FLAKES",
"YOG ACTIVE"), class = "factor")), row.names = c(90609L, 222436L,
90606L, 688592L, 90607L, 688593L, 90605L, 90604L, 90608L, 668330L,
321377L, 567447L, 945385L, 567445L, 567443L, 892854L, 583186L,
567446L, 583185L, 168138L, 567444L, 60086L, 698120L, 698127L,
3933L, 809409L, 698129L, 15286L, 15284L, 698116L, 319455L), class = "data.frame")
I have a separate variable for Household, trip ID for a particular purchase instance, retailer, and Brand they purchased. One household may purchase more than 1 brand in 1 trip. I wanted to calculate the repeat purchases in 2 consecutive trips. For example, if a household purchase General Mills and Kellogg both in trip 1 and only General mill in 2nd trip, for general mills, it will be a repeat.
Example output:
Now, it is easy when I have only 1 purchase per trip. I do it by the following code using rle()
e1 = transform(e1, brand_last_dum = ave(as.character(Brand), rleid(Household, Brand), FUN = seq_along))
However, when there is more than 1 purchase, it doesn't work. Data is arranged by the purchase date and more than 1 purchase of the same brand in a trip maybe considered as a different purchase for calculating repeat in the next row. Please Help
回答1:
2020-11-29 UPDATE: Adjusted code to count second+ purchases of same brand in same shopping trip as repeat purchases. Given the additional requirement I was able to eliminate one of the joins which had been previously added to treat all purchases of a given brand within a trip consistently.
library(dplyr)
data %>%
arrange(.,household_code,purchase_date,trip_code_uc) %>%
group_by(household_code,purchase_date) %>%
distinct(trip_code_uc) %>%
group_by(household_code) %>%
mutate(trip_seq = seq_along(household_code)) %>%
ungroup() %>%
left_join(data,.) %>%
arrange(household_code,Overall_Brand,trip_seq) %>%
mutate(repeat_purchase = if_else(household_code == lag(household_code) &
Overall_Brand == lag(Overall_Brand) &
(trip_seq == lag(trip_seq) + 1 | trip_seq == lag(trip_seq)),
TRUE,FALSE,missing=FALSE)) %>%
arrange(.,household_code,purchase_date,Overall_Brand) -> result
print(as.data.frame(result[,c(1,3,5,6,7)]))
...and the output:
> print(as.data.frame(result[,c(1,3,5,6,7)]))
household_code purchase_date Overall_Brand trip_seq repeat_purchase
1 76 2012-01-01 CTL BR 1 FALSE
2 76 2012-01-02 QUAKER 2 FALSE
3 76 2012-01-23 CTL BR 3 FALSE
4 76 2012-02-19 Kellogg 4 FALSE
5 76 2012-02-21 CTL BR 5 FALSE
6 76 2012-03-14 Kellogg 6 FALSE
7 76 2012-03-17 CTL BR 7 FALSE
8 76 2012-04-03 CTL BR 8 TRUE
9 76 2012-05-27 CTL BR 9 TRUE
10 76 2012-06-06 Kellogg 10 FALSE
11 76 2012-06-24 Kellogg 11 TRUE
12 76 2012-06-27 GENERAL MILLS 12 FALSE
13 76 2012-07-14 KASHI 13 FALSE
14 76 2012-07-29 GENERAL MILLS 14 FALSE
15 76 2012-08-11 GENERAL MILLS 15 TRUE
16 76 2012-09-02 Kellogg 16 FALSE
17 76 2012-09-27 QUAKER 17 FALSE
18 76 2012-10-12 GENERAL MILLS 18 FALSE
19 76 2012-10-12 QUAKER 18 TRUE
20 76 2012-11-04 CTL BR 19 FALSE
21 76 2012-11-24 GENERAL MILLS 20 FALSE
22 76 2012-12-11 CTL BR 21 FALSE
23 126 2012-01-02 Kellogg 1 FALSE
24 126 2012-01-09 Kellogg 2 TRUE
25 126 2012-02-15 GENERAL MILLS 3 FALSE
26 126 2012-02-15 GENERAL MILLS 3 TRUE
27 126 2012-02-19 Kellogg 4 FALSE
28 126 2012-02-27 GENERAL MILLS 5 FALSE
29 126 2012-03-06 GENERAL MILLS 6 TRUE
30 126 2012-03-06 Kellogg 6 FALSE
31 126 2012-03-08 Kellogg 7 TRUE
>
Prior version included below to maintain relevance of comments
2020-11-28 UPDATE: After receiving the updated data that includes purchase_date
, we altered our solution to use this information to generate trip_seq
. We assume that if there are multiple trip_code_uc
values in a single day, the shopping experiences occur in ascending order of trip_code_uc
.
We use dplyr
to define distinct trip identifiers for each household and assign a trip sequence. We then use the trip sequence to evaluate whether the same brand was purchased in two consecutive shopping trips after merging it with the original data.
library(dplyr)
data %>%
arrange(.,household_code,purchase_date,trip_code_uc) %>%
group_by(household_code,purchase_date) %>%
distinct(trip_code_uc) %>%
group_by(household_code) %>%
mutate(trip_seq = seq_along(household_code)) %>%
ungroup() %>%
left_join(data,.) %>%
group_by(household_code,purchase_date,trip_seq) %>%
distinct(Overall_Brand) %>%
ungroup() %>%
arrange(household_code,Overall_Brand,purchase_date,trip_seq) %>%
mutate(repeat_purchase = if_else(household_code == lag(household_code) &
Overall_Brand == lag(Overall_Brand) &
trip_seq == lag(trip_seq) + 1,
TRUE,FALSE,missing=FALSE)) %>%
left_join(data,.) -> result
result <- arrange(result,household_code,purchase_date,Overall_Brand)
print(as.data.frame(result[,c(1,3,5,6,7)]))
...and the output:
> print(as.data.frame(result[,c(1,3,5,6,7)]))
household_code purchase_date Overall_Brand trip_seq repeat_purchase
1 76 2012-01-01 CTL BR 1 FALSE
2 76 2012-01-02 QUAKER 2 FALSE
3 76 2012-01-23 CTL BR 3 FALSE
4 76 2012-02-19 Kellogg 4 FALSE
5 76 2012-02-21 CTL BR 5 FALSE
6 76 2012-03-14 Kellogg 6 FALSE
7 76 2012-03-17 CTL BR 7 FALSE
8 76 2012-04-03 CTL BR 8 TRUE
9 76 2012-05-27 CTL BR 9 TRUE
10 76 2012-06-06 Kellogg 10 FALSE
11 76 2012-06-24 Kellogg 11 TRUE
12 76 2012-06-27 GENERAL MILLS 12 FALSE
13 76 2012-07-14 KASHI 13 FALSE
14 76 2012-07-29 GENERAL MILLS 14 FALSE
15 76 2012-08-11 GENERAL MILLS 15 TRUE
16 76 2012-09-02 Kellogg 16 FALSE
17 76 2012-09-27 QUAKER 17 FALSE
18 76 2012-10-12 GENERAL MILLS 18 FALSE
19 76 2012-10-12 QUAKER 18 TRUE
20 76 2012-11-04 CTL BR 19 FALSE
21 76 2012-11-24 GENERAL MILLS 20 FALSE
22 76 2012-12-11 CTL BR 21 FALSE
23 126 2012-01-02 Kellogg 1 FALSE
24 126 2012-01-09 Kellogg 2 TRUE
25 126 2012-02-15 GENERAL MILLS 3 FALSE
26 126 2012-02-15 GENERAL MILLS 3 FALSE
27 126 2012-02-19 Kellogg 4 FALSE
28 126 2012-02-27 GENERAL MILLS 5 FALSE
29 126 2012-03-06 GENERAL MILLS 6 TRUE
30 126 2012-03-06 Kellogg 6 FALSE
31 126 2012-03-08 Kellogg 7 TRUE
The sequence number is important because if we simply sort by Household
, Brand
and Trip ID
, we can't tell whether the next trip number is truly the "next" purchase, as illustrated by Household 126 purchases of the Kellog Brand, where it is purchased on trip sequences 1, 2, 4, 6, and 7. Only purchases on trips 2 and 7 should be counted as repeat purchases in consecutive trips, per the request in the OP.
回答2:
On second thought, I think things can be much easier if we introduce a function like this
conditional_count <- function(x, fcond) {
fcond <- rlang::as_function(fcond)
x <- fcond(x)
x[x] <- with(rle(x), sequence(lengths[values]))
x
}
fcond
needs to be a function that applies to x
and returns only TRUE
or FALSE
. The output of this function is a sequence for each streak of TRUE
s and 0 otherwise. For example,
> conditional_count(c(1,1,2,3,3,3,5), duplicated)
[1] 0 1 0 0 1 2 0
We can then achieve what you want in two steps. First, generate a rleid for each trip. Then, for each Household
and Brand
, we count each trip id that is exactly the same as or just one plus the id for the previous trip. The first condition indicates that one trip has two purchases of the same brand. The second condition indicates that the same brand is found on the last trip. Both suggest that the brand is a repeat as per your requirements. For example, if a brand occurs for trips 1,1,2,3,3,5,6, then the count will be 0,1,2,3,4,0,1. Then, the code is just
library(dplyr)
e1 %>%
group_by(Household) %>%
mutate(trip_seq = rleid(`Trip ID`)) %>%
group_by(Household, Brand) %>%
mutate(
Repeat = conditional_count(trip_seq, ~. - lag(., default = -1L) < 2L),
trip_seq = NULL
)
Output
# A tibble: 31 x 5
# Groups: Household, Brand [7]
Household `Trip ID` Retailer Brand Repeat
<dbl> <int> <dbl> <fct> <int>
1 76 1032497498 11024 CTL BR 0
2 76 1025776063 11024 QUAKER 0
3 76 1029419047 11024 CTL BR 0
4 76 1030418100 11024 Kellogg 0
5 76 1029502602 11024 CTL BR 0
6 76 1034153056 11024 Kellogg 0
7 76 1027035051 11024 CTL BR 0
8 76 1027533991 11024 CTL BR 1
9 76 1033515804 11024 CTL BR 2
10 76 1032998207 11024 Kellogg 0
11 76 1032066227 11024 Kellogg 1
12 76 1028192785 11024 GENERAL MILLS 0
13 76 1033419039 11024 KASHI 0
14 76 1028730296 11024 GENERAL MILLS 0
15 76 1027388499 11024 GENERAL MILLS 1
16 76 1030652869 11024 Kellogg 0
17 76 1025638394 2353 QUAKER 0
18 76 1034032718 11024 GENERAL MILLS 0
19 76 1034032718 11024 QUAKER 1
20 76 1025678520 2353 CTL BR 0
21 76 1029490031 11024 GENERAL MILLS 0
22 76 1029898838 11024 CTL BR 0
23 126 1028024134 63882 Kellogg 0
24 126 1030324171 650 Kellogg 1
25 126 1031983761 89960 GENERAL MILLS 0
26 126 1031983761 89960 GENERAL MILLS 1
27 126 1033767148 650 Kellogg 0
28 126 1023953965 89960 GENERAL MILLS 0
29 126 1030954113 89960 GENERAL MILLS 1
30 126 1030954113 89960 Kellogg 0
31 126 1027392968 650 Kellogg 1
回答3:
Though I have upvoted Len Greski's solution proposed, yet I feel the code can be shortened like this (there is no need to join).
EDIT On second thoughts I am proposing a solution using dense_rank
instead of cur_group_id
library(dplyr)
df_result <- df %>% group_by(household_code) %>%
arrange(purchase_date, trip_code_uc) %>%
mutate(shop_id = dense_rank(paste(purchase_date, trip_code_uc)),
brand_seq = dense_rank(Overall_Brand)) %>%
group_by(household_code, brand_seq) %>%
mutate(cond = ifelse(shop_id == lag(shop_id) | shop_id == lag(shop_id)+1, TRUE, FALSE),
cond = ifelse(is.na(cond), FALSE, cond)) %>%
ungroup() %>%
select(-shop_id, -brand_seq)
#Check for true rows
> df_result %>% filter(cond == T)
# A tibble: 9 x 6
household_code trip_code_uc purchase_date retailer_code Overall_Brand cond
<dbl> <int> <fct> <dbl> <fct> <lgl>
1 126 1030324171 2012-01-09 650 Kellogg TRUE
2 126 1031983761 2012-02-15 89960 GENERAL MILLS TRUE
3 126 1030954113 2012-03-06 89960 GENERAL MILLS TRUE
4 126 1027392968 2012-03-08 650 Kellogg TRUE
5 76 1027533991 2012-04-03 11024 CTL BR TRUE
6 76 1033515804 2012-05-27 11024 CTL BR TRUE
7 76 1032066227 2012-06-24 11024 Kellogg TRUE
8 76 1027388499 2012-08-11 11024 GENERAL MILLS TRUE
9 76 1034032718 2012-10-12 11024 QUAKER TRUE
Note That I have generated shop_id
keeping in mind that two different purchases can be made by one customer on one day.
Earlier proposed solution
library(tidyverse)
df %>% group_by(household_code, purchase_date, trip_code_uc) %>%
arrange(purchase_date, trip_code_uc) %>%
mutate(shop_id = cur_group_id()) %>%
group_by(household_code, Overall_Brand) %>%
mutate(brand_seq = cur_group_id()) %>%
group_by(household_code, brand_seq) %>%
mutate(cond = ifelse(shop_id == lag(shop_id) | shop_id == lag(shop_id)+1, TRUE, FALSE),
cond = ifelse(is.na(cond), FALSE, cond)) %>%
ungroup() %>%
select(-shop_id, -brand_seq)
# A tibble: 31 x 6
household_code trip_code_uc purchase_date retailer_code Overall_Brand cond
<dbl> <int> <fct> <dbl> <fct> <lgl>
1 76 1032497498 2012-01-01 11024 CTL BR FALSE
2 76 1025776063 2012-01-02 11024 QUAKER FALSE
3 126 1028024134 2012-01-02 63882 Kellogg FALSE
4 126 1030324171 2012-01-09 650 Kellogg TRUE
5 76 1029419047 2012-01-23 11024 CTL BR FALSE
6 126 1031983761 2012-02-15 89960 GENERAL MILLS FALSE
7 126 1031983761 2012-02-15 89960 GENERAL MILLS TRUE
8 76 1030418100 2012-02-19 11024 Kellogg FALSE
9 126 1033767148 2012-02-19 650 Kellogg FALSE
10 76 1029502602 2012-02-21 11024 CTL BR FALSE
# ... with 21 more rows
If we check with TRUE
rows only
# A tibble: 9 x 6
household_code trip_code_uc purchase_date retailer_code Overall_Brand cond
<dbl> <int> <fct> <dbl> <fct> <lgl>
1 126 1030324171 2012-01-09 650 Kellogg TRUE
2 126 1031983761 2012-02-15 89960 GENERAL MILLS TRUE
3 126 1030954113 2012-03-06 89960 GENERAL MILLS TRUE
4 126 1027392968 2012-03-08 650 Kellogg TRUE
5 76 1027533991 2012-04-03 11024 CTL BR TRUE
6 76 1033515804 2012-05-27 11024 CTL BR TRUE
7 76 1032066227 2012-06-24 11024 Kellogg TRUE
8 76 1027388499 2012-08-11 11024 GENERAL MILLS TRUE
9 76 1034032718 2012-10-12 11024 QUAKER TRUE
In this solution, group_ids aren't renumbered at every household
. If you specifically want to restart numbering at every houseld, this code may help
df %>%
group_split(household_code) %>%
purrr::map_df(~.x %>% arrange(purchase_date, trip_code_uc) %>%
group_by(purchase_date, trip_code_uc) %>%
mutate(shop_id = cur_group_id()) %>%
group_by(Overall_Brand) %>%
mutate(brand_seq = cur_group_id())) %>%
group_by(household_code, brand_seq) %>%
mutate(cond = ifelse(shop_id == lag(shop_id) | shop_id == lag(shop_id)+1, TRUE, FALSE),
cond = ifelse(is.na(cond), FALSE, cond)) %>%
ungroup()
# A tibble: 31 x 8
household_code trip_code_uc purchase_date retailer_code Overall_Brand shop_id brand_seq cond
<dbl> <int> <fct> <dbl> <fct> <int> <int> <lgl>
1 76 1032497498 2012-01-01 11024 CTL BR 1 1 FALSE
2 76 1025776063 2012-01-02 11024 QUAKER 2 5 FALSE
3 76 1029419047 2012-01-23 11024 CTL BR 3 1 FALSE
4 76 1030418100 2012-02-19 11024 Kellogg 4 4 FALSE
5 76 1029502602 2012-02-21 11024 CTL BR 5 1 FALSE
6 76 1034153056 2012-03-14 11024 Kellogg 6 4 FALSE
7 76 1027035051 2012-03-17 11024 CTL BR 7 1 FALSE
8 76 1027533991 2012-04-03 11024 CTL BR 8 1 TRUE
9 76 1033515804 2012-05-27 11024 CTL BR 9 1 TRUE
10 76 1032998207 2012-06-06 11024 Kellogg 10 4 FALSE
# ... with 21 more rows
来源:https://stackoverflow.com/questions/65053932/to-check-if-a-value-in-a-row-is-repeated-between-groups-in-r