To check if a value in a row is repeated between groups in R

喜夏-厌秋 提交于 2020-12-06 06:06:11

问题


I have a dataset containing purchases made by different households across different retailers. For example:

Example Dataset

Using dput()

structure(list(household_code = c(76, 76, 76, 76, 76, 76, 76, 
76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 126, 
126, 126, 126, 126, 126, 126, 126, 126), trip_code_uc = c(1032497498L, 
1025776063L, 1029419047L, 1030418100L, 1029502602L, 1034153056L, 
1027035051L, 1027533991L, 1033515804L, 1032998207L, 1032066227L, 
1028192785L, 1033419039L, 1028730296L, 1027388499L, 1030652869L, 
1025638394L, 1034032718L, 1034032718L, 1025678520L, 1029490031L, 
1029898838L, 1028024134L, 1030324171L, 1031983761L, 1031983761L, 
1033767148L, 1023953965L, 1030954113L, 1030954113L, 1027392968L
), purchase_date = structure(c(1L, 2L, 23L, 50L, 52L, 74L, 77L, 
94L, 148L, 158L, 176L, 179L, 196L, 211L, 224L, 246L, 271L, 286L, 
286L, 309L, 329L, 346L, 2L, 9L, 46L, 46L, 50L, 58L, 66L, 66L, 
68L), .Label = c("2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04", 
"2012-01-05", "2012-01-06", "2012-01-07", "2012-01-08", "2012-01-09", 
"2012-01-10", "2012-01-11", "2012-01-12", "2012-01-13", "2012-01-14", 
"2012-01-15", "2012-01-16", "2012-01-17", "2012-01-18", "2012-01-19", 
"2012-01-20", "2012-01-21", "2012-01-22", "2012-01-23", "2012-01-24", 
"2012-01-25", "2012-01-26", "2012-01-27", "2012-01-28", "2012-01-29", 
"2012-01-30", "2012-01-31", "2012-02-01", "2012-02-02", "2012-02-03", 
"2012-02-04", "2012-02-05", "2012-02-06", "2012-02-07", "2012-02-08", 
"2012-02-09", "2012-02-10", "2012-02-11", "2012-02-12", "2012-02-13", 
"2012-02-14", "2012-02-15", "2012-02-16", "2012-02-17", "2012-02-18", 
"2012-02-19", "2012-02-20", "2012-02-21", "2012-02-22", "2012-02-23", 
"2012-02-24", "2012-02-25", "2012-02-26", "2012-02-27", "2012-02-28", 
"2012-02-29", "2012-03-01", "2012-03-02", "2012-03-03", "2012-03-04", 
"2012-03-05", "2012-03-06", "2012-03-07", "2012-03-08", "2012-03-09", 
"2012-03-10", "2012-03-11", "2012-03-12", "2012-03-13", "2012-03-14", 
"2012-03-15", "2012-03-16", "2012-03-17", "2012-03-18", "2012-03-19", 
"2012-03-20", "2012-03-21", "2012-03-22", "2012-03-23", "2012-03-24", 
"2012-03-25", "2012-03-26", "2012-03-27", "2012-03-28", "2012-03-29", 
"2012-03-30", "2012-03-31", "2012-04-01", "2012-04-02", "2012-04-03", 
"2012-04-04", "2012-04-05", "2012-04-06", "2012-04-07", "2012-04-08", 
"2012-04-09", "2012-04-10", "2012-04-11", "2012-04-12", "2012-04-13", 
"2012-04-14", "2012-04-15", "2012-04-16", "2012-04-17", "2012-04-18", 
"2012-04-19", "2012-04-20", "2012-04-21", "2012-04-22", "2012-04-23", 
"2012-04-24", "2012-04-25", "2012-04-26", "2012-04-27", "2012-04-28", 
"2012-04-29", "2012-04-30", "2012-05-01", "2012-05-02", "2012-05-03", 
"2012-05-04", "2012-05-05", "2012-05-06", "2012-05-07", "2012-05-08", 
"2012-05-09", "2012-05-10", "2012-05-11", "2012-05-12", "2012-05-13", 
"2012-05-14", "2012-05-15", "2012-05-16", "2012-05-17", "2012-05-18", 
"2012-05-19", "2012-05-20", "2012-05-21", "2012-05-22", "2012-05-23", 
"2012-05-24", "2012-05-25", "2012-05-26", "2012-05-27", "2012-05-28", 
"2012-05-29", "2012-05-30", "2012-05-31", "2012-06-01", "2012-06-02", 
"2012-06-03", "2012-06-04", "2012-06-05", "2012-06-06", "2012-06-07", 
"2012-06-08", "2012-06-09", "2012-06-10", "2012-06-11", "2012-06-12", 
"2012-06-13", "2012-06-14", "2012-06-15", "2012-06-16", "2012-06-17", 
"2012-06-18", "2012-06-19", "2012-06-20", "2012-06-21", "2012-06-22", 
"2012-06-23", "2012-06-24", "2012-06-25", "2012-06-26", "2012-06-27", 
"2012-06-28", "2012-06-29", "2012-06-30", "2012-07-01", "2012-07-02", 
"2012-07-03", "2012-07-04", "2012-07-05", "2012-07-06", "2012-07-07", 
"2012-07-08", "2012-07-09", "2012-07-10", "2012-07-11", "2012-07-12", 
"2012-07-13", "2012-07-14", "2012-07-15", "2012-07-16", "2012-07-17", 
"2012-07-18", "2012-07-19", "2012-07-20", "2012-07-21", "2012-07-22", 
"2012-07-23", "2012-07-24", "2012-07-25", "2012-07-26", "2012-07-27", 
"2012-07-28", "2012-07-29", "2012-07-30", "2012-07-31", "2012-08-01", 
"2012-08-02", "2012-08-03", "2012-08-04", "2012-08-05", "2012-08-06", 
"2012-08-07", "2012-08-08", "2012-08-09", "2012-08-10", "2012-08-11", 
"2012-08-12", "2012-08-13", "2012-08-14", "2012-08-15", "2012-08-16", 
"2012-08-17", "2012-08-18", "2012-08-19", "2012-08-20", "2012-08-21", 
"2012-08-22", "2012-08-23", "2012-08-24", "2012-08-25", "2012-08-26", 
"2012-08-27", "2012-08-28", "2012-08-29", "2012-08-30", "2012-08-31", 
"2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04", "2012-09-05", 
"2012-09-06", "2012-09-07", "2012-09-08", "2012-09-09", "2012-09-10", 
"2012-09-11", "2012-09-12", "2012-09-13", "2012-09-14", "2012-09-15", 
"2012-09-16", "2012-09-17", "2012-09-18", "2012-09-19", "2012-09-20", 
"2012-09-21", "2012-09-22", "2012-09-23", "2012-09-24", "2012-09-25", 
"2012-09-26", "2012-09-27", "2012-09-28", "2012-09-29", "2012-09-30", 
"2012-10-01", "2012-10-02", "2012-10-03", "2012-10-04", "2012-10-05", 
"2012-10-06", "2012-10-07", "2012-10-08", "2012-10-09", "2012-10-10", 
"2012-10-11", "2012-10-12", "2012-10-13", "2012-10-14", "2012-10-15", 
"2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19", "2012-10-20", 
"2012-10-21", "2012-10-22", "2012-10-23", "2012-10-24", "2012-10-25", 
"2012-10-26", "2012-10-27", "2012-10-28", "2012-10-29", "2012-10-30", 
"2012-10-31", "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04", 
"2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08", "2012-11-09", 
"2012-11-10", "2012-11-11", "2012-11-12", "2012-11-13", "2012-11-14", 
"2012-11-15", "2012-11-16", "2012-11-17", "2012-11-18", "2012-11-19", 
"2012-11-20", "2012-11-21", "2012-11-22", "2012-11-23", "2012-11-24", 
"2012-11-25", "2012-11-26", "2012-11-27", "2012-11-28", "2012-11-29", 
"2012-11-30", "2012-12-01", "2012-12-02", "2012-12-03", "2012-12-04", 
"2012-12-05", "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09", 
"2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13", "2012-12-14", 
"2012-12-15", "2012-12-16", "2012-12-17", "2012-12-18", "2012-12-19", 
"2012-12-20", "2012-12-21", "2012-12-22", "2012-12-23", "2012-12-24", 
"2012-12-25", "2012-12-26", "2012-12-27", "2012-12-28", "2012-12-29"
), class = "factor"), retailer_code = c(11024, 11024, 11024, 
11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 
11024, 11024, 11024, 11024, 2353, 11024, 11024, 2353, 11024, 
11024, 63882, 650, 89960, 89960, 650, 89960, 89960, 89960, 650
), Overall_Brand = structure(c(19L, 74L, 19L, 48L, 19L, 48L, 
19L, 19L, 19L, 48L, 48L, 31L, 46L, 31L, 31L, 48L, 74L, 31L, 74L, 
19L, 31L, 19L, 48L, 48L, 31L, 31L, 48L, 31L, 31L, 48L, 48L), .Label = c("ABUNDANCE", 
"ALPEN", "AMERICAN BREAKFAST ", "ANNIE'S HOMEGROWN", "ARWHD MLS", 
"BARBARA'S", "BEAR NAKED", "BEAR RIVER", "BOB'S RED MILL", "BOKOMO COUNTRY", 
"BREAKFAST CHOICE", "BREAKFAST ZONE", "BROOKFARM MACADAMIA ", 
"BRUGGEN", "BUCKEYE HEROES", "CADIA", "CASCADIAN FARM ", "CHOCOLATE SPOONERS", 
"CTL BR", "DORSET", "ENJOY LIFE PERKY'S CRUNCHY FLX", "EREWHON", 
"F-FACTOR", "FAMILIA", "FIELD DAY", "FINAX", "FLEURY FLAKES MARC ANDRE", 
"FOOD FOR LIFE EZEKIEL 50", "FORRELLI ", "GEFEN KING", "GENERAL MILLS", 
"GERONIMO PEYTON HILLIS REDZONE", "GLUCERNA", "GLUTINO", "GOLDEN FOODS", 
"GRANVITA PUFFY'S", "GREENBRIER INT INC-NBL CRN FLK", "HEALTH VALLEY", 
"HODGSON MILL", "HOME FAVORITE", "HOSTESS TOASTED OATS", "HSP", 
"ISABEL'S WAY ", "JASPER", "JUSTIN VERLANDER'S FASTBLL FLK", 
"KASHI", "KAY'S NATURALS BETTER BALANCE", "Kellogg", "KIND", 
"KOZY SHACK READY GRAINS", "KRETSCHMER", "LADY LIBERTY", "LIEBER'S", 
"LIVING INTENTIONS SPRFD CRL", "LOVE GROWN FOODS", "MAIZORO", 
"MANISCHEWITZ", "MILL SELECT ", "MOTHER'S", "MULTIGRAIN SPNRS", 
"NASH BROTHERS", "NATURE'S PATH", "NESTLE", "NEW ENGLAND NATURALS", 
"NEWMAN'S OWN SWEET ENGH WHT PF", "NUTRISYSTEM NOURISH", "NUTRITIOUS LIVING", 
"PAMPA ", "PASKESZ CHOCO RIOS", "PBLC LB BR-NBL HY NT TSD OT DM", 
"PEACE", "POST", "PROTEIN PLUS CORN FLAKE", "QUAKER", "RALSTON", 
"RLSTN", "SALLY'S ", "SCRUMPTIOUS SPELNDID CRLS CBB", "SEITENBACHER MUESLI", 
"SIMPLY FIBER", "SKINNER'S RAISIN BRAN", "SORIANA WHOLE BRAN", 
"STREIT'S FRUIT & NUT MUESLI", "SUNBELT", "SWAD", "T. ABRAHAM'S", 
"TAANUG CORN FLAKES", "TASTY", "TEMMY'S", "THE", "THREE SISTERS", 
"TIKISS SWEETENED PUFFD WHL WHT", "TRU ROOTS", "VOTTO'S", "WEETABIX", 
"WHOLESOME GOODNESS", "WILD ROOTS ANCIENT ORIGINS", "WONDER CORN FLAKES", 
"YOG ACTIVE"), class = "factor")), row.names = c(90609L, 222436L, 
90606L, 688592L, 90607L, 688593L, 90605L, 90604L, 90608L, 668330L, 
321377L, 567447L, 945385L, 567445L, 567443L, 892854L, 583186L, 
567446L, 583185L, 168138L, 567444L, 60086L, 698120L, 698127L, 
3933L, 809409L, 698129L, 15286L, 15284L, 698116L, 319455L), class = "data.frame")

I have separate variables for the household, the trip ID of a particular purchase instance, the retailer, and the brand purchased. One household may purchase more than one brand in a single trip. I want to calculate the repeat purchases across two consecutive trips. For example, if a household purchases both General Mills and Kellogg in trip 1 and only General Mills in trip 2, then the General Mills purchase in trip 2 counts as a repeat.

Example output:

Example Output

Now, this is easy when I have only one purchase per trip. I do it with the following code, using rleid():

# Number the rows inside each run of consecutive identical
# (Household, Brand) pairs. rleid() is presumably data.table::rleid -- it is
# not a base R function, so confirm that data.table is attached.
e1 <- transform(
  e1,
  brand_last_dum = ave(
    as.character(Brand),
    rleid(Household, Brand),
    FUN = seq_along
  )
)

However, when there is more than one purchase in a trip, it doesn't work. The data is arranged by purchase date, and multiple purchases of the same brand within one trip may be treated as separate purchases when calculating the repeat in the next row. Please help.


回答1:


2020-11-29 UPDATE: Adjusted code to count second+ purchases of same brand in same shopping trip as repeat purchases. Given the additional requirement I was able to eliminate one of the joins which had been previously added to treat all purchases of a given brand within a trip consistently.

library(dplyr)

# Flag repeat purchases. A purchase row is a repeat when the same household
# bought the same brand earlier in the same trip or on the immediately
# preceding trip. `data` must provide household_code, trip_code_uc,
# purchase_date and Overall_Brand.
result <- data %>%
  # One row per trip, numbered in chronological order within each household.
  # Trips sharing a purchase_date are ordered by ascending trip_code_uc.
  distinct(household_code, purchase_date, trip_code_uc) %>%
  arrange(household_code, purchase_date, trip_code_uc) %>%
  group_by(household_code) %>%
  mutate(trip_seq = row_number()) %>%
  ungroup() %>%
  # Attach trip_seq to every purchase row. The keys are spelled out so the
  # join cannot silently change if another shared column appears later.
  left_join(
    data, .,
    by = c("household_code", "purchase_date", "trip_code_uc")
  ) %>%
  # Put each household/brand's purchases next to each other in trip order so
  # lag() looks at the previous purchase of that brand.
  arrange(household_code, Overall_Brand, trip_seq) %>%
  mutate(
    repeat_purchase = if_else(
      household_code == lag(household_code) &
        Overall_Brand == lag(Overall_Brand) &
        (trip_seq == lag(trip_seq) + 1L | trip_seq == lag(trip_seq)),
      TRUE, FALSE,
      # The very first row has no predecessor, so the comparison is NA.
      missing = FALSE
    )
  ) %>%
  arrange(household_code, purchase_date, Overall_Brand)

# Columns 1, 3, 5, 6, 7 are household_code, purchase_date, Overall_Brand,
# trip_seq and repeat_purchase.
print(as.data.frame(result[,c(1,3,5,6,7)]))

...and the output:

> print(as.data.frame(result[,c(1,3,5,6,7)]))
   household_code purchase_date Overall_Brand trip_seq repeat_purchase
1              76    2012-01-01        CTL BR        1           FALSE
2              76    2012-01-02        QUAKER        2           FALSE
3              76    2012-01-23        CTL BR        3           FALSE
4              76    2012-02-19       Kellogg        4           FALSE
5              76    2012-02-21        CTL BR        5           FALSE
6              76    2012-03-14       Kellogg        6           FALSE
7              76    2012-03-17        CTL BR        7           FALSE
8              76    2012-04-03        CTL BR        8            TRUE
9              76    2012-05-27        CTL BR        9            TRUE
10             76    2012-06-06       Kellogg       10           FALSE
11             76    2012-06-24       Kellogg       11            TRUE
12             76    2012-06-27 GENERAL MILLS       12           FALSE
13             76    2012-07-14         KASHI       13           FALSE
14             76    2012-07-29 GENERAL MILLS       14           FALSE
15             76    2012-08-11 GENERAL MILLS       15            TRUE
16             76    2012-09-02       Kellogg       16           FALSE
17             76    2012-09-27        QUAKER       17           FALSE
18             76    2012-10-12 GENERAL MILLS       18           FALSE
19             76    2012-10-12        QUAKER       18            TRUE
20             76    2012-11-04        CTL BR       19           FALSE
21             76    2012-11-24 GENERAL MILLS       20           FALSE
22             76    2012-12-11        CTL BR       21           FALSE
23            126    2012-01-02       Kellogg        1           FALSE
24            126    2012-01-09       Kellogg        2            TRUE
25            126    2012-02-15 GENERAL MILLS        3           FALSE
26            126    2012-02-15 GENERAL MILLS        3            TRUE
27            126    2012-02-19       Kellogg        4           FALSE
28            126    2012-02-27 GENERAL MILLS        5           FALSE
29            126    2012-03-06 GENERAL MILLS        6            TRUE
30            126    2012-03-06       Kellogg        6           FALSE
31            126    2012-03-08       Kellogg        7            TRUE
> 

Prior version included below to maintain relevance of comments

2020-11-28 UPDATE: After receiving the updated data that includes purchase_date, we altered our solution to use this information to generate trip_seq. We assume that if there are multiple trip_code_uc values in a single day, the shopping experiences occur in ascending order of trip_code_uc.

We use dplyr to define distinct trip identifiers for each household and assign a trip sequence. We then use the trip sequence to evaluate whether the same brand was purchased in two consecutive shopping trips after merging it with the original data.

library(dplyr)

# Flag purchases of a brand that was also bought on the household's
# immediately preceding trip. Several purchases of one brand within a single
# trip all receive the same flag. `data` must provide household_code,
# trip_code_uc, purchase_date and Overall_Brand.
result <- data %>%
  # One row per trip, numbered in chronological order within each household.
  # Trips sharing a purchase_date are ordered by ascending trip_code_uc.
  distinct(household_code, purchase_date, trip_code_uc) %>%
  arrange(household_code, purchase_date, trip_code_uc) %>%
  group_by(household_code) %>%
  mutate(trip_seq = row_number()) %>%
  ungroup() %>%
  left_join(
    data, .,
    by = c("household_code", "purchase_date", "trip_code_uc")
  ) %>%
  # Collapse to one row per trip and brand. trip_code_uc is kept so the
  # flags can be joined back per trip; keying only on the date would
  # duplicate rows when a household makes two trips on the same day and
  # buys the same brand on both.
  distinct(
    household_code, purchase_date, trip_code_uc, trip_seq, Overall_Brand
  ) %>%
  arrange(household_code, Overall_Brand, purchase_date, trip_seq) %>%
  mutate(
    repeat_purchase = if_else(
      household_code == lag(household_code) &
        Overall_Brand == lag(Overall_Brand) &
        trip_seq == lag(trip_seq) + 1L,
      TRUE, FALSE,
      # The very first row has no predecessor, so the comparison is NA.
      missing = FALSE
    )
  ) %>%
  # Spread the per-trip, per-brand flag back onto every purchase row.
  left_join(
    data, .,
    by = c(
      "household_code", "purchase_date", "trip_code_uc", "Overall_Brand"
    )
  ) %>%
  arrange(household_code, purchase_date, Overall_Brand)

# Columns 1, 3, 5, 6, 7 are household_code, purchase_date, Overall_Brand,
# trip_seq and repeat_purchase.
print(as.data.frame(result[,c(1,3,5,6,7)]))

...and the output:

> print(as.data.frame(result[,c(1,3,5,6,7)]))
   household_code purchase_date Overall_Brand trip_seq repeat_purchase
1              76    2012-01-01        CTL BR        1           FALSE
2              76    2012-01-02        QUAKER        2           FALSE
3              76    2012-01-23        CTL BR        3           FALSE
4              76    2012-02-19       Kellogg        4           FALSE
5              76    2012-02-21        CTL BR        5           FALSE
6              76    2012-03-14       Kellogg        6           FALSE
7              76    2012-03-17        CTL BR        7           FALSE
8              76    2012-04-03        CTL BR        8            TRUE
9              76    2012-05-27        CTL BR        9            TRUE
10             76    2012-06-06       Kellogg       10           FALSE
11             76    2012-06-24       Kellogg       11            TRUE
12             76    2012-06-27 GENERAL MILLS       12           FALSE
13             76    2012-07-14         KASHI       13           FALSE
14             76    2012-07-29 GENERAL MILLS       14           FALSE
15             76    2012-08-11 GENERAL MILLS       15            TRUE
16             76    2012-09-02       Kellogg       16           FALSE
17             76    2012-09-27        QUAKER       17           FALSE
18             76    2012-10-12 GENERAL MILLS       18           FALSE
19             76    2012-10-12        QUAKER       18            TRUE
20             76    2012-11-04        CTL BR       19           FALSE
21             76    2012-11-24 GENERAL MILLS       20           FALSE
22             76    2012-12-11        CTL BR       21           FALSE
23            126    2012-01-02       Kellogg        1           FALSE
24            126    2012-01-09       Kellogg        2            TRUE
25            126    2012-02-15 GENERAL MILLS        3           FALSE
26            126    2012-02-15 GENERAL MILLS        3           FALSE
27            126    2012-02-19       Kellogg        4           FALSE
28            126    2012-02-27 GENERAL MILLS        5           FALSE
29            126    2012-03-06 GENERAL MILLS        6            TRUE
30            126    2012-03-06       Kellogg        6           FALSE
31            126    2012-03-08       Kellogg        7            TRUE

The sequence number is important because if we simply sort by Household, Brand and Trip ID, we can't tell whether the next trip number is truly the "next" purchase, as illustrated by Household 126's purchases of the Kellogg brand, which is purchased on trip sequences 1, 2, 4, 6, and 7. Only purchases on trips 2 and 7 should be counted as repeat purchases in consecutive trips, per the request in the OP.




回答2:


On second thought, I think things can be much easier if we introduce a function like this

#' Count positions within each streak of elements satisfying a condition
#'
#' @param x A vector handed to `fcond`.
#' @param fcond A function, or anything `rlang::as_function()` accepts (such
#'   as a one-sided formula), that maps `x` to a logical vector without `NA`.
#' @return An integer vector: 1, 2, 3, ... along every run of consecutive
#'   `TRUE` results and 0 wherever the result is `FALSE`.
conditional_count <- function(x, fcond) {
  fcond <- rlang::as_function(fcond)
  hit <- fcond(x)
  if (!is.logical(hit) || anyNA(hit)) {
    stop("`fcond` must return a logical vector without NA values",
         call. = FALSE)
  }
  runs <- rle(hit)
  # Start from an integer copy so the result is integer even when no element
  # satisfies the condition (assigning into the logical vector left it
  # logical in that case).
  out <- hit
  storage.mode(out) <- "integer"
  out[hit] <- sequence(runs$lengths[runs$values])
  out
}

fcond needs to be a function that applies to x and returns only TRUE or FALSE. The output of this function is a sequence for each streak of TRUEs and 0 otherwise. For example,

> conditional_count(c(1,1,2,3,3,3,5), duplicated)
[1] 0 1 0 0 1 2 0

We can then achieve what you want in two steps. First, generate a rleid for each trip. Then, for each Household and Brand, we count each trip id that is exactly the same as or just one plus the id for the previous trip. The first condition indicates that one trip has two purchases of the same brand. The second condition indicates that the same brand is found on the last trip. Both suggest that the brand is a repeat as per your requirements. For example, if a brand occurs for trips 1,1,2,3,3,5,6, then the count will be 0,1,2,3,4,0,1. Then, the code is just

library(dplyr)

# Count, per household and brand, how long the current streak of repeat
# purchases is (0 means the purchase is not a repeat).
e1 %>%
  group_by(Household) %>%
  # rleid() is exported by data.table, not dplyr, so it is namespace-qualified
  # here; with only dplyr attached the bare call cannot be found.
  mutate(trip_seq = data.table::rleid(`Trip ID`)) %>%
  group_by(Household, Brand) %>%
  mutate(
    # A purchase continues a streak when its trip number equals the previous
    # one for this brand or exceeds it by exactly one. The default of -1 makes
    # the first purchase of a group fail the test, since trip numbers start
    # at 1.
    Repeat = conditional_count(trip_seq, ~ . - lag(., default = -1L) < 2L),
    trip_seq = NULL
  )

Output

# A tibble: 31 x 5
# Groups:   Household, Brand [7]
   Household  `Trip ID` Retailer Brand         Repeat
       <dbl>      <int>    <dbl> <fct>          <int>
 1        76 1032497498    11024 CTL BR             0
 2        76 1025776063    11024 QUAKER             0
 3        76 1029419047    11024 CTL BR             0
 4        76 1030418100    11024 Kellogg            0
 5        76 1029502602    11024 CTL BR             0
 6        76 1034153056    11024 Kellogg            0
 7        76 1027035051    11024 CTL BR             0
 8        76 1027533991    11024 CTL BR             1
 9        76 1033515804    11024 CTL BR             2
10        76 1032998207    11024 Kellogg            0
11        76 1032066227    11024 Kellogg            1
12        76 1028192785    11024 GENERAL MILLS      0
13        76 1033419039    11024 KASHI              0
14        76 1028730296    11024 GENERAL MILLS      0
15        76 1027388499    11024 GENERAL MILLS      1
16        76 1030652869    11024 Kellogg            0
17        76 1025638394     2353 QUAKER             0
18        76 1034032718    11024 GENERAL MILLS      0
19        76 1034032718    11024 QUAKER             1
20        76 1025678520     2353 CTL BR             0
21        76 1029490031    11024 GENERAL MILLS      0
22        76 1029898838    11024 CTL BR             0
23       126 1028024134    63882 Kellogg            0
24       126 1030324171      650 Kellogg            1
25       126 1031983761    89960 GENERAL MILLS      0
26       126 1031983761    89960 GENERAL MILLS      1
27       126 1033767148      650 Kellogg            0
28       126 1023953965    89960 GENERAL MILLS      0
29       126 1030954113    89960 GENERAL MILLS      1
30       126 1030954113    89960 Kellogg            0
31       126 1027392968      650 Kellogg            1



回答3:


Although I have upvoted the solution proposed by Len Greski, I feel the code can be shortened like this (there is no need to join).

EDIT On second thoughts I am proposing a solution using dense_rank instead of cur_group_id

library(dplyr)

# Flag purchases whose brand was also bought by the same household earlier in
# the same trip or on the immediately preceding trip.
df_result <- df %>%
  # Assumes purchase_date sorts chronologically (ISO-formatted dates stored in
  # a factor with sorted levels, as in the sample data) -- confirm for other
  # inputs.
  arrange(purchase_date, trip_code_uc) %>%
  group_by(household_code) %>%
  # Number each household's trips in order of appearance. The rows are already
  # sorted, so this follows the numeric order of trip_code_uc within a day,
  # whereas ranking the pasted string would order the codes lexicographically.
  mutate(
    shop_id = cumsum(!duplicated(paste(purchase_date, trip_code_uc)))
  ) %>%
  group_by(household_code, Overall_Brand) %>%
  # A gap of 0 means same trip and 1 means consecutive trip; the first
  # purchase of a brand has an NA gap, which %in% treats as FALSE.
  mutate(cond = (shop_id - lag(shop_id)) %in% c(0L, 1L)) %>%
  ungroup() %>%
  select(-shop_id)

#Check for true rows
> df_result %>% filter(cond == T)
# A tibble: 9 x 6
  household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
           <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
1            126   1030324171 2012-01-09              650 Kellogg       TRUE 
2            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
3            126   1030954113 2012-03-06            89960 GENERAL MILLS TRUE 
4            126   1027392968 2012-03-08              650 Kellogg       TRUE 
5             76   1027533991 2012-04-03            11024 CTL BR        TRUE 
6             76   1033515804 2012-05-27            11024 CTL BR        TRUE 
7             76   1032066227 2012-06-24            11024 Kellogg       TRUE 
8             76   1027388499 2012-08-11            11024 GENERAL MILLS TRUE 
9             76   1034032718 2012-10-12            11024 QUAKER        TRUE 

Note That I have generated shop_id keeping in mind that two different purchases can be made by one customer on one day.

Earlier proposed solution

library(tidyverse)

# Flag purchases whose brand was also bought by the same household in the
# same or the immediately preceding trip, using group ids as trip numbers.
df %>%
  arrange(purchase_date, trip_code_uc) %>%
  group_by(household_code, purchase_date, trip_code_uc) %>%
  mutate(shop_id = cur_group_id()) %>%
  group_by(household_code, Overall_Brand) %>%
  mutate(brand_seq = cur_group_id()) %>%
  group_by(household_code, brand_seq) %>%
  mutate(
    # The first purchase of a brand has no predecessor; its NA comparison is
    # turned into FALSE.
    cond = coalesce(
      shop_id == lag(shop_id) | shop_id == lag(shop_id) + 1,
      FALSE
    )
  ) %>%
  ungroup() %>%
  select(-c(shop_id, brand_seq))

# A tibble: 31 x 6
   household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
            <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
 1             76   1032497498 2012-01-01            11024 CTL BR        FALSE
 2             76   1025776063 2012-01-02            11024 QUAKER        FALSE
 3            126   1028024134 2012-01-02            63882 Kellogg       FALSE
 4            126   1030324171 2012-01-09              650 Kellogg       TRUE 
 5             76   1029419047 2012-01-23            11024 CTL BR        FALSE
 6            126   1031983761 2012-02-15            89960 GENERAL MILLS FALSE
 7            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
 8             76   1030418100 2012-02-19            11024 Kellogg       FALSE
 9            126   1033767148 2012-02-19              650 Kellogg       FALSE
10             76   1029502602 2012-02-21            11024 CTL BR        FALSE
# ... with 21 more rows

If we check with TRUE rows only

# A tibble: 9 x 6
  household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
           <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
1            126   1030324171 2012-01-09              650 Kellogg       TRUE 
2            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
3            126   1030954113 2012-03-06            89960 GENERAL MILLS TRUE 
4            126   1027392968 2012-03-08              650 Kellogg       TRUE 
5             76   1027533991 2012-04-03            11024 CTL BR        TRUE 
6             76   1033515804 2012-05-27            11024 CTL BR        TRUE 
7             76   1032066227 2012-06-24            11024 Kellogg       TRUE 
8             76   1027388499 2012-08-11            11024 GENERAL MILLS TRUE 
9             76   1034032718 2012-10-12            11024 QUAKER        TRUE 

In this solution, group IDs aren't renumbered for every household. If you specifically want to restart the numbering at every household, this code may help:

# Same flag as above, but trip and brand ids are assigned separately inside
# each household's slice of the data, so numbering restarts per household.
df %>%
  group_split(household_code) %>%
  purrr::map(function(household_rows) {
    household_rows %>%
      arrange(purchase_date, trip_code_uc) %>%
      group_by(purchase_date, trip_code_uc) %>%
      mutate(shop_id = cur_group_id()) %>%
      group_by(Overall_Brand) %>%
      mutate(brand_seq = cur_group_id())
  }) %>%
  bind_rows() %>%
  group_by(household_code, brand_seq) %>%
  mutate(
    # The first purchase of a brand has no predecessor; its NA comparison is
    # turned into FALSE.
    cond = coalesce(
      shop_id == lag(shop_id) | shop_id == lag(shop_id) + 1,
      FALSE
    )
  ) %>%
  ungroup()

# A tibble: 31 x 8
   household_code trip_code_uc purchase_date retailer_code Overall_Brand shop_id brand_seq cond 
            <dbl>        <int> <fct>                 <dbl> <fct>           <int>     <int> <lgl>
 1             76   1032497498 2012-01-01            11024 CTL BR              1         1 FALSE
 2             76   1025776063 2012-01-02            11024 QUAKER              2         5 FALSE
 3             76   1029419047 2012-01-23            11024 CTL BR              3         1 FALSE
 4             76   1030418100 2012-02-19            11024 Kellogg             4         4 FALSE
 5             76   1029502602 2012-02-21            11024 CTL BR              5         1 FALSE
 6             76   1034153056 2012-03-14            11024 Kellogg             6         4 FALSE
 7             76   1027035051 2012-03-17            11024 CTL BR              7         1 FALSE
 8             76   1027533991 2012-04-03            11024 CTL BR              8         1 TRUE 
 9             76   1033515804 2012-05-27            11024 CTL BR              9         1 TRUE 
10             76   1032998207 2012-06-06            11024 Kellogg            10         4 FALSE
# ... with 21 more rows


来源:https://stackoverflow.com/questions/65053932/to-check-if-a-value-in-a-row-is-repeated-between-groups-in-r

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!