counting after and before change in value, within groups, generating new variables for each unique shift

南笙酒味 提交于 2019-12-20 02:13:36

问题


I am working to count occurrences of unique values within my groups, id. I'm looking at TF. When TF changes I want to count both forward and backwards from that point. This counting should be stored in a new variable PM#, so that PM# holds both plus and minus to each unique shift in TF. From what I've gathered I need to use rle, but I am kinda stuck.

I made this working example to illustrate my issue.

I have this data

df <- structure(list(id = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L), TF = c(NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L, 
0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, NA, NA, 0L, 0L, 1L, 0L, 0L, 1L, 
0L, 1L, 1L, 1L)), .Names = c("id", "TF"), class = "data.frame", row.names = c(NA, 
-30L))

This is the kinda data I am seeing

df[c(1:12,19:30),]
#>    id TF
#> 1   0 NA
#> 2   0  0
#> 3   0 NA
#> 4   0  0
#> 5   0  0
#> 6   0  1
#> 7   0  1
#> 8   0  1
#> 9   0 NA
#> 10  0  0
#> 11  0  0
#> 12  1 NA
#> 19  1 NA
#> 20  7 NA
#> 21  7  0
#> 22  7  0
#> 23  7  1
#> 24  7  0
#> 25  7  0
#> 26  7  1
#> 27  7  0
#> 28  7  1
#> 29  7  1
#> 30  7  1

I've started meddling with ave, cumsum and with rle, but haven't solved it this way yet.

df$PM01 <- with(df, ifelse(is.na(TF), NA, 1))
df$PM01 <- with(df, ave(PM01, TF, id, FUN=cumsum))

with(df, tapply(TF, rep(rle(id)[[2]], rle(id)[[1]]), count))

This is what I am trying to obtain,

dfa <- structure(list(id = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L), TF = c(NA, 0L, NA, 0L, 0L, 1L, 1L, 1L, NA, 0L, 
0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, NA, NA, 0L, 0L, 1L, 0L, 0L, 1L, 
0L, 1L, 1L, 1L), PM1 = c(NA, -3L, NA, -2L, -1L, 1L, 2L, 3L, NA, 
NA, NA, NA, -3L, -2L, -1L, 1L, 2L, 3L, NA, NA, -2L, -1L, 1L, 
NA, NA, NA, NA, NA, NA, NA), PM2 = c(NA, NA, NA, NA, NA, -3L, 
-2L, -1L, NA, 1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, -1L, 1L, 2L, NA, NA, NA, NA, NA), PM3 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, -2L, -1L, 1L, NA, NA, NA, NA), PM4 = c(NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, -1L, 1L, NA, NA, NA), PM5 = c(NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, -1L, 1L, 2L, 3L)), .Names = c("id", 
"TF", "PM1", "PM2", "PM3", "PM4", "PM5"), class = "data.frame", row.names = c(NA, 
-30L))

dfa[c(1:12,19:30),]
#>    id TF PM1 PM2 PM3 PM4 PM5
#> 1   0 NA  NA  NA  NA  NA  NA
#> 2   0  0  -3  NA  NA  NA  NA
#> 3   0 NA  NA  NA  NA  NA  NA
#> 4   0  0  -2  NA  NA  NA  NA
#> 5   0  0  -1  NA  NA  NA  NA
#> 6   0  1   1  -3  NA  NA  NA
#> 7   0  1   2  -2  NA  NA  NA
#> 8   0  1   3  -1  NA  NA  NA
#> 9   0 NA  NA  NA  NA  NA  NA
#> 10  0  0  NA   1  NA  NA  NA
#> 11  0  0  NA   2  NA  NA  NA
#> 12  1 NA  NA  NA  NA  NA  NA
#> 19  1 NA  NA  NA  NA  NA  NA
#> 20  7 NA  NA  NA  NA  NA  NA
#> 21  7  0  -2  NA  NA  NA  NA
#> 22  7  0  -1  NA  NA  NA  NA
#> 23  7  1   1  -1  NA  NA  NA
#> 24  7  0  NA   1  -2  NA  NA
#> 25  7  0  NA   2  -1  NA  NA
#> 26  7  1  NA  NA   1  -1  NA
#> 27  7  0  NA  NA  NA   1  -1
#> 28  7  1  NA  NA  NA  NA   1
#> 29  7  1  NA  NA  NA  NA   2
#> 30  7  1  NA  NA  NA  NA   3

回答1:


This was really a tricky one, and I'm sure the code can be further improved. However, I was able to reproduce your expected result. Please, try this approach with your production data. If OK, I will add an explanation later.

library(data.table)

tmp <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
  , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]

res <- tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
  rl == V1, PM := dn][rl == V1 + 1L, PM := up][
    , dcast(.SD, id + TF + rn ~ paste0("PM", V1), value.var = "PM")][
      df, on = .(rn, id, TF)][, -"rn"]
res
    id TF PM1 PM2 PM3 PM4 PM5
 1:  0 NA  NA  NA  NA  NA  NA
 2:  0  0  -3  NA  NA  NA  NA
 3:  0 NA  NA  NA  NA  NA  NA
 4:  0  0  -2  NA  NA  NA  NA
 5:  0  0  -1  NA  NA  NA  NA
 6:  0  1   1  -3  NA  NA  NA
 7:  0  1   2  -2  NA  NA  NA
 8:  0  1   3  -1  NA  NA  NA
 9:  0 NA  NA  NA  NA  NA  NA
10:  0  0  NA   1  NA  NA  NA
11:  0  0  NA   2  NA  NA  NA
12:  1 NA  NA  NA  NA  NA  NA
13:  1  0  -3  NA  NA  NA  NA
14:  1  0  -2  NA  NA  NA  NA
15:  1  0  -1  NA  NA  NA  NA
16:  1  1   1  NA  NA  NA  NA
17:  1  1   2  NA  NA  NA  NA
18:  1  1   3  NA  NA  NA  NA
19:  1 NA  NA  NA  NA  NA  NA
20:  7 NA  NA  NA  NA  NA  NA
21:  7  0  -2  NA  NA  NA  NA
22:  7  0  -1  NA  NA  NA  NA
23:  7  1   1  -1  NA  NA  NA
24:  7  0  NA   1  -2  NA  NA
25:  7  0  NA   2  -1  NA  NA
26:  7  1  NA  NA   1  -1  NA
27:  7  0  NA  NA  NA   1  -1
28:  7  1  NA  NA  NA  NA   1
29:  7  1  NA  NA  NA  NA   2
30:  7  1  NA  NA  NA  NA   3
    id TF PM1 PM2 PM3 PM4 PM5
# verify results are identical
identical(res, dfa)
[1] TRUE

In case of more than 9 changes per group paste0("PM", V1) should be replaced by sprintf("PM%02d",V1) in the call to dcast() to ensure the PM columns are ordered properly.

Explanation

tmp <- 
  # coerce to data.table
  setDT(df)[
    # create row id column (required for final join to get NA rows back in)
    , rn := .I][
      # ignore NA rows 
      !is.na(TF)][
        # number streaks of unique values within each group
        , rl := rleid(TF), by = id][
          # create ascending and descending counts for each streak
          # this is done once to avoid repeatedly creation of counts for each PM 
          # (slight performance gain)
          , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)]


tmp[]
    id TF rn rl up dn
 1:  0  0  2  1  1 -3
 2:  0  0  4  1  2 -2
 3:  0  0  5  1  3 -1
 4:  0  1  6  2  1 -3
 5:  0  1  7  2  2 -2
 6:  0  1  8  2  3 -1
 7:  0  0 10  3  1 -2
 8:  0  0 11  3  2 -1
 9:  1  0 13  1  1 -3
10:  1  0 14  1  2 -2
11:  1  0 15  1  3 -1
12:  1  1 16  2  1 -3
13:  1  1 17  2  2 -2
14:  1  1 18  2  3 -1
15:  7  0 21  1  1 -2
16:  7  0 22  1  2 -1
17:  7  1 23  2  1 -1
18:  7  0 24  3  1 -2
19:  7  0 25  3  2 -1
20:  7  1 26  4  1 -1
21:  7  0 27  5  1 -1
22:  7  1 28  6  1 -3
23:  7  1 29  6  2 -2
24:  7  1 30  6  3 -1
    id TF rn rl up dn

For the next step, we need the count of changes V1 within each group

tmp[, seq_len(max(rl) - 1L), by = .(id)]
   id V1
1:  0  1
2:  0  2
3:  1  1
4:  7  1
5:  7  2
6:  7  3
7:  7  4
8:  7  5

Now, we create a "cartesian join" of all possible changes with the rows of each group:

# right join with count of changes within each group
tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
  # copy descending counts to rows before the switch
  rl == V1, PM := dn][
    # copy ascending counts to rows after the switch
    rl == V1 + 1L, PM := up][]
    id TF rn rl up dn V1 PM
 1:  0  0  2  1  1 -3  1 -3
 2:  0  0  4  1  2 -2  1 -2
 3:  0  0  5  1  3 -1  1 -1
 4:  0  1  6  2  1 -3  1  1
 5:  0  1  7  2  2 -2  1  2
 6:  0  1  8  2  3 -1  1  3
 7:  0  0 10  3  1 -2  1 NA
 8:  0  0 11  3  2 -1  1 NA
 9:  0  0  2  1  1 -3  2 NA
10:  0  0  4  1  2 -2  2 NA
11:  0  0  5  1  3 -1  2 NA
12:  0  1  6  2  1 -3  2 -3
13:  0  1  7  2  2 -2  2 -2
14:  0  1  8  2  3 -1  2 -1
15:  0  0 10  3  1 -2  2  1
16:  0  0 11  3  2 -1  2  2
17:  1  0 13  1  1 -3  1 -3
18:  1  0 14  1  2 -2  1 -2
19:  1  0 15  1  3 -1  1 -1
20:  1  1 16  2  1 -3  1  1
21:  1  1 17  2  2 -2  1  2
22:  1  1 18  2  3 -1  1  3
23:  7  0 21  1  1 -2  1 -2
24:  7  0 22  1  2 -1  1 -1
25:  7  1 23  2  1 -1  1  1
26:  7  0 24  3  1 -2  1 NA
27:  7  0 25  3  2 -1  1 NA
28:  7  1 26  4  1 -1  1 NA
29:  7  0 27  5  1 -1  1 NA
30:  7  1 28  6  1 -3  1 NA
31:  7  1 29  6  2 -2  1 NA
32:  7  1 30  6  3 -1  1 NA
33:  7  0 21  1  1 -2  2 NA
34:  7  0 22  1  2 -1  2 NA
35:  7  1 23  2  1 -1  2 -1
36:  7  0 24  3  1 -2  2  1
37:  7  0 25  3  2 -1  2  2
38:  7  1 26  4  1 -1  2 NA
39:  7  0 27  5  1 -1  2 NA
40:  7  1 28  6  1 -3  2 NA
41:  7  1 29  6  2 -2  2 NA
42:  7  1 30  6  3 -1  2 NA
43:  7  0 21  1  1 -2  3 NA
44:  7  0 22  1  2 -1  3 NA
45:  7  1 23  2  1 -1  3 NA
46:  7  0 24  3  1 -2  3 -2
47:  7  0 25  3  2 -1  3 -1
48:  7  1 26  4  1 -1  3  1
49:  7  0 27  5  1 -1  3 NA
50:  7  1 28  6  1 -3  3 NA
51:  7  1 29  6  2 -2  3 NA
52:  7  1 30  6  3 -1  3 NA
53:  7  0 21  1  1 -2  4 NA
54:  7  0 22  1  2 -1  4 NA
55:  7  1 23  2  1 -1  4 NA
56:  7  0 24  3  1 -2  4 NA
57:  7  0 25  3  2 -1  4 NA
58:  7  1 26  4  1 -1  4 -1
59:  7  0 27  5  1 -1  4  1
60:  7  1 28  6  1 -3  4 NA
61:  7  1 29  6  2 -2  4 NA
62:  7  1 30  6  3 -1  4 NA
63:  7  0 21  1  1 -2  5 NA
64:  7  0 22  1  2 -1  5 NA
65:  7  1 23  2  1 -1  5 NA
66:  7  0 24  3  1 -2  5 NA
67:  7  0 25  3  2 -1  5 NA
68:  7  1 26  4  1 -1  5 NA
69:  7  0 27  5  1 -1  5 -1
70:  7  1 28  6  1 -3  5  1
71:  7  1 29  6  2 -2  5  2
72:  7  1 30  6  3 -1  5  3
    id TF rn rl up dn V1 PM

Finally, the intermediate result is reshaped from long to wide format.

res <- 
  # create a "cartesian join" of all possible changes with the rows of each group
  tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
    # copy descending counts to rows before the switch
    rl == V1, PM := dn][
      # copy ascending counts to rows after the switch
      rl == V1 + 1L, PM := up][
        # reshape from wide to long with the change count as new columns
        , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
          # join with original df to get NA rows back in
          df, on = .(rn, id, TF)][
            # omit helper column
            , -"rn"]



回答2:


I think the cartesian join is unnecessary:

library(data.table)
tmp <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
                 , `:=`(up = 1:.N, down = -.N:-1), by = .(id, rl)][
                 , `:=`(last = (rl == max(rl)) * (-down)), by = id]

up   = dcast(tmp, rn ~ rl, value.var = 'up'  , fill = 0)
down = dcast(tmp, rn ~ rl, value.var = 'down', fill = 0)
last = dcast(tmp, rn ~ rl, value.var = 'last', fill = 0)

rl.max = tmp[, max(rl)]
res = down[, 2:rl.max] + up[, 3:(rl.max+1)] + last[, 2:rl.max]

res[res == 0] = NA
res[, rn := up$rn]

setcolorder(res[df, on='rn'][,-'rn'], c('id','TF', 1:(rl.max-1)))[]
#    id TF   1   2   3   4   5
# 1:  0 NA  NA  NA  NA  NA  NA
# 2:  0  0  -3  NA  NA  NA  NA
# 3:  0 NA  NA  NA  NA  NA  NA
# 4:  0  0  -2  NA  NA  NA  NA
# 5:  0  0  -1  NA  NA  NA  NA
# 6:  0  1   1  -3  NA  NA  NA
# 7:  0  1   2  -2  NA  NA  NA
# 8:  0  1   3  -1  NA  NA  NA
# 9:  0 NA  NA  NA  NA  NA  NA
#10:  0  0  NA   1  NA  NA  NA
#11:  0  0  NA   2  NA  NA  NA
#12:  1 NA  NA  NA  NA  NA  NA
#13:  1  0  -3  NA  NA  NA  NA
#14:  1  0  -2  NA  NA  NA  NA
#15:  1  0  -1  NA  NA  NA  NA
#16:  1  1   1  NA  NA  NA  NA
#17:  1  1   2  NA  NA  NA  NA
#18:  1  1   3  NA  NA  NA  NA
#19:  1 NA  NA  NA  NA  NA  NA
#20:  7 NA  NA  NA  NA  NA  NA
#21:  7  0  -2  NA  NA  NA  NA
#22:  7  0  -1  NA  NA  NA  NA
#23:  7  1   1  -1  NA  NA  NA
#24:  7  0  NA   1  -2  NA  NA
#25:  7  0  NA   2  -1  NA  NA
#26:  7  1  NA  NA   1  -1  NA
#27:  7  0  NA  NA  NA   1  -1
#28:  7  1  NA  NA  NA  NA   1
#29:  7  1  NA  NA  NA  NA   2
#30:  7  1  NA  NA  NA  NA   3
#    id TF   1   2   3   4   5


来源:https://stackoverflow.com/questions/46541843/counting-after-and-before-change-in-value-within-groups-generating-new-variabl

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!