(Update) Add index column to data.frame based on two columns

主宰稳场 提交于 2019-12-31 04:24:08

问题


Example data.frame:

df = read.table(text = 'colA colB
                2 7
                2 7
                2 7
                2 7
                1 7
                1 7
                1 7
                89 5
                89 5
                89 5
                88 5
                88 5
                70 5
                70 5
                70 5
                69 5
                69 5
                44 4
                44 4
                44 4
                43 4
                42 4
                42 4
                41 4
                41 4
                120 1
                100 1', header = TRUE)

I need to add an index col based on colA and colB where colB shows the exact number of rows to group but it can be duplicated. colB groups rows based on colA and colA -1.

Expected output:

 colA colB index_col
    2 7       1
    2 7       1
    2 7       1
    2 7       1
    1 7       1
    1 7       1
    1 7       1
    89 5      2
    89 5      2
    89 5      2
    88 5      2
    88 5      2
    70 5      3
    70 5      3
    70 5      3
    69 5      3
    69 5      3
    44 4      4
    44 4      4 
    44 4      4
    43 4      4
    42 4      5
    42 4      5
    41 4      5
    41 4      5
   120 1      6
   100 1      7

UPDATE

How can I adapt the code that works for the above df for the same purpose but by looking at colB values grouped based on colA, colA -1 and colA -2? i.e. (instead of 2 days considering 3 days)

new_df = read.table(text = 'colA colB
        3 10
        3 10
        3 10      
        2 10
        2 10
        2 10
        2 10
        1 10
        1 10
        1 10
        90 7
        90 7
        89 7
        89 7
        89 7
        88 7
        88 7
        71 7
        71 7
        70 7
        70 7
        70 7
        69 7
        69 7
        44 5
        44 5
        44 5
        43 5
        42 5
        41 5
        41 5
        41 5
        40 5
        40 5
       120 1
       100 1', header = TRUE)

Expected output:

     colA colB index_col
        3 10      1
        3 10      1
        3 10      1
        2 10      1
        2 10      1
        2 10      1
        2 10      1
        1 10      1
        1 10      1
        1 10      1
        90 7      2
        90 7      2
        89 7      2
        89 7      2
        89 7      2
        88 7      2
        88 7      2
        71 7      3
        71 7      3
        70 7      3
        70 7      3
        70 7      3
        69 7      3
        69 7      3
        44 5      4
        44 5      4
        44 5      4
        43 5      4
        42 5      4
        41 5      5
        41 5      5
        41 5      5
        40 5      5
        40 5      5
       120 1      6
       100 1      7

Thanks


回答1:


We can use rleid

library(data.table)
index_col <-setDT(df)[, if(colB[1L] < .N) ((seq_len(.N)-1) %/% colB[1L])+1  
       else as.numeric(colB), rleid(colB)][, rleid(V1)]
df[, index_col := index_col]
df
#    colA colB index_col
# 1:    2    7         1
# 2:    2    7         1
# 3:    2    7         1
# 4:    2    7         1
# 5:    1    7         1
# 6:    1    7         1
# 7:    1    7         1
# 8:   70    5         2
# 9:   70    5         2
#10:   70    5         2
#11:   69    5         2
#12:   69    5         2
#13:   89    5         3
#14:   89    5         3
#15:   89    5         3
#16:   88    5         3
#17:   88    5         3
#18:  120    1         4
#19:  100    1         5

Or a one-liner would be

setDT(df)[, index_col := df[, ((seq_len(.N)-1) %/% colB[1L])+1, rleid(colB)][, as.integer(interaction(.SD, drop = TRUE, lex.order = TRUE))]]

Update

Based on the new update in the OP's post

setDT(new_df)[, index_col :=  cumsum(c(TRUE, abs(diff(colA))> 1))
          ][, colB := .N , index_col]
new_df
#    colA colB index_col
# 1:    3   10         1
# 2:    3   10         1
# 3:    3   10         1
# 4:    2   10         1
# 5:    2   10         1
# 6:    2   10         1
# 7:    2   10         1
# 8:    1   10         1
# 9:    1   10         1
#10:    1   10         1
#11:   71    7         2
#12:   71    7         2
#13:   70    7         2
#14:   70    7         2
#15:   70    7         2
#16:   69    7         2
#17:   69    7         2
#18:   90    7         3
#19:   90    7         3
#20:   89    7         3
#21:   89    7         3
#22:   89    7         3
#23:   88    7         3
#24:   88    7         3
#25:   44    2         4
#26:   43    2         4
#27:  120    1         5
#28:  100    1         6



回答2:


An approach in base R:

df$idxcol <- cumsum(c(1,abs(diff(df$colA)) > 1) + c(0,diff(df$colB) != 0) > 0)

which gives:

> df
   colA colB idxcol
1     2    7      1
2     2    7      1
3     2    7      1
4     2    7      1
5     1    7      1
6     1    7      1
7     1    7      1
8    70    5      2
9    70    5      2
10   70    5      2
11   69    5      2
12   69    5      2
13   89    5      3
14   89    5      3
15   89    5      3
16   88    5      3
17   88    5      3
18  120    1      4
19  100    1      5

On the updated example data, you need to adapt the approach to:

n <- 1
idx1 <- cumsum(c(1, diff(df$colA) < -n) + c(0, diff(df$colB) != 0) > 0)
idx2 <- ave(df$colA, cumsum(c(1, diff(df$colA) < -n)), FUN = function(x) c(0, cumsum(diff(x)) < -n ))
idx2[idx2==1 & c(0,diff(idx2))==0] <- 0

df$idxcol <- idx1 + cumsum(idx2)

which gives:

> df
   colA colB idxcol
1     2    7      1
2     2    7      1
3     2    7      1
4     2    7      1
5     1    7      1
6     1    7      1
7     1    7      1
8    89    5      2
9    89    5      2
10   89    5      2
11   88    5      2
12   88    5      2
13   70    5      3
14   70    5      3
15   70    5      3
16   69    5      3
17   69    5      3
18   44    4      4
19   44    4      4
20   44    4      4
21   43    4      4
22   42    4      5
23   42    4      5
24   41    4      5
25   41    4      5
26  120    1      6
27  100    1      7

For new_df just change n tot 2 and you will get the desired output for that as well.



来源:https://stackoverflow.com/questions/44245666/update-add-index-column-to-data-frame-based-on-two-columns

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!