How to create a lag variable within each group?

前端 未结 5 1482
没有蜡笔的小新
没有蜡笔的小新 2020-11-22 04:45

I have a data.table:

set.seed(1)
data <- data.table(time = c(1:3, 1:4),
                   groups = c(rep(c(\"b\", \"a\"), c(3, 4))),
                   v         


        
5条回答
  •  青春惊慌失措
    2020-11-22 05:28

    You could do this within data.table

     library(data.table)
     data[, lag.value:=c(NA, value[-.N]), by=groups]
      data
     #   time groups       value   lag.value
     #1:    1      a  0.02779005          NA
     #2:    2      a  0.88029938  0.02779005
     #3:    3      a -1.69514201  0.88029938
     #4:    1      b -1.27560288          NA
     #5:    2      b -0.65976434 -1.27560288
     #6:    3      b -1.37804943 -0.65976434
     #7:    4      b  0.12041778 -1.37804943
    

    For multiple columns:

    nm1 <- grep("^value", colnames(data), value=TRUE)
    nm2 <- paste("lag", nm1, sep=".")
    data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1]
     data
    #    time groups      value     value1      value2  lag.value lag.value1
    #1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
    #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
    #3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
    #4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
    #5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
    #6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
    #7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
    #    lag.value2
    #1:          NA
    #2:  1.12493092
    #3: -0.04493361
    #4:          NA
    #5:  0.94383621
    #6:  0.82122120
    #7:  0.59390132
    

    Update

    From data.table versions >= v1.9.5, we can use shift with type as lag or lead. By default, the type is lag.

    data[, (nm2) :=  shift(.SD), by=groups, .SDcols=nm1]
    #   time groups      value     value1      value2  lag.value lag.value1
    #1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
    #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
    #3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
    #4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
    #5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
    #6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
    #7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
    #    lag.value2
    #1:          NA
    #2:  1.12493092
    #3: -0.04493361
    #4:          NA
    #5:  0.94383621
    #6:  0.82122120
    #7:  0.59390132
    

    If you need the reverse, use type=lead

    nm3 <- paste("lead", nm1, sep=".")
    

    Using the original dataset

      data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1]
      #  time groups      value     value1      value2 lead.value lead.value1
      #1:    1      b -0.6264538  0.7383247  1.12493092  0.1836433   0.5757814
      #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.8356286  -0.3053884
      #3:    3      b -0.8356286 -0.3053884 -0.01619026         NA          NA
      #4:    1      a  1.5952808  1.5117812  0.94383621  0.3295078   0.3898432
      #5:    2      a  0.3295078  0.3898432  0.82122120 -0.8204684  -0.6212406
      #6:    3      a -0.8204684 -0.6212406  0.59390132  0.4874291  -2.2146999
      #7:    4      a  0.4874291 -2.2146999  0.91897737         NA          NA
     #   lead.value2
     #1: -0.04493361
     #2: -0.01619026
     #3:          NA
     #4:  0.82122120
     #5:  0.59390132
     #6:  0.91897737
     #7:          NA
    

    data

     set.seed(1)
     data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))),
                 value = rnorm(7), value1=rnorm(7), value2=rnorm(7))
    

提交回复
热议问题