R data.table: row-based conditions split/apply/combine

后端 未结 4 1558
轻奢々
轻奢々 2020-12-20 06:46

I have the following data.table

initial.date <- as.POSIXct(\'2018-10-27 10:00:00\',tz=\'GMT\')
last.date <- as.POSIXct(\'2018-12-28 17:00:         


        
4条回答
  •  生来不讨喜
    2020-12-20 07:38

    OP did not mention the size of the new dataset. But a Rcpp solution should speed things up.

    As per previous comment:

    mtd1 <- function() {
        ndf[, rn:=.I]
        iidx <- ndf[
            .(inst=InstrumentSymbol, prevMin=datetime-60L, nextMin=datetime+60L, idx=id, tp=TradePrice),
    
            .SD[id != idx, rn[which.min(abs(TradePrice - tp))]],
    
            by=.EACHI,
    
            on=.(InstrumentSymbol=inst, datetime>=prevMin, datetime<=nextMin)];
    
        ndf[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
    }
    

    arg0naut's approach:

    mtd2 <- function() {
        res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
            res2,  on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
                idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
                    , .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
                        res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
    
    }
    

    A possible Rcpp approach:

    library(Rcpp)
    cppFunction('
    NumericVector nearestPrice(NumericVector id, NumericVector datetime, NumericVector price) {
        int i, j, n = id.size();
        NumericVector res(n);
        double prev, diff;
    
        for (i=0; i= datetime[i]-60 && j>=0) {
                diff = std::abs(price[i] - price[j]);
    
                if (diff < prev) {
                    res[i] = id[j];
                    prev = diff;
                }
                j--;
            }
    
            j = i+1;
            while (datetime[j] <= datetime[i]+60 && j<=n) {
                diff = std::abs(price[i] - price[j]);
    
                if (diff < prev) {
                    res[i] = id[j];
                    prev = diff;
                }
                j++;
            }
        }
    
        return(res);
    }
    ')
    
    mtd3 <- function() {
        setorder(ndf2, InstrumentSymbol, PriorityDateTime)
        iidx <- ndf2[, nearestPrice(.I, datetime, TradePrice), by=.(InstrumentSymbol)]
        ndf2[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
    }
    

    timing code:

    library(microbenchmark)
    microbenchmark(mtd1(), mtd2(), mtd3(), times=3L)
    

    timings:

    Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
     mtd1() 49447.09713 49457.12408 49528.14395 49467.15103 49568.66737 49670.18371     3
     mtd2() 64189.67241 64343.67138 64656.40058 64497.67034 64889.76466 65281.85899     3
     mtd3()    17.33116    19.58716    22.36557    21.84316    24.88277    27.92238     3
    

    data:

    set.seed(0L)
    initial.date <- as.POSIXct('2018-01-01 00:00:00', tz='GMT')
    last.date <- initial.date + 30 * (180000/2)
    PriorityDateTime <- seq.POSIXt(from=initial.date, to=last.date, by='30 sec')
    
    library(data.table)
    ndf <- data.table(PriorityDateTime=c(PriorityDateTime, PriorityDateTime),
        TradePrice=rnorm(length(PriorityDateTime)*2, 100, 20),
        InstrumentSymbol=rep(c('asset1','asset2'), each=length(PriorityDateTime)),
        datetime=c(PriorityDateTime, PriorityDateTime))
    setorder(ndf, InstrumentSymbol, PriorityDateTime)[, id := .I]
    res <- copy(ndf)
    res2  <- copy(ndf)
    ndf2 <- copy(ndf)
    

提交回复
热议问题