问题
I am new to R. I have a data frame in R like following
df <- data.frame(ID=c(rep("A1",10),rep("A2",13),rep("A3",12)),
Values=c(10,2,4,23,10,5,20,15,13,21,15,9,19,5,14,25,18,19,31,26,4,21,4,6,7,12,15,18,25,20,16,29,21,19,10))
For every ID I would like to sum the counts in column "Values" in a sliding windows for every 3 positions. Following data frame is an excerpt from df
which includes only the records corresponding to A1
:
ID Values
A1 10
A1 2
A1 4
A1 23
A1 10
A1 5
A1 20
A1 15
A1 13
A1 21
I would like to take 3 entries at time and sum and move to next 3 entries. When the sliding windows can't accommodate 3 entries then I skip those values.
For an example, Window_1
starts from first value (10
) while window_2
starts from second value (2
) and window_3 starts from third value (4
).
window_1 = [10+2+4] + [23+10+5] + [20+15+13] = 102
window_2 = [2+4+23] + [10+5+20] + [15+13+21] = 113
window_3 = [4+23+10] + [5+20+15] = 77
and report it in a data frame like following:
ID Window_1 Window_2 Window_3
A1 102 113 77
Likewise I would like sum the counts in column Values
for everyid in the data frame "df" and report in a data.frmae like following:
ID window_1 window_2 window_3
A1 102 113 77
A2 206 195 161
A3 198 163 175
I tried the following code
sum_win_3=0
sum_win_2=0
sum_win_1=0
win_1_counts=0
win_2_counts=0
win_3_counts=0
for (i in seq(1,length(df$Values),3))
{
if((i+i+1+i+2) %% 3 == 0)
{
win_1_counts=df$Values[i]+df$Values[i+1]+df$Values[i+2]
win_1_counts[is.na(win_1_counts)]=0
#print(win_1_counts)
}
sum_win_1=sum_win_1+win_1_counts
}
#print(sum_win_1)
for (j in seq(2,length(df$Values),3))
{
if((j+j+1+j+2) %% 3 == 0)
{
win_2_counts=df$Values[j]+df$Values[j+1]+df$Values[j+2]
win_2_counts[is.na(win_2_counts)]=0
#print(win_2_counts)
}
sum_win_2=sum_win_2+win_2_counts
}
#print(sum_win_2)
for (k in seq(3,length(df$Values),3))
{
if((k+k+1+k+2) %% 3 == 0)
{
win_3_counts=df$Values[k]+df$Values[k+1]+df$Values[k+2]
win_3_counts[is.na(win_3_counts)]=0
#print(win_3_counts)
}
#sum_win_3=sum_win_3+win_3_counts
}
print(sum_win_3)
output=data.frame(ID=df[1],Window_1=sum_win_1,Window_2=sum_win_2,Window_3=sum_win_3)
The above code sums the counts for window_1, windows_2 and window_3 by taking all the IDs together rather working on every ID separately.
Kindly guide me in getting the the output in the desired format stated above.
Thanks in advance
回答1:
Using the data.table package, I would approach it as follows:
library(data.table)
setDT(df)[, .(w1 = sum(Values[1:(3*(.N%/%3))]),
w2 = sum(Values[2:(3*((.N-1)%/%3)+1)]),
w3 = sum(Values[3:(3*((.N-2)%/%3)+2)]))
, by = ID]
which gives:
ID w1 w2 w3 1: A1 102 113 77 2: A2 206 195 161 3: A3 198 163 175
Or to avoid the repetition (thanx to @Cath):
setDT(df)[, lapply(1:3, function(i) {sum(Values[i:(3*((.N-i+1)%/%3)+(i-1))])})
, by = ID]
If you want to rename the V1, V2 & V3 variables, you can do that afterwards, but you can also do:
cols <- c("w1","w2","w3")
setDT(df)[, (cols) := lapply(1:3, function(i) {sum(Values[i:(3*((.N-i+1)%/%3)+(i-1))])})
, by = ID]
回答2:
This could be done using tapply
and aggregate
sumf <- function(x1){
sum(tapply(x1,
(seq_along(x1) -1) %/%3,
function(x) ifelse(length(x) == 3, sum(x), 0)))
}
aggregate(Values ~ ID, data = df,
FUN = function(y){
cbind(sumf(y), sumf(y[-1]), sumf(y[-c(1,2)]))
})
# Group.1 x.1 x.2 x.3
#1 A1 102 113 77
#2 A2 206 195 161
#3 A3 198 163 175
This can also be done using filter
sum.filter <- function(z) tapply(head(tail(as.numeric(
filter(z, c(1,1,1))),-1), -1),
0:(length(z)-3) %% 3 +1, sum)
aggregate(Values ~ ID, data = df, FUN = function(y){ cbind(sum.filter(y) )})
回答3:
This seems to work:
library(zoo)
wins = function(x, w)
rollapply(x, width = w*((length(x)-seq(w)+1) %/% w), align = "left", sum)
aggregate(Values ~ ID, df, wins, 3)
# ID Values.1 Values.2 Values.3
# 1 A1 102 113 77
# 2 A2 206 195 161
# 3 A3 198 163 175
This is the only answer so far to perform the calculation on a rolling basis, which is usually more efficient.
来源:https://stackoverflow.com/questions/35039361/summing-the-counts-in-a-data-frame-using-sliding-window