问题
This is my transaction data. It shows the transactions made from the accounts in from
column to the accounts in to
column with the date and the amount information
data
id from to date amount
<int> <fctr> <fctr> <date> <dbl>
19521 6644 6934 2005-01-01 700.0
19524 6753 8456 2005-01-01 600.0
19523 9242 9333 2005-01-01 1000.0
… … … … …
1056317 7819 7454 2010-12-31 60.2
1056318 6164 7497 2010-12-31 107.5
1056319 7533 7492 2010-12-31 164.1
I want to calculate how much transaction amount the accounts in from
column received in the last 6 month prior to the date that particular transaction was made and want to save this information as a new column.
This following code works very well to accomplish this in a small dataset ,say, with 1000 rows:
library(dplyr)
library(purrr)
data %>%
mutate(total_trx_amount_received_in_last_sixmonth= map2_dbl(from, date,
~sum(amount[to == .x & between(date, .y - 180, .y)])))
However, since my data has over 1 million rows, this code will take more than a couple of hours to complete.
I searched the internet if I can speed up the run time of this code. I tried this suggestion on SO about how to make purrr
map
function run faster. So, I tried the following code and instead of mutate
of dplyr
I used data.table
to speed up the code even faster:
library(future)
library(data.table)
library(furrr)
data[, total_trx_amount_received_in_last_sixmonth:= furrr::future_pmap_dbl(list(from, date),
~mean(amount[to == .x & between(date, .y-180, .y)])) ]
But, the speed hasn't been improved at all.
Is there any suggestion on how can I make the code run faster?
dput() output of the data:
structure(list(id = c(18529L, 13742L, 9913L, 956L, 2557L, 1602L,
18669L, 35900L, 48667L, 51341L, 53713L, 60126L, 60545L, 65113L,
66783L, 83324L, 87614L, 88898L, 89874L, 94765L, 100277L, 101587L,
103444L, 108414L, 113319L, 121516L, 126607L, 130170L, 131771L,
135002L, 149431L, 157403L, 157645L, 158831L, 162597L, 162680L,
163901L, 165044L, 167082L, 168562L, 168940L, 172578L, 173031L,
173267L, 177507L, 179167L, 182612L, 183499L, 188171L, 189625L,
193940L, 198764L, 199342L, 200134L, 203328L, 203763L, 204733L,
205651L, 209672L, 210242L, 210979L, 214532L, 214741L, 215738L,
216709L, 220828L, 222140L, 222905L, 226133L, 226527L, 227160L,
228193L, 231782L, 232454L, 233774L, 237836L, 237837L, 238860L,
240223L, 245032L, 246673L, 247561L, 251611L, 251696L, 252663L,
254410L, 255126L, 255230L, 258484L, 258485L, 259309L, 259910L,
260542L, 262091L, 264462L, 264887L, 264888L, 266125L, 268574L,
272959L), from = c("5370", "5370", "5370", "8605", "5370", "6390",
"5370", "5370", "8934", "5370", "5635", "6046", "5680", "8026",
"9037", "5370", "7816", "8046", "5492", "8756", "5370", "9254",
"5370", "5370", "7078", "6615", "5370", "9817", "8228", "8822",
"5735", "7058", "5370", "8667", "9315", "6053", "7990", "8247",
"8165", "5656", "9261", "5929", "8251", "5370", "6725", "5370",
"6004", "7022", "7442", "5370", "8679", "6491", "7078", "5370",
"5370", "5370", "5658", "5370", "9296", "8386", "5370", "5370",
"5370", "9535", "5370", "7541", "5370", "9621", "5370", "7158",
"8240", "5370", "5370", "8025", "5370", "5370", "5370", "6989",
"5370", "7059", "5370", "5370", "5370", "9121", "5608", "5370",
"5370", "7551", "5370", "5370", "5370", "5370", "9163", "9362",
"6072", "5370", "5370", "5370", "5370", "5370"), to = c("9356",
"5605", "8567", "5370", "5636", "5370", "8933", "8483", "5370",
"7626", "5370", "5370", "5370", "5370", "5370", "9676", "5370",
"5370", "5370", "5370", "9105", "5370", "9772", "6979", "5370",
"5370", "7564", "5370", "5370", "5370", "5370", "5370", "8744",
"5370", "5370", "5370", "5370", "5370", "5370", "5370", "5370",
"5370", "5370", "7318", "5370", "8433", "5370", "5370", "5370",
"7122", "5370", "5370", "5370", "8566", "6728", "9689", "5370",
"8342", "5370", "5370", "5614", "5596", "5953", "5370", "7336",
"5370", "7247", "5370", "7291", "5370", "5370", "6282", "7236",
"5370", "8866", "8613", "9247", "5370", "6767", "5370", "9273",
"7320", "9533", "5370", "5370", "8930", "9343", "5370", "9499",
"7693", "7830", "5392", "5370", "5370", "5370", "7497", "8516",
"9023", "7310", "8939"), date = structure(c(12934, 13000, 13038,
13061, 13099, 13113, 13117, 13179, 13238, 13249, 13268, 13296,
13299, 13309, 13314, 13391, 13400, 13404, 13409, 13428, 13452,
13452, 13460, 13482, 13493, 13518, 13526, 13537, 13542, 13544,
13596, 13616, 13617, 13626, 13633, 13633, 13639, 13642, 13646,
13656, 13660, 13664, 13667, 13669, 13677, 13686, 13694, 13694,
13707, 13716, 13725, 13738, 13739, 13746, 13756, 13756, 13756,
13761, 13769, 13770, 13776, 13786, 13786, 13786, 13791, 13799,
13806, 13813, 13817, 13817, 13817, 13822, 13829, 13830, 13836,
13847, 13847, 13847, 13852, 13860, 13866, 13871, 13878, 13878,
13878, 13882, 13883, 13883, 13887, 13887, 13888, 13889, 13890,
13891, 13895, 13896, 13896, 13899, 13905, 13909), class = "Date"),
amount = c(24.4, 7618, 21971, 5245, 2921, 8000, 169.2, 71.5,
14.6, 4214, 14.6, 13920, 14.6, 24640, 1600, 261.1, 16400,
3500, 2700, 19882, 182, 14.6, 16927, 25653, 3059, 2880, 9658,
4500, 12480, 14.6, 1000, 3679, 34430, 12600, 14.6, 19.2,
4900, 826, 3679, 2100, 38000, 79, 11400, 21495, 3679, 200,
14.6, 100.6, 3679, 5300, 108.9, 3679, 2696, 7500, 171.6,
14.6, 99.2, 2452, 3679, 3218, 700, 69.7, 14.6, 91.5, 2452,
3679, 2900, 17572, 14.6, 14.6, 90.5, 2452, 49752, 3679, 1900,
14.6, 870, 85.2, 2452, 3679, 1600, 540, 14.6, 14.6, 79, 210,
2452, 28400, 720, 180, 420, 44289, 489, 3679, 840, 2900,
150, 870, 420, 14.6)), row.names = c(NA, -100L), class = "data.frame")
回答1:
This is simply a non-equi join in data.table. You can create a variable of date - 180
and limit the join between the current date and that variable. This should be fairly quick
library(data.table)
setDT(dt)[, date_minus_180 := date - 180]
dt[, amnt_6_m := .SD[dt, sum(amount, na.rm = TRUE),
on = .(to = from, date <= date, date >= date_minus_180), by = .EACHI]$V1]
head(dt, 10)
# id from to date amount date_minus_180 amnt_6_m
# 1: 18529 5370 9356 2005-05-31 24.4 2004-12-02 0.0
# 2: 13742 5370 5605 2005-08-05 7618.0 2005-02-06 0.0
# 3: 9913 5370 8567 2005-09-12 21971.0 2005-03-16 0.0
# 4: 956 8605 5370 2005-10-05 5245.0 2005-04-08 0.0
# 5: 2557 5370 5636 2005-11-12 2921.0 2005-05-16 5245.0
# 6: 1602 6390 5370 2005-11-26 8000.0 2005-05-30 0.0
# 7: 18669 5370 8933 2005-11-30 169.2 2005-06-03 13245.0
# 8: 35900 5370 8483 2006-01-31 71.5 2005-08-04 13245.0
# 9: 48667 8934 5370 2006-03-31 14.6 2005-10-02 0.0
# 10: 51341 5370 7626 2006-04-11 4214.0 2005-10-13 8014.6
回答2:
Here is one option using data.table
:
library(data.table)
setDT(df)
setkey(df, to, date)
# Unique combination of from and date
af <- df[, unique(.SD), .SDcols = c("from", "date")]
# For each combination check sum of incoming in the last 6 months
for (i in 1:nrow(af)) {
set(
af, i = i, j = "am6m",
value = df[(date) %between% (af$date[[i]] - c(180, 0)) & to == af$from[[i]], sum(amount)]
)
}
# Join the results into the main data.frame
df[, am6m := af[.SD, on = .(from, date), am6m]]
> tail(df)
# id from to date amount am6m
# 1: 18529 5370 9356 2005-05-31 24.4 0.0
# 2: 258484 5370 9499 2008-01-09 720.0 74543.5
# 3: 251611 5370 9533 2007-12-31 14.6 46143.5
# 4: 83324 5370 9676 2006-08-31 261.1 40203.8
# 5: 203763 5370 9689 2007-08-31 14.6 92353.1
# 6: 103444 5370 9772 2006-11-08 16927.0 82671.2
回答3:
Here's an option using window functions.
However, they require complete daily data in order to work, so the amount of memory required can be large (you have to have a row for every day for each from).
Also note that this method is only useful for large datasets or for performing calculations directly on a database. It takes a lot of set-up time to get the original data into a form that has no gaps. And it takes time to join the data at the end.
However, the slide function is relatively consistent in how fast it is regardless of size of data. As opposed to subsetting which increases in time as the size of the data being subset increases.
library(tidyverse)
library(tsibble)
# Calculate the 6 month window
six_mo_rollup <- data %>%
## NOTE: You have to deal with duplicates somehow...either remove
## false duplicates or make them not duplicates...
# We can get a unique from/date combo by summing since we need
# to sum anyway.
group_by(from,date) %>%
summarise(amount = sum(amount),
.groups = "keep") %>%
ungroup() %>%
# Now that each from/date is unique
# convert data to a tsibble object
as_tsibble(key = c(from),index = date) %>%
# window functions can't have any missing time periods...so fill gaps
# window functions grab 180 rows...not 180 days from the date
group_by_key() %>%
fill_gaps(.full = TRUE) %>%
ungroup() %>%
# arrange data from lowest to highest so slide can work right.
arrange(date) %>%
group_by(from) %>%
mutate(
six_mo_sum = slide_dbl(
amount,
sum,
na.rm = TRUE,
.size = 180,
.align = "right"
)
) %>%
ungroup() %>%
# any row without amount was created by fill_gaps in the example
# so we can drop those rows to save space
filter(!is.na(amount))
six_mo_rollup %>% filter(from == "5370")
# # A tsibble: 41 x 4 [1D]
# # Key: from [1]
# from date amount six_mo_sum
# <chr> <date> <dbl> <dbl>
# 1 5370 2005-05-31 24.4 NA
# 2 5370 2005-08-05 7618 NA
# 3 5370 2005-09-12 21971 NA
# 4 5370 2005-11-12 2921 NA
# 5 5370 2005-11-30 169. 32679.
# 6 5370 2006-01-31 71.5 32751.
# 7 5370 2006-04-11 4214 7376.
# 8 5370 2006-08-31 261. 4475.
# 9 5370 2006-10-31 182 443.
# 10 5370 2006-11-08 16927 17370.
# # ... with 31 more rows
# Join the windowed data to the original dataset
data <- data %>%
left_join(
six_mo_rollup %>% select(from,date,six_mo_sum),
by = c("from","date")
)
UPDATE:
In the comments it became apparent that you wanted to sum up the to values for each for. I didn't understand that originally. The update to the code is to change all of the rollup to to
instead of for
.
Also, you wanted values that didn't have 6 months of complete data. So you add .partial = TRUE
.
# Calculate the 6 month window
six_mo_rollup <- data %>%
## NOTE: You have to deal with duplicates somehow...either remove
## false duplicates or make them not duplicates...
# We can get a unique from/date combo by summing since we need
# to sum anyway.
group_by(to,date) %>%
summarise(amount = sum(amount),
.groups = "keep") %>%
ungroup() %>%
# Now that each from/date is unique
# convert data to a tsibble object
as_tsibble(key = c(to),index = date) %>%
# window functions can't have any missing time periods...so fill gaps
# window functions grab 180 rows...not 180 days from the date
group_by_key() %>%
fill_gaps(.full = TRUE) %>%
ungroup() %>%
# arrange data from lowest to highest so slide can work right.
arrange(date) %>%
group_by(to) %>%
mutate(
six_mo_sum = slide_dbl(
amount,
sum,
na.rm = TRUE,
.size = 180,
.align = "right",
.partial = TRUE
)
) %>%
ungroup() %>%
# any row without amount was created by fill_gaps in the example
# so we can drop those rows to save space
filter(!is.na(amount))
six_mo_rollup %>% filter(to == "5370")
# # A tsibble: 50 x 4 [1D]
# # Key: to [1]
# to date amount six_mo_sum
# <chr> <date> <dbl> <dbl>
# 1 5370 2005-10-05 5245 5245
# 2 5370 2005-11-26 8000 13245
# 3 5370 2006-03-31 14.6 13260.
# 4 5370 2006-04-30 14.6 8029.
# 5 5370 2006-05-28 13920 13949.
# 6 5370 2006-05-31 14.6 13964.
# 7 5370 2006-06-10 24640 38604.
# 8 5370 2006-06-15 1600 40204.
# 9 5370 2006-09-09 16400 56604.
# 10 5370 2006-09-13 3500 60104.
# # ... with 40 more rows
# Join the windowed data to the original dataset
data <- data %>%
left_join(
six_mo_rollup %>% select(to,date,six_mo_sum),
by = c("from" = "to","date" = "date")
)
回答4:
A 1m record dataset is small enough that no parallelization is needed. There are many ways to do this that "look" right but are not... be careful!
First off, you may be wondering why is your original approach slow? R is an interpreted array language. To do anything with acceptable performance you have to pass vectors to fast functions that have been precompiled in lower level languages. If you "map" a function elementwise over a dataset you lose most of the advantages of vectorization - purrr::map
, base::lapply
, etc all fundamentally have comparable performance to a for loop with preallocation, ie. not great. You are making 1 million plus individual function calls (one per record). Parallelization of this can only improve performance by a factor of however many cores you have minus some overhead.
Clarifying questions for you:
- is there a limit of one transaction per day per account or are multiple transactions possible on any given day? I am assuming yes, multiple transactions per day are possible.
- "how much transaction amount the accounts in from column received in the last 6 month prior to the date that particular transaction was made" - I am assuming this means "ignore transactions made on the same date as the transaction getting the field appended to", as there is no way to determine what time these transactions were performed at
My approach: first sum by account and day, then calculate rolling sum by day, then join that to the subsequent day.
install.packages("RcppRoll") # for roll_sum()
install.packages(tidyr) # for complete()
library(dplyr)
start_date <- as.Date("2018-01-01")
end_date <- as.Date("2020-01-01")
window_size <- 180L
# your example dataset is way too small to assess performance.
# Here is a 100k record dataset.
big_data <- tibble(
from = as.factor(sapply(1:1000L, function(x) sample(1:100L,100, replace = F))),
to = as.factor(sapply(1:1000L, function(x) sample(1:100L,100, replace = F))),
amount = sample(1:10000, 100000, replace = TRUE),
date = sample(seq.Date(from = start_date, to = end_date, by = "days"), 100000, replace = TRUE)
) %>%
arrange(date) %>%
mutate(id = row_number()) %>%
ungroup()
# calculate daily sum of values from PRECEDING day for join
daily_summary <- big_data %>%
group_by(to, date) %>%
summarize(daily_sum = sum(amount, na.rm = TRUE)) %>%
ungroup() %>%
# backfill empty records for data going back 6 months from start
# this is needed because roll_sum() has no partial mode implemented.
# and populate missing account - date combinations
complete(date = seq.Date(from = start_date - window_size, to = end_date, by = "days"), to, fill = list(daily_sum = 0)) %>%
group_by(to) %>%
arrange(date) %>%
mutate(
total_trx_amount_received_in_last_sixmonth = RcppRoll::roll_sum(daily_sum, align = "right", n = window_size, fill = NA),
date = date + 1
) %>%
filter(date >= start_date) %>%
select(date = date, from = to, total_trx_amount_received_in_last_sixmonth)
results <- left_join(big_data, daily_summary, by = c("from", "date"))
Now, what about performance? Much better than what you are reporting, at least for me. For a 100k record dataset (100 accounts, 2 years of information) I got 0.6 seconds on my laptop. For a 1m record dataset (1000 accounts, 2 years of information) I got 7-8 seconds using microbenchmark
. Possible not the most efficient approach possible but quite acceptable considering that I did no optimization and did not employ data.table
which is usually the key to high performance 2d operations in R.
Using dplyr
grouping still means we are making one call to the fast precompiled function RcppRoll::roll_sum()
per account, which is not ideal from a performance perspective, but at least we are only doing one function call per account rather than one function call per individual record. You might also want to look into the single pass rolling window functions implemented in the RollingWindow package as they may be even faster.
来源:https://stackoverflow.com/questions/63795371/sum-amount-last-6-month-prior-to-the-date-of-transaction