问题
library(plyr);
library(sqldf);
library(data.table)
library(stringi);
library(RODBC);
dbhandle <- odbcDriverConnect('driver={SQL Server};server=.;database=TEST_DB;trusted_connection=true')
res <- sqlQuery(dbhandle, 'Select Company_ID,
AsOfDate,
CashFlow FROM dbo.Accounts')
resdatatable = as.data.table(res)
odbcCloseAll();
sppv <- function(i, n) {
return((1 + i / 100) ^ (-n))
}
npv <- function(x, i) {
npv = c()
for (k in 1:length(i)) {
pvs = x * sppv(i[k], 1:length(x))
npv = c(npv, sum(pvs))
}
return(npv)
}
xirr <- function(cashflow, dates) {
if (length(cashflow) != length(dates)) {
stop("length(cashflow) != length(dates)")
}
cashflow_adj <- c(cashflow[1])
for (i in 1:(length(cashflow) - 1)) {
d1 <- as.Date(dates[i], "%d-%m-%Y", origin = "1970-01-01")
d2 <- as.Date(dates[i + 1], "%d-%m-%Y", origin = "1970-01-01")
# There are no checks about the monotone values of dates
# put a check in here if the interval is negative
interval <- as.integer(d2 - d1)
if (length(interval) > 0 && !is.na(interval)) {
cashflow_adj <- c(cashflow_adj, rep(0, interval - 1), cashflow[i + 1])
}
}
left = -10
right = 10
epsilon = 1e-8
while (abs(right - left) > 2 * epsilon) {
midpoint = (right + left) / 2
if (npv(cashflow_adj, left) * npv(cashflow_adj, midpoint) > 0) {
left = midpoint
} else {
right = midpoint
}
}
irr = (right + left) / 2 / 100
irr <- irr * 365
# Annualized yield (return) reflecting compounding effect of daily returns
irr <- (1 + irr / 365) ^ 365 - 1
irr
}
groupedCompanyNames <- unique(as.character(resdatatable$Company_ID));
groupedDatesPerCompany <- split(resdatatable$AsOfDate, resdatatable$Company_ID);
groupedCashFlowsPerCompany <- split(resdatatable$CashFlow, resdatatable$Company_ID);
resultsDataFrame <- data.table(Company_ID = character(length(groupedCompanyNames)), XIRR = numeric(length(groupedCompanyNames)));
datalist = result <- vector("list", length(groupedCompanyNames));
for (i in groupedCompanyNames) {
datesForCompany <- groupedDatesPerCompany[i];
dates <- datesForCompany[[i]];
cashFlowsForCompany <- groupedCashFlowsPerCompany[i];
cashFlows <- cashFlowsForCompany[[i]];
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
newRow <- data.frame(Company_ID = i, XIRR = format(round(xirrResult, 2), nsmall = 2));
datalist[[i]] <- newRow;
}
resultsDataFrame <- data.table::rbindlist(datalist)
finalDataFrame <- as.data.frame(resultsDataFrame);
print(finalDataFrame);
So to provide context, I am trying to do the following:
- Get data out of the database using an RODBC connection
- Get the unique company names
- Split the cashflows and dates per company
- Initialize a data table with a known number of rows so that it doesn't need to incrementally grow.
- Loop through the unique company names and call function get xirr on the list of cashflows and dates for the company.
- Add each row with the company name and the XIRR value to a new datatable.
- Use rbindlist.
Here is a sample of the source data I'm using
Company_ID CashFlow AsOfDate
3F68D729-D69D-E711-9C98-5065F34B3E7D 368608.0000 2004-11-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 366999.0000 2004-12-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 326174.0000 2005-01-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 345666.0000 2005-02-28 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -1529180.0000 2005-03-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -65259.0000 2005-04-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 514005.0000 2005-05-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 512951.0000 2005-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-07-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-08-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-09-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-10-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-11-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6791.0000 2011-12-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -187375.0000 2012-01-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -215902.0000 2012-02-29 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2012-03-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -217409.0000 2012-04-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -191830.0000 2012-05-31 00:00:00.000
I'm new to R - and with circa 2000 unique company names an on average 50 date, cashflow combinations each = 100000 records the loop takes about 28 secs to process.
I've looked at using the asParallel library and used foreach but that didn't seem to make any difference to the speed. If I take out the calling of the function xirr then the loop is processed and finished instantly.
The xirr needs the exception handling as sometimes its not possible to calculate an xirr value iteratively.
I know that looping is not really best practice in R - any suggestions on how to vectorise this for better performance?
回答1:
In order improve the performance, I used the doParallel library.
library(doParallel)
cl <- makeCluster(detectCores() - 1, type = 'PSOCK')
registerDoParallel(cl)
And instead of the for loop, I put the logic into a foreach
resultsDataFrame <- foreach(n = 1:length(groupedCompanyNames), .combine = rbind) %dopar% {
company_id <- groupedCompanyNames[n];
datesForCompany <- groupedDatesPerCompany[n];
dates <- unsplit(datesForCompany, company_id);
cashFlowsForCompany <- groupedCashFlowsPerCompany[n];
cashFlows <- unsplit(cashFlowsForCompany, company_id);
#now calculate the xirr for the values
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
data.frame(Company_ID = company_id, XIRR = format(round(xirrResult, 2), nsmall = 2));
}
registerDoSEQ();
When I ran my full data set into it (4000 companies) with their dates and cashflows. A total of 400000 rows the original loop took around 10 minutes. With the foreach loop and utilising the extra cores in the machine, the operation took 60 seconds.
I hope that someone will maybe be able to suggest a further performance spike on top of this but I think that is a good improvement.
来源:https://stackoverflow.com/questions/47434610/improving-loop-performance-with-function-call-inside