Question
Using R, I would like to compare the RMSE (root mean square error) of two prediction models. The first model uses estimates from 1966 to 2000 to predict 2001, then uses estimates from 1966 to 2001 to predict 2002, and so on up to 2015. The second model uses estimates from 1991 to 2000 to predict 2001, then uses estimates from 1992 to 2001 to predict 2002, and so on up to 2015. This problem has me really stumped and I truly appreciate any help.
DF <- data.frame(YEAR = 1966:2015, TEMP = rnorm(50), PRESSURE = rnorm(50), RAINFALL = rnorm(50))  # 50 years of simulated data
lmod <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF)
rmse <- function(error) sqrt(mean(error^2))
rmse(lmod$residuals)  # in-sample RMSE of a model fit to all years
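To make the setup concrete, here is a minimal sketch of the two training windows for the first forecast year only, reusing DF from above (fit_long and fit_short are just illustrative names); the answers below loop this over 2001-2015:
# Scheme 1, first step: train on 1966-2000, predict 2001
fit_long  <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR <= 2000, ])
# Scheme 2, first step: train on 1991-2000, predict 2001
fit_short <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR >= 1991 & DF$YEAR <= 2000, ])
predict(fit_long,  newdata = DF[DF$YEAR == 2001, ])
predict(fit_short, newdata = DF[DF$YEAR == 2001, ])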
Answer 1:
You can loop it:
Method 1:
pred1 <- numeric(0)
rmse1 <- numeric(0)
for (i in 1:15) {
  DF.train1 <- DF[DF$YEAR < 2000 + i, ]    # expanding window: 1966 .. (2000 + i - 1)
  DF.test1  <- DF[DF$YEAR == 2000 + i, ]   # one-step-ahead test year
  lmod1 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF.train1)
  pred1[i] <- predict(lmod1, newdata = DF.test1)
  rmse1[i] <- sqrt(mean((DF.test1$TEMP - pred1[i])^2))
}
pred1
rmse1
mean(rmse1)
Method 2:
pred2 <- numeric(0)
rmse2 <- numeric(0)
for (i in 1:15) {
  DF.train2 <- DF[DF$YEAR < 2000 + i & DF$YEAR > 1989 + i, ]   # rolling 10-year window
  DF.test2  <- DF[DF$YEAR == 2000 + i, ]
  lmod2 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF.train2)
  pred2[i] <- predict(lmod2, newdata = DF.test2)
  rmse2[i] <- sqrt(mean((DF.test2$TEMP - pred2[i])^2))
}
pred2
rmse2
mean(rmse2)
Comparing the individual components of rmse1 and rmse2, as well as their respective means, should be useful. The vectors pred1 and pred2 contain the individual TEMP predictions for each year (2001-2015) for their respective methods.
Edit: this should be working now, and Method 2 trains on a rolling 10-year window. Also, I take RMSE to be the square root of the MSE as defined for predictors in this article.
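If a single error figure per method is preferred, an alternative aggregation (a sketch reusing the question's rmse() helper and the pred1/pred2 vectors above) pools all 15 one-step-ahead errors into one RMSE per method:
obs <- DF$TEMP[DF$YEAR %in% 2001:2015]   # observed TEMP for the forecast years
rmse(obs - pred1)                        # overall out-of-sample RMSE, expanding window
rmse(obs - pred2)                        # overall out-of-sample RMSE, rolling 10-year window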
Answer 2:
Here is another solution, where the simulations are wrapped in a function.
The advantage of this solution is that it makes it easy to modify the model specifications.
For example, if you want to try model2 with a range of 15 years instead of 10, just change the input to the function (range = 15). This also lets you do a light sensitivity analysis (a short example follows the RMSE/NSE computations below).
compare_models <- function(DF, start = 1966, end = 2000, range = 10)
{
  require(hydroGOF)
  DF$model1_sim <- NA   # initialize the prediction columns so the yearly assignments below work
  DF$model2_sim <- NA
  for (i in (end + 1):max(DF$YEAR))
  {
    # model1: expanding window, start .. i-1
    lmod_1 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR >= start & DF$YEAR < i, ])
    DF$model1_sim[DF$YEAR == i] <- predict(lmod_1, newdata = DF[DF$YEAR == i, ])
    # model2: rolling window of `range` years, i-range .. i-1
    lmod_2 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR >= i - range & DF$YEAR < i, ])
    DF$model2_sim[DF$YEAR == i] <- predict(lmod_2, newdata = DF[DF$YEAR == i, ])
  }
  return(DF)
}
I used the hydroGOF package to compute rmse and NSE, which is a common indicator of model efficiency (see Nash and Sutcliffe, 1970; 11528 citations at the moment).
output <- compare_models(DF)
require(hydroGOF)   # provides rmse() and NSE()
# RMSE
rmse(output$model1_sim, output$TEMP)
rmse(output$model2_sim, output$TEMP)
# Nash-Sutcliffe efficiency
NSE(output$model1_sim, output$TEMP, na.rm = TRUE)
NSE(output$model2_sim, output$TEMP, na.rm = TRUE)
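As mentioned above, the range argument makes a quick sensitivity check easy; for example (a sketch reusing the same evaluation with a 15-year rolling window):
output_15 <- compare_models(DF, range = 15)   # model2 now trains on a rolling 15-year window
rmse(output_15$model2_sim, output_15$TEMP)
NSE(output_15$model2_sim, output_15$TEMP, na.rm = TRUE)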
And a simple simulated/observed plot to inspect the model predictions:
# melting data for plot
require(reshape2)   # for melt()
require(ggplot2)
output_melt <- melt(output[, c("TEMP", "model1_sim", "model2_sim")], id = "TEMP")
# Plot
ggplot(output_melt, aes(x = TEMP, y = value, color = variable)) +
  theme_bw() + geom_point() + geom_abline(slope = 1, intercept = 0) +
  xlim(-2, 2) + ylim(-2, 2) + xlab("Measured") + ylab("Simulated")
Answer 3:
Here's yet another solution:
year <- 2000
time.frame <- 35
train.models <- function(year, time.frame) {
  # one-step-ahead forecasts: for each last training year, fit on the preceding
  # time.frame years and predict the following year
  predictions <- sapply(year:(max(DF$YEAR) - 1),
                        function(year) {
                          lmod <- lm(TEMP ~ PRESSURE + RAINFALL, DF,
                                     subset = with(DF, YEAR %in% (year - time.frame + 1):year))
                          pred <- predict(lmod, newdata = DF[DF$YEAR == (year + 1), ])
                          names(pred) <- year + 1
                          return(pred)
                        })
  return(predictions)
}
models1 <- train.models(2000, 35)   # 35-year window, 1966-2000 for the first forecast (2001)
models2 <- train.models(2000, 10)   # rolling 10-year window, 1991-2000 for the first forecast (2001)
rmse(models1 - DF$TEMP[DF$YEAR %in% names(models1)])
rmse(models2 - DF$TEMP[DF$YEAR %in% names(models2)])
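To see where the two schemes differ year by year, a small sketch (assuming both calls above use 2000 as the last training year, so the vectors cover the same years, 2001-2015) lines the errors up side by side:
errs <- data.frame(
  YEAR   = as.numeric(names(models1)),
  model1 = models1 - DF$TEMP[DF$YEAR %in% names(models1)],
  model2 = models2 - DF$TEMP[DF$YEAR %in% names(models2)]
)
errs   # per-year forecast errors for each scheme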
Source: https://stackoverflow.com/questions/37661829/r-multivariate-one-step-ahead-forecasts-and-accuracy