问题
I have a simple model for a company with two farms, growing two crops (apples and pears) on each farm. The first step is just to multiply the number of trees by the amount of fruit on each tree.
The amount of fruit on each tree is simulated (across farms and crops).
I face at least three decisions when modeling this in R:
- How to structure the variables
- How to simulate
- How to multiply a simulated variable with a non-simulated variable
I want it to work even if I add another crop and/or farm - and ideally even if I add another dimension, e.g. crop variety (Granny Smith etc). I also want to refer to farms and crops by name, not by an index number.
Here is the approach I have come up with. It works, but it is hard to add another dimension, and it is a lot of code. Is there a neater way?
To structure the variables:
farms <- c('Farm 1', 'Farm 2');
crops <- c('Pear', 'Apple');
params <- c('mean','sd');
numTrees <- array(0, dim=c(length(farms), length(crops)), dimnames=list(farms,crops));
fruitPerTree <- array(0, dim=c(length(farms), length(varieties), length(params)),
dimnames=list(farms,varieties,params));
# input data e.g.
numTrees['Farm 1', 'Pear'] = 100
# and
fruitPerTree['Farm 1', 'Pear', 'mean'] = 50
To simulate:
simNormal2D <- function(dataVar, numSims) {
#
# Generate possible outcomes for dataVar (e.g. fruitPerTree).
# It generates them for each value of the first two dimensions.
#
# Args:
# dataVar: 3-dimensional array,
# with 3rd dim being the normal params
# numSims: integer, e.g. 10000
#
# e.g. sims <- simNormal2D(fruitPerTree, 10000)
#
# Returns:
# a 3D array with 3rd dimension indexing the simulated results
#
dims <- dimnames(dataVar);
sims <- array(dim=c(length(dims[[1]]), length(dims[[2]]), 0),
dimnames=list(dims[[1]], dims[[2]], NULL));
for(x in dims[[1]]) {
for(y in dims[[2]]) {
sim <- rnorm(numSims, dataVar[x, y, 'mean'],
dataVar[x, y, 'sd'] );
sims <- append(sims, sim);
}
}
# R fills array from first arg columnwise, so dims are reversed
sims <- array(sims, c(numSims, length(dims[[2]]),
length(dims[[1]])),
dimnames=list(c(1:numSims), dims[[2]], dims[[1]]));
# reverse them back again
sims <- aperm(sims, c(3,2,1));
return(sims);
}
simFruitPerTree <- simNormal2D(fruitPerTree, numSims);
To multiply simFruitPerTree
and numTrees
, I find I first have to perform a manual broadcast:
simNumTrees <- array(numTrees, dim=c(length(dims[[1]]), length(dims[[2]]), numSims),
dimnames=list(dims[[1]], dims[[2]], c(1:numSims)));
simTotalFruit <- simFruitPerTree * simNumTrees;
For comparison, in Python (which I know better than R), I can perform all these steps in a few lines by using tuples to index a dictionary, along with two dictionary comprehensions, e.g.:
fruit_per_tree = {}
fruit_per_tree[('Farm 1', 'Pear')] = (50, 15) # normal params
sim_fruit_per_tree = {key: random.normal(*params, size=num_sims)
for key, params in fruit_per_tree.items() }
sim_total_fruit = {key: sim_fruit_per_tree[key]*num_trees[key] for key in num_trees }
Is there an easy way in R too? Thanks!
回答1:
Here is how I would set up such a simulation:
#for reproducibility
set.seed(42)
#data
farms <- data.frame(farm=rep(1:2, each=2),
trees=sample(100, 4),
crop=rep(c("pear", "apple")),
mean=c(100, 200, 70, 120),
sd=c(30, 15, 25, 20))
#n
n <- 100
#simulation
fruits <- t(matrix(rnorm(n*nrow(farms), farms$mean, farms$sd), ncol=n))
#check
colMeans(fruits)
#[1] 101.10215 200.06649 68.01185 120.05096
library(reshape2)
fruits <- melt(fruits, value.name="harvest_per_tree")
farms$i <- seq_len(nrow(farms))
farm_sim <- merge(farms, fruits, by.x="i", by.y="Var2", all=TRUE)
names(farm_sim)[7] <- "sim_i"
#multiply with number of trees
farm_sim$harvest_total <- farm_sim$harvest_per_tree * farm_sim$trees
head(farm_sim)
# i farm trees crop mean sd sim_i harvest_per_tree harvest_total
# 1 1 1 92 pear 100 30 1 110.89385 10202.234
# 2 1 1 92 pear 100 30 2 145.34566 13371.801
# 3 1 1 92 pear 100 30 3 139.14609 12801.440
# 4 1 1 92 pear 100 30 4 96.00036 8832.033
# 5 1 1 92 pear 100 30 5 26.78599 2464.311
# 6 1 1 92 pear 100 30 6 94.84248 8725.508
library(ggplot2)
ggplot(farm_sim, aes(x=sim_i, y=harvest_total, colour=factor(farm))) +
geom_line() +
facet_wrap(~crop)
回答2:
If I understand you correctly, you are modeling the total fruit output from n farms, each of which has k types of crop (here, n=k=2). Each farm has some number of trees of each variety, and for each farm the productivity (fruits/tree) is a random variable distributed on N(μ,σ), where μ and σ depend on the farm and the variety.
So for the inputs, we structure a data frame, params
with 5 columns: farm, crop, trees, mean, and sd
. Then each row contains the number of trees, the mean productivity per tree, and the variation in productivity per tree, for a given farm/crop combination. These are the inputs.
If we model at the tree level, then the fruit output from each of the trees of a given variety from a given farm is:
rnorm(trees,mean,sd)
That is, the output is a random sample of length = # trees, with mean and sd appropriate for the given variety and farm. Then total output from all the trees of this variety/farm is just the sum of the vector above, and the grand total output is the sum of these sums over all farms/crops.
All of this gives us 1 iteration of the Monte Carlo model. To determine a distribution of total output, we must repeat this process some number of times. Fortunately in R this is fairly straightforward:
set.seed(1)
farms <- c('Farm 1', 'Farm 2')
crops <- c('Pear', 'Apple')
params <- expand.grid(farms=farms,crops=crops)
params$trees<- 100
params$mean <- 50
params$sd <- 10
n.iterations<- 1000
output <- function(i,p) {
pp <- p[3:5] # trees, mean, sd for each farm/crop
# fruit = total output for each farm/crop combination
fruit <- colSums(apply(pp,1,function(x)rnorm(x[1],x[2],x[3])))
return(sum(fruit)) # grand total output
}
dist <- sapply(1:n.iterations,output,params)
print(c(mean=mean(dist),sd=sd(dist)),quotes=F,digits=4)
# mean sd
# 19997.5 198.8
hist(dist, main="Distribution of Total Output",
sub=paste(n.iterations,"Iterations"),xlab="Total Fruit Output")
This code is agnostic about the number of farms or varieties; just change the farms
and crops
vectors at the beginning. If not all farms have all the varieties, set params$trees <- 0
for the missing varieties.
We can check the influence of n.iterations
as follows. This code just runs the full simulation 100, 1000, and 10,000 times and plots the distribution using ggplot
.
gg <- do.call(rbind,
lapply(c(100,1000,10000),
function(n)cbind(n=n,total=sapply(1:n,output,params))))
gg <- data.frame(gg)
library(ggplot2)
ggplot(gg)+
geom_histogram(aes(x=total, y=..density.., fill=factor(n)))+
scale_fill_discrete("Iterations")+
facet_wrap(~n)
Finally, I urge you to consider that the output per tree is more likely poisson distributed than normal. If one re-runs the simulation using rpois(...)
instead of rnorm(...)
, the overall sd is slightly lower (~150 instead of ~200).
回答3:
Here is a general solution to my problem. I started with roland's approach, and updated it so that the distribution, parameters, and dimensions can all be changed easily.
distSim <- function(nSims, simName, distFn, dimList, paramList, constList) {
#
# Simulate from a distribution across all the dimensions.
#
# Args:
# nSims: integer, e.g. 10000
# simName: name of the output column, e.g. 'harvestPerTree'
# distFn: distribution function, e.g. rnorm
# dimList: list of dimensions,
# e.g. list(farms=c('farm A', 'farm B'), crops=c('apple', 'pear', 'durian'))
# paramList: list of parameters, each of length = product(length(d) for d in dimList),
# to be passed to the distribution function,
# e.g. list(mean=c(10,20,30,5,10,15), sd=c(2,4,6,1,2,3))
# constList: optional vector of length = product(length(d) for d in dimList)
# these are included in the output dataframe
# e.g. list(nTrees=c(10,20,30,1,2,3))
#
# Returns:
# a dataframe with one row per iteration x product(d in dimList)
#
# expand out the dimensions into a dataframe grid - one row per combination
df <- do.call(expand.grid, dimList);
nRows <- nrow(df);
# add the parameters, and constants, if present
df <- cbind(df, paramList);
if (!missing(constList)) {
df <- cbind(df, constList);
}
# repeat this dataframe for each iteration of the simulation
df <- do.call("rbind",replicate(nSims, df, simplify=FALSE));
# add a new column giving the iteration number ('simId')
simId <- sort(rep(seq(1:nSims),nRows));
df <- cbind(simId, df);
# simulate from the distribution
df[simName] <- do.call(distFn, c(list(n=nrow(df)), df[names(paramList)]))
return(df);
}
Sample usage (using a normal distribution for simplicity only):
dimList <- list(farms=c('farm A', 'farm B'), crops=c('apple', 'pear', 'durian'));
constList <- list(numTrees=c(10,20,30,1,2,3));
paramList <- list(mean=c(10,20,30,5,10,15), sd=c(2,4,6,1,2,3));
distSim(nSims=3, simName='harvestPerTree', distFn=rnorm, dimList=dimList,
paramList=paramList, constList=constList);
Sample output:
simId farms crops mean sd numTrees harvestPerTree
1 1 farm A apple 10 2 10 9.602849
2 1 farm B apple 20 4 20 20.153225
3 1 farm A pear 30 6 30 25.839825
4 1 farm B pear 5 1 1 1.733120
5 1 farm A durian 10 2 2 13.506135
6 1 farm B durian 15 3 3 11.690910
7 2 farm A apple 10 2 10 7.678611
8 2 farm B apple 20 4 20 22.119560
9 2 farm A pear 30 6 30 31.488002
10 2 farm B pear 5 1 1 5.366725
11 2 farm A durian 10 2 2 6.333747
12 2 farm B durian 15 3 3 13.918085
13 3 farm A apple 10 2 10 10.387194
14 3 farm B apple 20 4 20 21.086388
15 3 farm A pear 30 6 30 34.076926
16 3 farm B pear 5 1 1 6.159415
17 3 farm A durian 10 2 2 8.322902
18 3 farm B durian 15 3 3 9.458085
Note also that you can also define the input values in a nicely indexed way; e.g. if you define
numTrees2 <- array(0, dim=c(length(farms), length(crops)), dimnames=tree_dimList);
numTrees2['Farm A','apple'] = 200;
# etc
Then the way that c(numTrees)
orders its outputs matches expand.grid
, so you can pass constList = list(numTrees=c(numTrees2)
to distSim
.
来源:https://stackoverflow.com/questions/21490788/how-to-structure-an-easily-extensible-monte-carlo-model-in-r