问题
Hi I am new to vectorizing functions in R. I have a code similar the following.
library(truncnorm)
library(microbenchmark)
num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)
df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0
Trial_func=function(df1){
for (i in 1:nrow(df1)){
if (df1[i,"Obs_Type"] ==1){
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if (df1[i,"Obs_Type"] ==2){
#If Type == 2; then a=-10, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if(df1[i,"Obs_Type"] ==3){
#If Type == 3; then a=Lower_bound, b = Inf
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
} else {
#If Type == 3; then a=Lower_bound, b = 10
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
}
}
return(df1)
}
#Benchmarking
mbm=microbenchmark(Trial_func(df1=df1),times = 10)
summary(mbm)
#For obtaining the new data
New_data=Trial_func(df1=df1)
In the above I am creating a dataframe called df1 initially. I then create a function which takes a dataset (df1). Each observation in the dataset (df1), can be one of four types. This is given by df1$Obs_Type. What I want to do is that based on the Obs_Type, I want to draw values from a truncated normal distribution with a given upper and lower points.
The rules are:
a) When Obs_Type =1; a=-Inf, b = Upper_bound value of observation i.
b) When Obs_Type =2; a=-10, b = Upper_bound value of observation i.
c) When Obs_Type =3; a=Upper_bound value of observation i, b = Inf.
d) When Obs_Type =4; a=Upper_bound value of observation i, b = 10.
Where a = lower bound, b = upper bound; Additionally, mean of observation i is given by df1$mean and sd = 1.
I am not familiar with vectorizing and was wondering if someone could help me with this a bit. I tried looking at some other examples on SO (for eg. this) but could not figure out what to do when I have multiple conditions.
My original dataset has about 10 million observations and other additional conditions (eg. instead of 4 types, my data has 16 types and the means changes with each type), but I used a simpler example here.
Please let me know if any part of the question requires any additional clarification.
回答1:
Here is a vectorized way. It creates logical vectors i1
, i2
, i3
and i4
corresponding to the 4 conditions. Then it assigns the new values to the positions indexed by them.
Trial_func2 <- function(df1){
i1 <- df1[["Obs_Type"]] == 1
i2 <- df1[["Obs_Type"]] == 2
i3 <- df1[["Obs_Type"]] == 3
i4 <- df1[["Obs_Type"]] == 4
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf,
b = df1[i1, "Upper_bound"],
mean = df1[i1, "mean"], sd = 1)
#If Type == 2; then a=-10, b = Upper_Bound
df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
b = df1[i2 , "Upper_bound"],
mean = df1[i2, "mean"], sd = 1)
#If Type == 3; then a=Lower_bound, b = Inf
df1[i3,"draw_value"] <- rtruncnorm(sum(i3),
a = df1[i3, "Lower_bound"],
b = Inf, mean = df1[i3, "mean"],
sd = 1)
#If Type == 3; then a=Lower_bound, b = 10
df1[i4, "draw_value"] <- rtruncnorm(sum(i4),
a = df1[i4, "Lower_bound"],
b = 10,
mean = df1[i4,"mean"],
sd = 1)
df1
}
In the speed test I have named @Dave2e's answer Trial_func3
.
mbm <- microbenchmark(
loop = Trial_func(df1 = df1),
vect = Trial_func2(df1 = df1),
cwhen = Trial_func3(df1 = df1),
times = 10)
print(mbm, order = "median")
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# vect 4.349444 4.371169 4.40920 4.401384 4.450024 4.487453 10 a
# cwhen 13.458946 13.484247 14.16045 13.528792 13.787951 19.363104 10 a
# loop 2125.665690 2138.792497 2211.20887 2157.185408 2201.391083 2453.658767 10 b
回答2:
The case_when
function in the dplyr package is handy to vectorize this type of multiple if else
statements.
Instead of passing the individual values to the "if" statements one can pass the entire vector for a very substantial performance improvement.
Also the case_when
improves the readability of the script.
library(dplyr)
Trial_func <- function(df1) {
df1[,"draw_value"] <- case_when(
df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
)
df1
}
Trial_func(df1)
回答3:
library(truncnorm)
library(microbenchmark)
num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)
df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0
###########################
# Your example
###########################
Trial_func=function(df1, seed=NULL){
if (!is.null(seed)) set.seed(seed)
for (i in 1:nrow(df1)){
if (df1[i,"Obs_Type"] ==1){
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if (df1[i,"Obs_Type"] ==2){
#If Type == 2; then a=-10, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if(df1[i,"Obs_Type"] ==3){
#If Type == 3; then a=Lower_bound, b = Inf
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
} else {
#If Type == 3; then a=Lower_bound, b = 10
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
}
}
return(df1)
}
#############################
# Vectorized version
#############################
# for each row-elements define a function
truncated_normal <- function(obs_type, lower_bound, upper_bound, mean, sd) {
if (obs_type == 1) {
rtruncnorm(1, a=-Inf, b=upper_bound, mean=mean, sd=sd)
} else if (obs_type == 2){
rtruncnorm(1, a=-10, b=upper_bound, mean=mean, sd=sd)
} else if(obs_type == 3){
rtruncnorm(1, a=lower_bound, b=Inf, mean=mean, sd=sd)
} else {
rtruncnorm(1, a=lower_bound, b=10, mean=mean, sd=sd)
}
}
# vectorize it
truncated_normal <- Vectorize(truncated_normal)
Trial_func_vec <- function(df, res_col="draw_value", seed=NULL) {
if (!is.null(seed)) set.seed(seed)
df[, res_col] <- truncated_normal(
obs_type=df[, "Obs_Type"],
lower_bound=df[, "Lower_bound"],
upper_bound=df[, "Upper_bound"],
mean=df[,"mean"],
sd=1)
df
}
#Benchmarking
set.seed(1)
mbm=microbenchmark(Trial_func(df=df1),times = 10)
summary(mbm)
set.seed(1)
mbm_vec=microbenchmark(Trial_func_vec(df=df1),times = 10)
summary(mbm_vec)
## vectorization roughly 3x faster!
#For obtaining the new data
set.seed(1) # important so that randomization is reproducible
new_data=Trial_func(df=df1)
set.seed(1) # important so that randomization is reproducible
vec_data=Trial_func_vec(df=df1)
# since in both cases random number generator is provoked
# exactly once per row in the order of the rows,
# resulting df should be absolutely identical.
all(new_data == vec_data) ## TRUE! They are absolutely identical.
# proving that your code does - in principle - exactly the same
# like my vectorized code
The Benchmarking results
# @Rui Barradas' function
Trial_func2 <- function(df1){
i1 <- df1[["Obs_Type"]] == 1
i2 <- df1[["Obs_Type"]] == 2
i3 <- df1[["Obs_Type"]] == 3
i4 <- df1[["Obs_Type"]] == 4
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf,
b = df1[i1, "Upper_bound"],
mean = df1[i1, "mean"], sd = 1)
#If Type == 2; then a=-10, b = Upper_Bound
df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
b = df1[i2 , "Upper_bound"],
mean = df1[i2, "mean"], sd = 1)
#If Type == 3; then a=Lower_bound, b = Inf
df1[i3,"draw_value"] <- rtruncnorm(sum(i3),
a = df1[i3, "Lower_bound"],
b = Inf, mean = df1[i3, "mean"],
sd = 1)
#If Type == 3; then a=Lower_bound, b = 10
df1[i4, "draw_value"] <- rtruncnorm(sum(i4),
a = df1[i4, "Lower_bound"],
b = 10,
mean = df1[i4,"mean"],
sd = 1)
df1
}
# @Dave2e's function
library(dplyr)
Trial_func_dplyr <- function(df1) {
df1[,"draw_value"] <- case_when(
df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
)
df1
}
#Benchmarking
set.seed(1)
mbm <- microbenchmark(
loop = Trial_func(df1=df1),
ruy_vect = Trial_func2(df1=df1),
my_vect = Trial_func_vec(df=df1),
cwhen = Trial_func_dplyr(df1=df1),
times=10)
print(mbm, order = "median")
# > print(mbm, order = "median")
# Unit: milliseconds
# expr min lq mean median uq max
# ruy_vect 7.583821 7.879766 11.59954 8.815835 10.33289 36.60468
# cwhen 22.563190 23.103670 25.13804 23.965722 26.49628 30.63777
# my_vect 1326.771297 1373.415302 1413.75328 1410.995177 1484.28449 1506.11063
# loop 4149.424632 4269.475169 4486.41376 4423.527566 4742.96651 4911.31992
# neval cld
# 10 a
# 10 a
# 10 b
# 10 c
# @Rui's vectorize version wins by 3 magnitudes or order!!
来源:https://stackoverflow.com/questions/59912424/r-how-to-vectorize-a-function-with-multiple-if-else-conditions