问题

Hi I am new to vectorizing functions in R. I have a code similar the following.

library(truncnorm)
library(microbenchmark)

num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)

df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0

Trial_func=function(df1){
  for (i in 1:nrow(df1)){
    if (df1[i,"Obs_Type"] ==1){
      #If Type == 1; then a=-Inf, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if (df1[i,"Obs_Type"] ==2){
      #If Type == 2; then a=-10, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if(df1[i,"Obs_Type"] ==3){
      #If Type == 3; then a=Lower_bound, b = Inf
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
    } else {
      #If Type == 3; then a=Lower_bound, b = 10
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
    }
  }
  return(df1)
}

#Benchmarking
mbm=microbenchmark(Trial_func(df1=df1),times = 10)
summary(mbm)
#For obtaining the new data
New_data=Trial_func(df1=df1)

In the above I am creating a dataframe called df1 initially. I then create a function which takes a dataset (df1). Each observation in the dataset (df1), can be one of four types. This is given by df1$Obs_Type. What I want to do is that based on the Obs_Type, I want to draw values from a truncated normal distribution with a given upper and lower points.

The rules are:

a) When Obs_Type =1; a=-Inf, b = Upper_bound value of observation i.

b) When Obs_Type =2; a=-10, b = Upper_bound value of observation i.

c) When Obs_Type =3; a=Upper_bound value of observation i, b = Inf.

d) When Obs_Type =4; a=Upper_bound value of observation i, b = 10.

Where a = lower bound, b = upper bound; Additionally, mean of observation i is given by df1$mean and sd = 1.

I am not familiar with vectorizing and was wondering if someone could help me with this a bit. I tried looking at some other examples on SO (for eg. this) but could not figure out what to do when I have multiple conditions.

My original dataset has about 10 million observations and other additional conditions (eg. instead of 4 types, my data has 16 types and the means changes with each type), but I used a simpler example here.

Please let me know if any part of the question requires any additional clarification.

回答1:

Here is a vectorized way. It creates logical vectors i1, i2, i3 and i4 corresponding to the 4 conditions. Then it assigns the new values to the positions indexed by them.

Trial_func2 <- function(df1){
  i1 <- df1[["Obs_Type"]] == 1
  i2 <- df1[["Obs_Type"]] == 2
  i3 <- df1[["Obs_Type"]] == 3
  i4 <- df1[["Obs_Type"]] == 4

  #If Type == 1; then a=-Inf, b = Upper_Bound
  df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf, 
                                      b = df1[i1, "Upper_bound"], 
                                      mean = df1[i1, "mean"], sd = 1)
  #If Type == 2; then a=-10, b = Upper_Bound
  df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
                                      b = df1[i2 , "Upper_bound"],
                                      mean = df1[i2, "mean"], sd = 1)
  #If Type == 3; then a=Lower_bound, b = Inf
  df1[i3,"draw_value"] <- rtruncnorm(sum(i3), 
                                     a = df1[i3, "Lower_bound"],
                                     b = Inf, mean = df1[i3, "mean"], 
                                     sd = 1)
  #If Type == 3; then a=Lower_bound, b = 10
  df1[i4, "draw_value"] <- rtruncnorm(sum(i4), 
                                      a = df1[i4, "Lower_bound"],
                                      b = 10,
                                      mean = df1[i4,"mean"],
                                      sd = 1)
  df1
}

In the speed test I have named @Dave2e's answer Trial_func3.

mbm <- microbenchmark(
  loop = Trial_func(df1 = df1),
  vect = Trial_func2(df1 = df1),
  cwhen = Trial_func3(df1 = df1),
  times = 10)

print(mbm, order = "median")
#Unit: milliseconds
#  expr         min          lq       mean      median          uq         max neval cld
#  vect    4.349444    4.371169    4.40920    4.401384    4.450024    4.487453    10  a 
# cwhen   13.458946   13.484247   14.16045   13.528792   13.787951   19.363104    10  a 
#  loop 2125.665690 2138.792497 2211.20887 2157.185408 2201.391083 2453.658767    10   b

回答2:

The case_when function in the dplyr package is handy to vectorize this type of multiple if else statements.

Instead of passing the individual values to the "if" statements one can pass the entire vector for a very substantial performance improvement.
Also the case_when improves the readability of the script.

library(dplyr)

Trial_func <- function(df1) {
  df1[,"draw_value"] <- case_when(
    df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
    df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
  )
  df1
}

Trial_func(df1)

回答3:

library(truncnorm)
library(microbenchmark)

num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)

df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0

###########################
# Your example
###########################

Trial_func=function(df1, seed=NULL){
  if (!is.null(seed)) set.seed(seed)
  for (i in 1:nrow(df1)){
    if (df1[i,"Obs_Type"] ==1){
      #If Type == 1; then a=-Inf, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if (df1[i,"Obs_Type"] ==2){
      #If Type == 2; then a=-10, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if(df1[i,"Obs_Type"] ==3){
      #If Type == 3; then a=Lower_bound, b = Inf
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
    } else {
      #If Type == 3; then a=Lower_bound, b = 10
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
    }
  }
  return(df1)
}

#############################
# Vectorized version
#############################

# for each row-elements define a function
truncated_normal <- function(obs_type, lower_bound, upper_bound, mean, sd) {
    if (obs_type == 1) {
      rtruncnorm(1, a=-Inf, b=upper_bound, mean=mean, sd=sd)
    } else if (obs_type == 2){
      rtruncnorm(1, a=-10, b=upper_bound, mean=mean, sd=sd)
    } else if(obs_type == 3){
      rtruncnorm(1, a=lower_bound, b=Inf, mean=mean, sd=sd)
    } else {
      rtruncnorm(1, a=lower_bound, b=10, mean=mean, sd=sd)
    }
}
# vectorize it
truncated_normal <- Vectorize(truncated_normal)

Trial_func_vec <- function(df, res_col="draw_value", seed=NULL) {
    if (!is.null(seed)) set.seed(seed)
    df[, res_col] <- truncated_normal(
                         obs_type=df[, "Obs_Type"],
                         lower_bound=df[, "Lower_bound"],
                         upper_bound=df[, "Upper_bound"],
                         mean=df[,"mean"],
                         sd=1)
    df
}
#Benchmarking
set.seed(1)
mbm=microbenchmark(Trial_func(df=df1),times = 10)
summary(mbm)

set.seed(1)
mbm_vec=microbenchmark(Trial_func_vec(df=df1),times = 10)
summary(mbm_vec)

## vectorization roughly 3x faster!

#For obtaining the new data
set.seed(1) # important so that randomization is reproducible
new_data=Trial_func(df=df1)
set.seed(1) # important so that randomization is reproducible
vec_data=Trial_func_vec(df=df1)
# since in both cases random number generator is provoked
# exactly once per row in the order of the rows,
# resulting df should be absolutely identical.

all(new_data == vec_data) ## TRUE! They are absolutely identical.
# proving that your code does - in principle - exactly the same
# like my vectorized code

The Benchmarking results

# @Rui Barradas' function
Trial_func2 <- function(df1){
  i1 <- df1[["Obs_Type"]] == 1
  i2 <- df1[["Obs_Type"]] == 2
  i3 <- df1[["Obs_Type"]] == 3
  i4 <- df1[["Obs_Type"]] == 4

  #If Type == 1; then a=-Inf, b = Upper_Bound
  df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf, 
                                      b = df1[i1, "Upper_bound"], 
                                      mean = df1[i1, "mean"], sd = 1)
  #If Type == 2; then a=-10, b = Upper_Bound
  df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
                                      b = df1[i2 , "Upper_bound"],
                                      mean = df1[i2, "mean"], sd = 1)
  #If Type == 3; then a=Lower_bound, b = Inf
  df1[i3,"draw_value"] <- rtruncnorm(sum(i3), 
                                     a = df1[i3, "Lower_bound"],
                                     b = Inf, mean = df1[i3, "mean"], 
                                     sd = 1)
  #If Type == 3; then a=Lower_bound, b = 10
  df1[i4, "draw_value"] <- rtruncnorm(sum(i4), 
                                      a = df1[i4, "Lower_bound"],
                                      b = 10,
                                      mean = df1[i4,"mean"],
                                      sd = 1)
  df1
}

# @Dave2e's function
library(dplyr)

Trial_func_dplyr <- function(df1) {
  df1[,"draw_value"] <- case_when(
    df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
    df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
  )
  df1
}

#Benchmarking
set.seed(1)
mbm <- microbenchmark(
    loop = Trial_func(df1=df1),
    ruy_vect = Trial_func2(df1=df1),
    my_vect = Trial_func_vec(df=df1),
    cwhen = Trial_func_dplyr(df1=df1),
    times=10)


print(mbm, order = "median")

# > print(mbm, order = "median")
# Unit: milliseconds
#      expr         min          lq       mean      median         uq        max
#  ruy_vect    7.583821    7.879766   11.59954    8.815835   10.33289   36.60468
#     cwhen   22.563190   23.103670   25.13804   23.965722   26.49628   30.63777
#   my_vect 1326.771297 1373.415302 1413.75328 1410.995177 1484.28449 1506.11063
#      loop 4149.424632 4269.475169 4486.41376 4423.527566 4742.96651 4911.31992
#  neval cld
#     10 a  
#     10 a  
#     10  b 
#     10   c

# @Rui's vectorize version wins by 3 magnitudes or order!!

来源：https://stackoverflow.com/questions/59912424/r-how-to-vectorize-a-function-with-multiple-if-else-conditions

标签

vectorization

data-cleaning

R how to vectorize a function with multiple if else conditions

问题

回答1:

回答2:

回答3:

The Benchmarking results