Bandits with Rcpp


Question


This is a second attempt at correcting my earlier version that lives here. I am translating the epsilon-greedy algorithm for multi-armed bandits.

A summary of the code is as follows. We have a set of arms, each of which pays out a reward with a pre-defined probability. The goal is to show that by mostly pulling the arm with the best estimated reward, while pulling an arm at random a fraction epsilon of the time, we eventually converge on the best arm.

The original algorithm can be found here.
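For reference, the decision rule boils down to a few lines. Below is a minimal self-contained C++11 sketch of the same loop, independent of the Rcpp code that follows; the RNG, seed, and variable names are my own choices and not part of the original implementation:

#include <algorithm>
#include <iostream>
#include <random>
#include <vector>

int main() {
  std::mt19937 rng(42);                               // fixed seed, chosen arbitrarily
  std::uniform_real_distribution<double> unif(0.0, 1.0);

  std::vector<double> p = {0.1, 0.1, 0.1, 0.1, 0.9};  // true payout probabilities
  std::vector<int> counts(p.size(), 0);               // pulls per arm
  std::vector<double> values(p.size(), 0.0);          // running average reward per arm
  double epsilon = 0.1;

  for (int t = 0; t < 10000; ++t) {
    std::size_t arm;
    if (unif(rng) < epsilon) {
      // explore: pick any arm uniformly at random
      std::uniform_int_distribution<std::size_t> pick(0, p.size() - 1);
      arm = pick(rng);
    } else {
      // exploit: pick the arm with the best estimate so far
      arm = std::max_element(values.begin(), values.end()) - values.begin();
    }

    double reward = (unif(rng) < p[arm]) ? 1.0 : 0.0;  // Bernoulli draw

    // incremental running mean -- note the floating-point division
    counts[arm] += 1;
    values[arm] += (reward - values[arm]) / counts[arm];
  }

  for (std::size_t i = 0; i < p.size(); ++i)
    std::cout << "arm " << i << ": " << values[i]
              << " (" << counts[i] << " pulls)\n";
}

With these arm probabilities, the pull counts should concentrate on the last (0.9) arm.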

#define ARMA_64BIT_WORD
#include <RcppArmadillo.h>

using namespace Rcpp;

// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::plugins(cpp11)]]

struct EpsilonGreedy {
  double epsilon;     // probability of exploring a random arm
  arma::uvec counts;  // number of times each arm has been pulled
  arma::vec values;   // running average reward per arm
};

// Index of the arm with the highest estimated value
int index_max(arma::vec& v) {
  return v.index_max();
}

// Index of an arm chosen uniformly at random
int index_rand(arma::vec& v) {
  int s = arma::randi<int>(arma::distr_param(0, v.n_elem-1));
  return s;
}

// Exploit the best arm with probability 1 - epsilon; otherwise explore at random
int select_arm(EpsilonGreedy& algo) {
  if (R::runif(0, 1) > algo.epsilon) {
    return index_max(algo.values);
  } else {
    return index_rand(algo.values);
  }
}

// Record the pull and update the chosen arm's running average reward
void update(EpsilonGreedy& algo, int chosen_arm, double reward) {
  algo.counts[chosen_arm] += 1;

  int n = algo.counts[chosen_arm];
  double value = algo.values[chosen_arm];

  algo.values[chosen_arm] = ((n-1)/n) * value + (1/n) * reward;
}

struct BernoulliArm {
  double p;  // probability that the arm pays out a reward of 1
};

// Draw from a Bernoulli arm: returns 1 with probability p, 0 otherwise
int draw(BernoulliArm arm) {
  if (R::runif(0, 1) > arm.p) {
    return 0;
  } else {
    return 1;
  }
}

// [[Rcpp::export]]
DataFrame test_algorithm(double epsilon, std::vector<double>& means,
                         int n_sims, int horizon) {

  std::vector<BernoulliArm> arms;

  for (auto& mu : means) {
    BernoulliArm b = {mu};
    arms.push_back(b);
  }

  std::vector<int> sim_num, time, chosen_arms;
  std::vector<double> rewards;

  for (int sim = 1; sim <= n_sims; ++sim) {

    arma::uvec counts(means.size(), arma::fill::zeros);
    arma::vec values(means.size(), arma::fill::zeros); 

    EpsilonGreedy algo = {epsilon, counts, values};

    for (int t = 1; t <= horizon; ++t) {
      int chosen_arm = select_arm(algo);
      double reward = draw(arms[chosen_arm]);
      update(algo, chosen_arm, reward);

      sim_num.push_back(sim);
      time.push_back(t);
      chosen_arms.push_back(chosen_arm);
      rewards.push_back(reward);
    }
  }

  DataFrame results = DataFrame::create(Named("sim_num") = sim_num,
                                        Named("time") = time,
                                        Named("chosen_arm") = chosen_arms,
                                        Named("reward") = rewards);

  return results;
}


/*** R

library(tidyverse)
means <- c(0.1, 0.1, 0.1, 0.1, 0.9)

total_results <- data.frame(sim_num = integer(), time = integer(), 
                            chosen_arm = integer(),
                            reward = numeric(), epsilon = numeric())

for (epsilon in seq(0.1, 0.5, length.out = 5)) {

  cat("Starting with ", epsilon, " at: ", format(Sys.time(), "%H:%M"), "\n")

  results <- test_algorithm(epsilon, means, 5000, 250)
  results$epsilon <- epsilon

  total_results <- rbind(total_results, results)

 }

avg_reward <- total_results %>% group_by(time, epsilon) %>%
                            summarize(avg_reward = mean(reward))

dev.new()

ggplot(avg_reward) +
  geom_line(aes(x = time, y = avg_reward,
            group = epsilon, color = epsilon), size = 1) +
  scale_color_gradient(low = "grey", high = "black") +
  labs(x = "Time",
       y = "Average reward",
       title = "Performance of the Epsilon-Greedy Algorithm",
       color = "epsilon\n")

The above code returns the following plot:

This plot is just wrong! However, I am unable to zero in on the logical flaw in the code. Where am I going off track?

Edit: As per the comments, the following is the expected plot:


Answer 1:


In this piece of code:

int n = algo.counts[chosen_arm];
//...
algo.values[chosen_arm] = ((n-1)/n) * value + (1/n) * reward;

n is declared as an int, so (n-1)/n and 1/n are evaluated with integer division: (n-1)/n is always 0 (since n-1 < n), and 1/n is 0 for every n > 1. As a result the value estimates are zeroed out instead of averaged. You can fix this by changing the 1 to 1.0, a floating-point constant, which forces both expressions to be evaluated as double:

algo.values[chosen_arm] = ((n-1.0)/n) * value + (1.0/n) * reward;
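To see the truncation in isolation, here is a standalone snippet (not part of the original answer):

#include <iostream>

int main() {
  int n = 5;
  std::cout << (n - 1) / n << "\n";    // prints 0: integer division truncates
  std::cout << 1 / n << "\n";          // prints 0
  std::cout << (n - 1.0) / n << "\n";  // prints 0.8: n is promoted to double
  std::cout << 1.0 / n << "\n";        // prints 0.2
}

Declaring n as a double, or casting it at the point of use, works just as well.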


Source: https://stackoverflow.com/questions/49727727/bandits-with-rcpp
