CPU cache critical stride test giving unexpected results based on access type

后端未结

关注

 3  1305

野趣味 2021-02-02 15:40

Inspired by this recent question on SO and the answers given, which made me feel very ignorant, I decided I\'d spend some time to learn more about CPU caching a

3条回答

温柔的废话 (楼主)

2021-02-02 16:12

I also tried to step on stride rake once I read about cache mechanics in Optimization C++ by Agner Frog.

According to this books your second assumption is wrong, because memory address always belong to a specific cache line in a set. So every byte could be cached by the same cache lines in different "ways".

My first attempt to do this in user space failed. (I have CPU i5-4200).

Total size 128kb cache set size 8kb => time 18ms; 568000000
Total size 256kb cache set size 16kb => time 13ms; 120000000
Total size 384kb cache set size 24kb => time 12ms; 688000000
Total size 512kb cache set size 32kb => time 14ms; 240000000

$ g++ -std=c++11 -march=native -O3 hit-stride.cpp -o hit-stride

#include
#include

using namespace std::chrono;
using namespace std;

int main(int argc, char** argv) {
  unsigned int cacheSetSizes[] = { 8, 16, 24, 32 };
  const int ways = 8;

  for (unsigned int i = 0; i < sizeof(cacheSetSizes) / sizeof(int); ++i) {
    const unsigned int setSize = cacheSetSizes[i] * 1024;
    const unsigned int size = setSize * ways * 2;
    char* buffer = new char[size];
    for (int k = 0; k < size; ++k) {
      buffer[k] = k % 127;
    }
    const auto started = steady_clock::now();
    int sum = 0;
    for (int j = 0; j < 1000000; ++j) {
      for (int k = 0; k < size; k += setSize) {
        sum += buffer[k];
      }
    }
    const auto ended = steady_clock::now();
    cout << "Total size " << (size >> 10) << "kb cache set size " << cacheSetSizes[i]
         << "kb => time " << duration_cast(ended - started).count()
         << "ms; " << sum << endl;
    delete buffer;
  }
  return 0;
}

The "same" code wrapped into a kernel module looks like hits L2: I realized that I need to make memory physically contiguous. It's only possible to do in the kernel mode. My L1 cache size 32kb. In the test I walk over memory range longer that number of ways (8) with step equal to cache size. So I get noticeable slowdown on 32kb (last line).

Apr 26 11:13:54 diehard kernel: [24992.943076] Memory 512 kb is allocated
Apr 26 11:13:54 diehard kernel: [24992.969814] Duration  23524369 ns for cache set size         8 kb; sum = 568000000
Apr 26 11:13:54 diehard kernel: [24992.990886] Duration  21076036 ns for cache set size        16 kb; sum = 120000000
Apr 26 11:13:54 diehard kernel: [24993.013832] Duration  22950526 ns for cache set size        24 kb; sum = 688000000
Apr 26 11:13:54 diehard kernel: [24993.045584] Duration  31760368 ns for cache set size        32 kb; sum = 240000000

$ make && sudo insmod hello.ko && sleep 1 && tail -n 100 /var/log/syslog

#include    /* Needed by all modules */
#include    /* Needed for KERN_INFO */
#include     

static unsigned long p = 0;
static struct timespec started, ended;
static unsigned int cacheSetSizes[] = { 8, 16, 24, 32 };
static const u32 ways = 8;
static const u32 m = 2;
static char* buffer;
static unsigned int setSize;
static unsigned int size;
static unsigned int i, j, k;
static int sum;

int init_module(void) {
  s64 st, en, duration;
  u32 max = 1*1024*1024;
  printk(KERN_INFO "Hello world 1.\n");
  p = __get_free_pages(GFP_DMA, get_order(max));
  printk(KERN_INFO "Memory %u kb is allocated\n", ways * m * 32);
  buffer = (char*) p;

  for (k = 0; k < max; ++k) {
    buffer[k] = k % 127;
  }

  for (i = 0; i < sizeof(cacheSetSizes) / sizeof(int); ++i) {
    setSize = cacheSetSizes[i] * 1024;
    size = setSize * ways * m;
    if (size > max) {
      printk(KERN_INFO "size %u is more that %u", size, max);
      return 0;
    }
    getnstimeofday(&started);
    st = timespec_to_ns(&started);

    sum = 0;
    for (j = 0; j < 1000000; ++j) {
      for (k = 0; k < size; k += setSize) {
        sum += buffer[k];
      }
    }

    getnstimeofday(&ended);
    en = timespec_to_ns(&ended);
    duration = en - st;
    printk(KERN_INFO "Duration %9lld ns for cache set size %9u kb; sum = %9d\n",
           duration, cacheSetSizes[i], sum);
  }
  return 0;
}

void cleanup_module(void) {
  printk(KERN_INFO "Goodbye world 1.\n");
  free_pages(p, get_order(1*1024*1024));
  printk(KERN_INFO "Memory is free\n");
}

0 讨论(0)

查看其它3个回答