CPU cache critical stride test giving unexpected results based on access type

后端 未结 3 1305
野趣味
野趣味 2021-02-02 15:40

Inspired by this recent question on SO and the answers given, which made me feel very ignorant, I decided I'd spend some time to learn more about CPU caching a

3条回答
  •  温柔的废话
    2021-02-02 16:12

    I also stepped on the stride rake once, after reading about cache mechanics in "Optimizing software in C++" by Agner Fog.

    According to this book, your second assumption is wrong, because a memory address always belongs to a specific cache line in a set. So every byte could be cached by the same cache lines in different "ways".

    My first attempt to do this in user space failed. (I have CPU i5-4200).

    Total size 128kb cache set size 8kb => time 18ms; 568000000
    Total size 256kb cache set size 16kb => time 13ms; 120000000
    Total size 384kb cache set size 24kb => time 12ms; 688000000
    Total size 512kb cache set size 32kb => time 14ms; 240000000
    

    $ g++ -std=c++11 -march=native -O3 hit-stride.cpp -o hit-stride

    #include <chrono>
    #include <iostream>

    using namespace std::chrono;
    using namespace std;

    // User-space stride benchmark.
    //
    // For each candidate "cache set size" (in KB) a buffer of
    // setSize * ways * 2 bytes is allocated and filled, then one byte every
    // setSize bytes is read, one million times over. The elapsed time in
    // milliseconds and the accumulated sum are printed; printing the sum keeps
    // the read loop observable so it is not trivially dead code.
    int main(int argc, char** argv) {
      unsigned int cacheSetSizes[] = { 8, 16, 24, 32 };
      const int ways = 8;

      // Divide by the element itself so the count stays right if the array's
      // element type ever changes.
      const unsigned int count = sizeof(cacheSetSizes) / sizeof(cacheSetSizes[0]);

      for (unsigned int i = 0; i < count; ++i) {
        const unsigned int setSize = cacheSetSizes[i] * 1024;
        const unsigned int size = setSize * ways * 2;
        char* buffer = new char[size];
        // Unsigned index: `size` is unsigned, so avoid a signed/unsigned compare.
        for (unsigned int k = 0; k < size; ++k) {
          buffer[k] = k % 127;
        }
        const auto started = steady_clock::now();
        int sum = 0;
        for (int j = 0; j < 1000000; ++j) {
          // Touch one byte per stride of setSize bytes: 2 * ways reads per pass.
          for (unsigned int k = 0; k < size; k += setSize) {
            sum += buffer[k];
          }
        }
        const auto ended = steady_clock::now();
        cout << "Total size " << (size >> 10) << "kb cache set size " << cacheSetSizes[i]
             << "kb => time " << duration_cast<milliseconds>(ended - started).count()
             << "ms; " << sum << endl;
        // The buffer came from new[], so it must be released with delete[].
        delete[] buffer;
      }
      return 0;
    }
    

    The "same" code wrapped into a kernel module looks like it hits L2: I realized that I need to make the memory physically contiguous, which is only possible in kernel mode. My L1 cache size is 32kb. In the test I walk over a memory range longer than the number of ways (8) with a step equal to the cache size. So I get a noticeable slowdown at 32kb (last line).

    Apr 26 11:13:54 diehard kernel: [24992.943076] Memory 512 kb is allocated
    Apr 26 11:13:54 diehard kernel: [24992.969814] Duration  23524369 ns for cache set size         8 kb; sum = 568000000
    Apr 26 11:13:54 diehard kernel: [24992.990886] Duration  21076036 ns for cache set size        16 kb; sum = 120000000
    Apr 26 11:13:54 diehard kernel: [24993.013832] Duration  22950526 ns for cache set size        24 kb; sum = 688000000
    Apr 26 11:13:54 diehard kernel: [24993.045584] Duration  31760368 ns for cache set size        32 kb; sum = 240000000
    

    $ make && sudo insmod hello.ko && sleep 1 && tail -n 100 /var/log/syslog

    #include    /* Needed by all modules */
    #include    /* Needed for KERN_INFO */
    #include     
    
    static unsigned long p = 0;
    static struct timespec started, ended;
    static unsigned int cacheSetSizes[] = { 8, 16, 24, 32 };
    static const u32 ways = 8;
    static const u32 m = 2;
    static char* buffer;
    static unsigned int setSize;
    static unsigned int size;
    static unsigned int i, j, k;
    static int sum;
    
    int init_module(void) {
      s64 st, en, duration;
      u32 max = 1*1024*1024;
      printk(KERN_INFO "Hello world 1.\n");
      p = __get_free_pages(GFP_DMA, get_order(max));
      printk(KERN_INFO "Memory %u kb is allocated\n", ways * m * 32);
      buffer = (char*) p;
    
      for (k = 0; k < max; ++k) {
        buffer[k] = k % 127;
      }
    
      for (i = 0; i < sizeof(cacheSetSizes) / sizeof(int); ++i) {
        setSize = cacheSetSizes[i] * 1024;
        size = setSize * ways * m;
        if (size > max) {
          printk(KERN_INFO "size %u is more that %u", size, max);
          return 0;
        }
        getnstimeofday(&started);
        st = timespec_to_ns(&started);
    
        sum = 0;
        for (j = 0; j < 1000000; ++j) {
          for (k = 0; k < size; k += setSize) {
            sum += buffer[k];
          }
        }
    
        getnstimeofday(&ended);
        en = timespec_to_ns(&ended);
        duration = en - st;
        printk(KERN_INFO "Duration %9lld ns for cache set size %9u kb; sum = %9d\n",
               duration, cacheSetSizes[i], sum);
      }
      return 0;
    }
    
    /*
     * Module exit hook: logs a goodbye message, releases the pages held in p
     * using the order that corresponds to 1 MB, and logs that the memory was
     * released.
     */
    void cleanup_module(void) {
      const int order = get_order(1*1024*1024);

      printk(KERN_INFO "Goodbye world 1.\n");
      free_pages(p, order);
      printk(KERN_INFO "Memory is free\n");
    }
    

提交回复
热议问题