Inspired by this recent question on SO and the answers given, which made me feel very ignorant, I decided I'd spend some time to learn more about CPU caching a
I also stepped on the stride rake after reading about cache mechanics in "Optimizing software in C++" by Agner Fog.
According to this book, your second assumption is wrong, because a memory address always belongs to a specific cache line within a set. So every byte can be cached only by the same cache lines, in different "ways".
My first attempt to do this in user space failed. (I have an i5-4200 CPU.)
Total size 128kb cache set size 8kb => time 18ms; 568000000
Total size 256kb cache set size 16kb => time 13ms; 120000000
Total size 384kb cache set size 24kb => time 12ms; 688000000
Total size 512kb cache set size 32kb => time 14ms; 240000000
$ g++ -std=c++11 -march=native -O3 hit-stride.cpp -o hit-stride
using namespace std::chrono;
using namespace std;
// Cache-stride micro-benchmark (user space).
//
// For each candidate stride (8, 16, 24 and 32 KiB) it allocates a buffer of
// stride * ways * 2 bytes, fills it with a deterministic pattern, and then
// reads one byte per stride across the whole buffer one million times.
// It prints the elapsed time in milliseconds together with the accumulated
// sum; printing the sum keeps the reads observable and lets runs be compared.
//
// Command-line arguments are accepted but unused. Always returns 0.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    unsigned int cacheSetSizes[] = { 8, 16, 24, 32 };  // strides to probe, in KiB
    const int ways = 8;
    const unsigned int setCount = sizeof(cacheSetSizes) / sizeof(cacheSetSizes[0]);

    for (unsigned int i = 0; i < setCount; ++i) {
        const unsigned int setSize = cacheSetSizes[i] * 1024;
        // Twice as many strided addresses as there are ways.
        const unsigned int size = setSize * ways * 2;

        char* buffer = new char[size];
        // Unsigned index: it is compared against the unsigned buffer size.
        for (unsigned int k = 0; k < size; ++k) {
            buffer[k] = static_cast<char>(k % 127);
        }

        const auto started = std::chrono::steady_clock::now();
        int sum = 0;
        for (int j = 0; j < 1000000; ++j) {
            for (unsigned int k = 0; k < size; k += setSize) {
                sum += buffer[k];
            }
        }
        const auto ended = std::chrono::steady_clock::now();

        std::cout << "Total size " << (size >> 10) << "kb cache set size " << cacheSetSizes[i]
                  << "kb => time "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(ended - started).count()
                  << "ms; " << sum << std::endl;

        // The buffer came from new[], so it must be released with delete[].
        delete[] buffer;
    }
    return 0;
}
The "same" code wrapped into a kernel module looks like it hits L2: I realized that I need to make the memory physically contiguous, which is only possible in kernel mode. My L1 cache size is 32kb. In the test I walk over a memory range longer than the number of ways (8) with a step equal to the cache size. So I get a noticeable slowdown at 32kb (last line).
Apr 26 11:13:54 diehard kernel: [24992.943076] Memory 512 kb is allocated
Apr 26 11:13:54 diehard kernel: [24992.969814] Duration 23524369 ns for cache set size 8 kb; sum = 568000000
Apr 26 11:13:54 diehard kernel: [24992.990886] Duration 21076036 ns for cache set size 16 kb; sum = 120000000
Apr 26 11:13:54 diehard kernel: [24993.013832] Duration 22950526 ns for cache set size 24 kb; sum = 688000000
Apr 26 11:13:54 diehard kernel: [24993.045584] Duration 31760368 ns for cache set size 32 kb; sum = 240000000
$ make && sudo insmod hello.ko && sleep 1 && tail -n 100 /var/log/syslog
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
#include <linux/errno.h>  /* Needed for ENOMEM */
#include <linux/gfp.h>    /* Needed for __get_free_pages, free_pages and GFP flags */
#include <linux/time.h>   /* Needed for struct timespec, getnstimeofday, timespec_to_ns */
/* Address returned by __get_free_pages; 0 until init_module has allocated. */
static unsigned long p = 0;
/* Timestamps converted to nanoseconds around each measured loop. */
static struct timespec started, ended;
/* Strides to probe, in KiB; each is multiplied by 1024 before use. */
static unsigned int cacheSetSizes[] = { 8, 16, 24, 32 };
/* Multiplier used when sizing the walked range (setSize * ways * m). */
static const u32 ways = 8;
/* Second multiplier for the walked range: the walk touches ways * m addresses. */
static const u32 m = 2;
/* Byte view of the allocated pages (p cast to char*). */
static char* buffer;
/* Current stride in bytes. */
static unsigned int setSize;
/* Number of bytes walked for the current stride. */
static unsigned int size;
/* Loop counters: i = stride index, j = repetition, k = byte offset. */
static unsigned int i, j, k;
/* Running sum of the bytes read; printed with each result. */
static int sum;
int init_module(void) {
s64 st, en, duration;
u32 max = 1*1024*1024;
printk(KERN_INFO "Hello world 1.\n");
p = __get_free_pages(GFP_DMA, get_order(max));
printk(KERN_INFO "Memory %u kb is allocated\n", ways * m * 32);
buffer = (char*) p;
for (k = 0; k < max; ++k) {
buffer[k] = k % 127;
for (i = 0; i < sizeof(cacheSetSizes) / sizeof(int); ++i) {
setSize = cacheSetSizes[i] * 1024;
size = setSize * ways * m;
if (size > max) {
printk(KERN_INFO "size %u is more that %u", size, max);
return 0;
st = timespec_to_ns(&started);
sum = 0;
for (j = 0; j < 1000000; ++j) {
for (k = 0; k < size; k += setSize) {
sum += buffer[k];
en = timespec_to_ns(&ended);
duration = en - st;
printk(KERN_INFO "Duration %9lld ns for cache set size %9u kb; sum = %9d\n",
duration, cacheSetSizes[i], sum);
return 0;
/*
 * Module exit point: releases the pages allocated by init_module and logs
 * the unload. The order passed to free_pages must match the one used for
 * the allocation (get_order of 1 MiB).
 */
void cleanup_module(void) {
    printk(KERN_INFO "Goodbye world 1.\n");
    if (p) {
        free_pages(p, get_order(1*1024*1024));
        p = 0; /* do not keep a stale address around */
    }
    printk(KERN_INFO "Memory is free\n");
}