I read an article (1.5 years old http://www.drdobbs.com/parallel/cache-friendly-code-solving-manycores-ne/240012736) which talks about cache performance and size of data. They
Some results (OSX, Sandy Bridge):
Size=1 ITERATIONS=1073741824, TIME=2416.06 ms
Size=2 ITERATIONS=536870912, TIME=1885.46 ms
Size=4 ITERATIONS=268435456, TIME=1782.92 ms
Size=16 ITERATIONS=67108864, TIME=2023.71 ms
Size=32 ITERATIONS=33554432, TIME=2184.99 ms
Size=64 ITERATIONS=16777216, TIME=2464.09 ms
Size=256 ITERATIONS=4194304, TIME=2358.31 ms
Size=1024 ITERATIONS=1048576, TIME=2333.77 ms
Size=2048 ITERATIONS=524288, TIME=2340.16 ms
Size=4096 ITERATIONS=262144, TIME=2349.97 ms
Size=8192 ITERATIONS=131072, TIME=2346.96 ms
Size=16384 ITERATIONS=65536, TIME=2350.3 ms
Size=32768 ITERATIONS=32768, TIME=2348.71 ms
Size=65536 ITERATIONS=16384, TIME=2355.28 ms
Size=262144 ITERATIONS=4096, TIME=2358.97 ms
Size=524288 ITERATIONS=2048, TIME=2476.46 ms
Size=1048576 ITERATIONS=1024, TIME=2429.07 ms
Size=2097152 ITERATIONS=512, TIME=2427.09 ms
Size=4194304 ITERATIONS=256, TIME=2443.42 ms
Size=8388608 ITERATIONS=128, TIME=2435.54 ms
Size=33554432 ITERATIONS=32, TIME=2389.08 ms
Size=134217728 ITERATIONS=8, TIME=2444.43 ms
Size=536870912 ITERATIONS=2, TIME=2600.91 ms
Size=1 ITERATIONS=1073741824, TIME=2197.12 ms
Size=2 ITERATIONS=536870912, TIME=996.409 ms
Size=4 ITERATIONS=268435456, TIME=606.252 ms
Size=16 ITERATIONS=67108864, TIME=306.904 ms
Size=32 ITERATIONS=33554432, TIME=897.692 ms
Size=64 ITERATIONS=16777216, TIME=847.794 ms
Size=256 ITERATIONS=4194304, TIME=802.136 ms
Size=1024 ITERATIONS=1048576, TIME=761.971 ms
Size=2048 ITERATIONS=524288, TIME=760.136 ms
Size=4096 ITERATIONS=262144, TIME=759.149 ms
Size=8192 ITERATIONS=131072, TIME=749.881 ms
Size=16384 ITERATIONS=65536, TIME=756.672 ms
Size=32768 ITERATIONS=32768, TIME=759.565 ms
Size=65536 ITERATIONS=16384, TIME=754.81 ms
Size=262144 ITERATIONS=4096, TIME=745.899 ms
Size=524288 ITERATIONS=2048, TIME=749.527 ms
Size=1048576 ITERATIONS=1024, TIME=758.009 ms
Size=2097152 ITERATIONS=512, TIME=776.671 ms
Size=4194304 ITERATIONS=256, TIME=778.963 ms
Size=8388608 ITERATIONS=128, TIME=783.191 ms
Size=33554432 ITERATIONS=32, TIME=770.603 ms
Size=134217728 ITERATIONS=8, TIME=785.703 ms
Size=536870912 ITERATIONS=2, TIME=911.875 ms
(Note how the first one is really slower, I feel like there may be a mis-speculation somewhere around load-store forwarding...)
Interestingly turning the optimizations on and removing the volatile shows a somehow nicer curve:
Size=1 ITERATIONS=1073741824, TIME=0 ms
Size=2 ITERATIONS=536870912, TIME=0 ms
Size=4 ITERATIONS=268435456, TIME=0 ms
Size=16 ITERATIONS=67108864, TIME=0.001 ms
Size=32 ITERATIONS=33554432, TIME=125.581 ms
Size=64 ITERATIONS=16777216, TIME=140.654 ms
Size=256 ITERATIONS=4194304, TIME=217.559 ms
Size=1024 ITERATIONS=1048576, TIME=168.155 ms
Size=2048 ITERATIONS=524288, TIME=159.031 ms
Size=4096 ITERATIONS=262144, TIME=154.373 ms
Size=8192 ITERATIONS=131072, TIME=153.858 ms
Size=16384 ITERATIONS=65536, TIME=156.819 ms
Size=32768 ITERATIONS=32768, TIME=156.505 ms
Size=65536 ITERATIONS=16384, TIME=156.921 ms
Size=262144 ITERATIONS=4096, TIME=215.911 ms
Size=524288 ITERATIONS=2048, TIME=220.298 ms
Size=1048576 ITERATIONS=1024, TIME=235.648 ms
Size=2097152 ITERATIONS=512, TIME=320.284 ms
Size=4194304 ITERATIONS=256, TIME=409.433 ms
Size=8388608 ITERATIONS=128, TIME=431.743 ms
Size=33554432 ITERATIONS=32, TIME=429.436 ms
Size=134217728 ITERATIONS=8, TIME=430.052 ms
Size=536870912 ITERATIONS=2, TIME=535.773 ms
To help anyone reproduce the "issue", here is some standard (I hope) C++ code:
#include
#include
#include
#include
#include
template
void test_body(volatile int *array) {
for (int i = 0; i < ITERATIONS; i++)
{
for (int x = 0; x < SIZE; x++)
{
array[x]++;
}
}
}
template
static void test_function()
{
static_assert(SIZE*ITERATIONS == 1024*1024*1024, "SIZE MISMATCH");
std::unique_ptr array { new int[SIZE] };
// Warmup
test_body(array.get());
auto start = std::chrono::steady_clock::now();
test_body(array.get());
auto end = std::chrono::steady_clock::now();
auto diff = end - start;
std::cout << "Size=" << SIZE << " ITERATIONS=" << ITERATIONS << ", TIME=" << std::chrono::duration (diff).count() << " ms" << std::endl;
}
int main()
{
enum { eIterations= 1024*1024*1024};
test_function<1, eIterations>();
test_function<2, eIterations/2>();
test_function<4, eIterations/4>();
test_function<16, eIterations/16>();
test_function<32, eIterations/ 32>();
test_function<64, eIterations/ 64>();
test_function<256, eIterations/ 256>();
test_function<1024, eIterations/ 1024>();
test_function<2048, eIterations/ 2048>();
test_function<4096, eIterations/ 4096>();
test_function<8192, eIterations/ 8192>();
test_function<16384, eIterations/ 16384>();
test_function<32768, eIterations/ 32768>();
test_function<65536, eIterations/ 65536>();
test_function<262144, eIterations/ 262144>();
test_function<524288, eIterations/ 524288>();
test_function<1048576, eIterations/ 1048576>();
test_function<2097152, eIterations/ 2097152>();
test_function<4194304, eIterations/ 4194304>();
test_function<8388608, eIterations/ 8388608>();
test_function<33554432, eIterations/ 33554432>();
test_function<134217728, eIterations/ 134217728>();
test_function<536870912, eIterations/ 536870912>();
}