C++ Cache performance odd behavior

前端 未结 4 595
天涯浪人
天涯浪人 2021-02-04 19:44

I read an article (1.5 years old http://www.drdobbs.com/parallel/cache-friendly-code-solving-manycores-ne/240012736) which talks about cache performance and size of data. They

4条回答
  •  爱一瞬间的悲伤
    2021-02-04 20:15

    Some results (OSX, Sandy Bridge):

    GCC -O0

    Size=1 ITERATIONS=1073741824, TIME=2416.06 ms
    Size=2 ITERATIONS=536870912, TIME=1885.46 ms
    Size=4 ITERATIONS=268435456, TIME=1782.92 ms
    Size=16 ITERATIONS=67108864, TIME=2023.71 ms
    Size=32 ITERATIONS=33554432, TIME=2184.99 ms
    Size=64 ITERATIONS=16777216, TIME=2464.09 ms
    Size=256 ITERATIONS=4194304, TIME=2358.31 ms
    Size=1024 ITERATIONS=1048576, TIME=2333.77 ms
    Size=2048 ITERATIONS=524288, TIME=2340.16 ms
    Size=4096 ITERATIONS=262144, TIME=2349.97 ms
    Size=8192 ITERATIONS=131072, TIME=2346.96 ms
    Size=16384 ITERATIONS=65536, TIME=2350.3 ms
    Size=32768 ITERATIONS=32768, TIME=2348.71 ms
    Size=65536 ITERATIONS=16384, TIME=2355.28 ms
    Size=262144 ITERATIONS=4096, TIME=2358.97 ms
    Size=524288 ITERATIONS=2048, TIME=2476.46 ms
    Size=1048576 ITERATIONS=1024, TIME=2429.07 ms
    Size=2097152 ITERATIONS=512, TIME=2427.09 ms
    Size=4194304 ITERATIONS=256, TIME=2443.42 ms
    Size=8388608 ITERATIONS=128, TIME=2435.54 ms
    Size=33554432 ITERATIONS=32, TIME=2389.08 ms
    Size=134217728 ITERATIONS=8, TIME=2444.43 ms
    Size=536870912 ITERATIONS=2, TIME=2600.91 ms
    

    GCC -O3

    Size=1 ITERATIONS=1073741824, TIME=2197.12 ms
    Size=2 ITERATIONS=536870912, TIME=996.409 ms
    Size=4 ITERATIONS=268435456, TIME=606.252 ms
    Size=16 ITERATIONS=67108864, TIME=306.904 ms
    Size=32 ITERATIONS=33554432, TIME=897.692 ms
    Size=64 ITERATIONS=16777216, TIME=847.794 ms
    Size=256 ITERATIONS=4194304, TIME=802.136 ms
    Size=1024 ITERATIONS=1048576, TIME=761.971 ms
    Size=2048 ITERATIONS=524288, TIME=760.136 ms
    Size=4096 ITERATIONS=262144, TIME=759.149 ms
    Size=8192 ITERATIONS=131072, TIME=749.881 ms
    Size=16384 ITERATIONS=65536, TIME=756.672 ms
    Size=32768 ITERATIONS=32768, TIME=759.565 ms
    Size=65536 ITERATIONS=16384, TIME=754.81 ms
    Size=262144 ITERATIONS=4096, TIME=745.899 ms
    Size=524288 ITERATIONS=2048, TIME=749.527 ms
    Size=1048576 ITERATIONS=1024, TIME=758.009 ms
    Size=2097152 ITERATIONS=512, TIME=776.671 ms
    Size=4194304 ITERATIONS=256, TIME=778.963 ms
    Size=8388608 ITERATIONS=128, TIME=783.191 ms
    Size=33554432 ITERATIONS=32, TIME=770.603 ms
    Size=134217728 ITERATIONS=8, TIME=785.703 ms
    Size=536870912 ITERATIONS=2, TIME=911.875 ms
    

    (Note how the first one is much slower; I feel like there may be a mis-speculation somewhere around store-to-load forwarding...)

    Interestingly, turning the optimizations on and removing the volatile shows a somewhat nicer curve:

    Size=1 ITERATIONS=1073741824, TIME=0 ms
    Size=2 ITERATIONS=536870912, TIME=0 ms
    Size=4 ITERATIONS=268435456, TIME=0 ms
    Size=16 ITERATIONS=67108864, TIME=0.001 ms
    Size=32 ITERATIONS=33554432, TIME=125.581 ms
    Size=64 ITERATIONS=16777216, TIME=140.654 ms
    Size=256 ITERATIONS=4194304, TIME=217.559 ms
    Size=1024 ITERATIONS=1048576, TIME=168.155 ms
    Size=2048 ITERATIONS=524288, TIME=159.031 ms
    Size=4096 ITERATIONS=262144, TIME=154.373 ms
    Size=8192 ITERATIONS=131072, TIME=153.858 ms
    Size=16384 ITERATIONS=65536, TIME=156.819 ms
    Size=32768 ITERATIONS=32768, TIME=156.505 ms
    Size=65536 ITERATIONS=16384, TIME=156.921 ms
    Size=262144 ITERATIONS=4096, TIME=215.911 ms
    Size=524288 ITERATIONS=2048, TIME=220.298 ms
    Size=1048576 ITERATIONS=1024, TIME=235.648 ms
    Size=2097152 ITERATIONS=512, TIME=320.284 ms
    Size=4194304 ITERATIONS=256, TIME=409.433 ms
    Size=8388608 ITERATIONS=128, TIME=431.743 ms
    Size=33554432 ITERATIONS=32, TIME=429.436 ms
    Size=134217728 ITERATIONS=8, TIME=430.052 ms
    Size=536870912 ITERATIONS=2, TIME=535.773 ms
    

    To help anyone reproduce the "issue", here is some standard (I hope) C++ code:

    #include <algorithm>
    #include <chrono>
    #include <iostream>
    #include <limits>
    #include <memory>
    
    /// Increments every element of `array` once per pass, for ITERATIONS passes.
    ///
    /// @tparam SIZE        number of elements touched in each pass
    /// @tparam ITERATIONS  number of passes over the array
    /// @param  array       buffer holding at least SIZE ints; accessed through a
    ///                     volatile pointer so every read and write is an
    ///                     observable access the compiler must actually perform
    template <int SIZE, int ITERATIONS>
    void test_body(volatile int *array) {
        for (int i = 0; i < ITERATIONS; i++)
        {
            for (int x = 0; x < SIZE; x++)
            {
                array[x]++;
            }
        }
    }
    
    
    template 
    static void test_function()
    {
        static_assert(SIZE*ITERATIONS == 1024*1024*1024, "SIZE MISMATCH");
        std::unique_ptr array { new int[SIZE] };
    
        // Warmup
        test_body(array.get());
    
        auto start = std::chrono::steady_clock::now();
    
        test_body(array.get());
    
        auto end = std::chrono::steady_clock::now();
        auto diff = end - start;
        std::cout << "Size=" << SIZE << " ITERATIONS=" << ITERATIONS << ", TIME=" << std::chrono::duration  (diff).count() << " ms" << std::endl;
    }
    
    // Runs the benchmark once per working-set size. Each call pairs a size with
    // the pass count that keeps size * passes equal to kTotalIncrements.
    int main()
    {
        constexpr int kTotalIncrements = 1024 * 1024 * 1024;

        test_function<1,         kTotalIncrements / 1>();
        test_function<2,         kTotalIncrements / 2>();
        test_function<4,         kTotalIncrements / 4>();
        test_function<16,        kTotalIncrements / 16>();
        test_function<32,        kTotalIncrements / 32>();
        test_function<64,        kTotalIncrements / 64>();
        test_function<256,       kTotalIncrements / 256>();
        test_function<1024,      kTotalIncrements / 1024>();
        test_function<2048,      kTotalIncrements / 2048>();
        test_function<4096,      kTotalIncrements / 4096>();
        test_function<8192,      kTotalIncrements / 8192>();
        test_function<16384,     kTotalIncrements / 16384>();
        test_function<32768,     kTotalIncrements / 32768>();
        test_function<65536,     kTotalIncrements / 65536>();
        test_function<262144,    kTotalIncrements / 262144>();
        test_function<524288,    kTotalIncrements / 524288>();
        test_function<1048576,   kTotalIncrements / 1048576>();
        test_function<2097152,   kTotalIncrements / 2097152>();
        test_function<4194304,   kTotalIncrements / 4194304>();
        test_function<8388608,   kTotalIncrements / 8388608>();
        test_function<33554432,  kTotalIncrements / 33554432>();
        test_function<134217728, kTotalIncrements / 134217728>();
        test_function<536870912, kTotalIncrements / 536870912>();
    }
    

提交回复
热议问题