Demonstrator code failing to show 4 times faster SIMD speed with optimization disabled

前端未结

关注

 3  1704

不知归路 2021-01-21 17:23

I am trying to understand the benefit of using SIMD vectorization and wrote a simple demonstrator code to see what would be the speed gain of an algorithm leveraging vectorizati

3条回答

轻奢々 (楼主)

2021-01-21 17:49

I put together some sample code below to illustrate how you might see the benefits of SIMD versus scalar code. The example code is a little contrived, but the main point to note is that there need to be sufficient arithmetic operations in the loop to mitigate load/store latency and loop overheads - a single add operation, as in your initial experiment, is not sufficient.

This example achieves around 4x throughput improvement for 32 bit int data. There are two versions of the SIMD loop: one simple loop with no unrolling, and an alternate loop with 2x unrolling. As might be expected the unrolled loop is a little faster.

#include 
#include 
#include 
#include 
#include    // gettimeofday
#include   // SSE 4.1

static void foo_scalar(uint32_t *a, const uint32_t *b, const uint32_t *c, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        a[i] = (b[i] + c[i] + 1) / 2;
    }
}

static void foo_simd(uint32_t *a, const uint32_t *b, const uint32_t *c, size_t n)
{
    size_t i;

#ifndef UNROLL
    for (i = 0; i <= n - 4; i += 4)
    {
        __m128i vb = _mm_loadu_si128((__m128i *)&b[i]);
        __m128i vc = _mm_loadu_si128((__m128i *)&c[i]);
        __m128i v = _mm_add_epi32(vb, vc);
        v = _mm_add_epi32(v, _mm_set1_epi32(1));
        v = _mm_srli_epi32(v, 1);
        _mm_storeu_si128((__m128i *)&a[i], v);
    }
#else
    for (i = 0; i <= n - 8; i += 8)
    {
        __m128i vb0 = _mm_loadu_si128((__m128i *)&b[i]);
        __m128i vb1 = _mm_loadu_si128((__m128i *)&b[i + 4]);
        __m128i vc0 = _mm_loadu_si128((__m128i *)&c[i]);
        __m128i vc1 = _mm_loadu_si128((__m128i *)&c[i + 4]);
        __m128i v0 = _mm_add_epi32(vb0, vc0);
        __m128i v1 = _mm_add_epi32(vb1, vc1);
        v0 = _mm_add_epi32(v0, _mm_set1_epi32(1));
        v1 = _mm_add_epi32(v1, _mm_set1_epi32(1));
        v0 = _mm_srli_epi32(v0, 1);
        v1 = _mm_srli_epi32(v1, 1);
        _mm_storeu_si128((__m128i *)&a[i], v0);
        _mm_storeu_si128((__m128i *)&a[i + 4], v1);
    }
#endif
    foo_scalar(&a[i], &b[i], &c[i], n - i);
}

int main(int argc, char *argv[])
{
    const size_t kLoops = 100000;
    size_t n = 2 * 1024;
    struct timeval t0, t1;
    double t_scalar_ms, t_simd_ms;

    if (argc > 1)
    {
        n = atoi(argv[1]);
    }

    printf("kLoops = %zu, n = %zu\n", kLoops, n);

    uint32_t * a_scalar = malloc(n * sizeof(uint32_t));
    uint32_t * a_simd = malloc(n * sizeof(uint32_t));
    uint32_t * b = malloc(n * sizeof(uint32_t));
    uint32_t * c = malloc(n * sizeof(uint32_t));

    for (size_t i = 0; i < n; ++i)
    {
        a_scalar[i] = a_simd[i] = 0;
        b[i] = rand();
        c[i] = rand();
    }

    gettimeofday(&t0, NULL);
    for (size_t k = 0; k < kLoops; ++k)
    {
        foo_scalar(a_scalar, b, c, n);
    }
    gettimeofday(&t1, NULL);
    t_scalar_ms = ((double)(t1.tv_sec - t0.tv_sec) + (double)(t1.tv_usec - t0.tv_usec) * 1.0e-6) * 1.0e3;

    gettimeofday(&t0, NULL);
    for (size_t k = 0; k < kLoops; ++k)
    {
        foo_simd(a_simd, b, c, n);
    }
    gettimeofday(&t1, NULL);
    t_simd_ms = ((double)(t1.tv_sec - t0.tv_sec) + (double)(t1.tv_usec - t0.tv_usec) * 1.0e-6) * 1.0e3;

    int64_t sum_scalar = 0, sum_simd = 0;
    for (size_t i = 0; i < n; ++i)
    {
        sum_scalar += a_scalar[i];
        sum_simd += a_simd[i];
    }
    assert(sum_scalar == sum_simd);

    printf("t_scalar = %8g ms = %8g ns / point\n", t_scalar_ms, t_scalar_ms / (kLoops * n) * 1e6);
    printf("t_simd   = %8g ms = %8g ns / point\n", t_simd_ms, t_simd_ms / (kLoops * n) * 1e6);
    printf("Speed-up = %2.1fx\n",  t_scalar_ms / t_simd_ms);

    return 0;
}

Compile and run (no SIMD loop unrolling):

$ gcc-4.8 -fno-tree-vectorize -std=gnu99 -Wall gros_lalo.c -O3 -msse4.1 && ./a.out
kLoops = 100000, n = 2048
t_scalar =  122.668 ms = 0.598965 ns / point
t_simd   =   33.785 ms = 0.164966 ns / point
Speed-up = 3.6x

Compile and run (2x SIMD loop unrolling):

$ gcc-4.8 -fno-tree-vectorize -std=gnu99 -Wall gros_lalo.c -O3 -msse4.1 -DUNROLL && ./a.out
kLoops = 100000, n = 2048
t_scalar =  121.897 ms =   0.5952 ns / point
t_simd   =    29.07 ms = 0.141943 ns / point
Speed-up = 4.2x

It is interesting to look at the generated code:

Scalar:

    xorl    %ecx, %ecx
    .align 4
L10:
    movl    0(%rbp,%rcx,4), %esi
    addl    (%rbx,%rcx,4), %esi
    addl    $1, %esi
    shrl    %esi
    movl    %esi, (%r15,%rcx,4)
    addq    $1, %rcx
    cmpq    %r12, %rcx
    jne L10

SIMD (no unrolling):

    xorl    %ecx, %ecx
    xorl    %eax, %eax
    .align 4
L18:
    movdqu  0(%rbp,%rcx), %xmm2
    addq    $4, %rax
    movdqu  (%rbx,%rcx), %xmm1
    paffffd   %xmm2, %xmm1
    paffffd   %xmm3, %xmm1
    psrld   $1, %xmm1
    movdqu  %xmm1, (%r14,%rcx)
    addq    $16, %rcx
    cmpq    %r9, %rax
    jbe L18

SIMD (2x unrolling):

    xorl    %edx, %edx
    xorl    %ecx, %ecx
    .align 4
L18:
    movdqu  0(%rbp,%rdx), %xmm5
    addq    $8, %rcx
    movdqu  (%r11,%rdx), %xmm4
    movdqu  (%rbx,%rdx), %xmm2
    movdqu  (%r10,%rdx), %xmm1
    paffffd   %xmm5, %xmm2
    paffffd   %xmm4, %xmm1
    paffffd   %xmm3, %xmm2
    paffffd   %xmm3, %xmm1
    psrld   $1, %xmm2
    psrld   $1, %xmm1
    movdqu  %xmm2, 0(%r13,%rdx)
    movdqu  %xmm1, (%rax,%rdx)
    addq    $32, %rdx
    cmpq    %r15, %rcx
    jbe L18

Note that there are a similar number of instructions in the first two loops, but the SIMD loop is of course processing four elements per iteration, whereas the scalar loop is only processing one element per iteration. For the third, unrolled loop we have more instructions but we are processing eight elements per iteration - note that the proportion of loop housekeeping instructions has been reduced relative to the SIMD loop without loop unrolling.

Timing data was collected using a 2.6 GHz Core i7 Haswell CPU using gcc 4.8 on Mac OS X 10.10. Performance results should be similar on any reasonably current x86 CPU however.

0 讨论(0)

查看其它3个回答