Poor memcpy Performance on Linux

后端 未结 7 1875
执笔经年
执笔经年 2021-01-29 21:04

We have recently purchased some new servers and are experiencing poor memcpy performance. The memcpy performance is 3x slower on the servers compared to our laptops.

7条回答
  •  后悔当初
    2021-01-29 21:49

    The question was already answered above, but in any case, here is an implementation using AVX that should be faster for large copies if that's what you are worried about:

    /*
     * Round `value` up to the nearest multiple of `boundary`.
     * The bit-mask only yields a correct result when `boundary` is a power of
     * two. Note that `boundary` is expanded (and therefore evaluated) twice.
     */
    #define ALIGN(value, boundary) (((value) + (boundary) - 1) & ~((boundary) - 1))
    
    /*
     * Copy n bytes from src to dest, moving the bulk of the data with 32-byte
     * non-temporal (streaming) loads and stores, 64 bytes per iteration.
     *
     * dest, src: buffers of at least n bytes; as with memcpy() they must not
     *            overlap.
     * Returns dest.
     *
     * The streaming path is only used when dest and src have the same offset
     * within a 32-byte block; otherwise the whole copy is forwarded to
     * memcpy(). Any unaligned head and any tail shorter than 64 bytes are also
     * copied with memcpy().
     *
     * Note: _mm256_stream_load_si256 is an AVX2 intrinsic, so the translation
     * unit must be built for an AVX2-capable target.
     */
    void *memcpy_avx(void *dest, const void *src, size_t n)
    {
        char *d = static_cast<char *>(dest);
        const char *s = static_cast<const char *>(src);

        /*
         * Both pointers can only be brought to a 32-byte boundary together if
         * they start at the same offset inside a 32-byte block; if not, fall
         * back to memcpy().
         */
        if ((reinterpret_cast<uintptr_t>(d) & 31) != (reinterpret_cast<uintptr_t>(s) & 31))
            return memcpy(d, s, n);

        if (reinterpret_cast<uintptr_t>(d) & 31) {
            uintptr_t header_bytes = 32 - (reinterpret_cast<uintptr_t>(d) & 31);
            assert(header_bytes < 32);

            /* Clamp the head so that a short copy never exceeds n bytes. */
            size_t head_len = header_bytes < n ? static_cast<size_t>(header_bytes) : n;

            memcpy(d, s, head_len);

            d = reinterpret_cast<char *>(ALIGN(reinterpret_cast<uintptr_t>(d), 32));
            s = reinterpret_cast<const char *>(ALIGN(reinterpret_cast<uintptr_t>(s), 32));
            n -= head_len;
        }

        /* d and s are 32-byte aligned here whenever n >= 64. */
        for (; n >= 64; s += 64, d += 64, n -= 64) {
            __m256i *dest_cacheline = reinterpret_cast<__m256i *>(d);
            const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);

            __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 0);
            __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 1);

            _mm256_stream_si256(dest_cacheline + 0, temp1);
            _mm256_stream_si256(dest_cacheline + 1, temp2);
        }

        /*
         * Non-temporal stores are weakly ordered; fence so the copied data is
         * globally visible before any store the caller issues afterwards.
         */
        _mm_sfence();

        if (n > 0)
            memcpy(d, s, n);

        return dest;
    }
    

提交回复
热议问题