Poor memcpy Performance on Linux

后端 未结 7 1893
执笔经年
执笔经年 2021-01-29 21:04

We have recently purchased some new servers and are experiencing poor memcpy performance. The memcpy performance is 3x slower on the servers compared to our laptops.

相关标签:
7条回答
  • 2021-01-29 21:49

    The question has already been answered above, but in any case, here is an implementation using AVX2 non-temporal (streaming) loads and stores that should be faster for large copies, if that is what you are worried about:

    /*
     * ALIGN(ptr, align) - round the integer address `ptr` up to the next
     * multiple of `align`. An address that is already a multiple of `align`
     * is returned unchanged.
     *
     * `ptr` must be an unsigned integer (e.g. uintptr_t), not a pointer, and
     * `align` must be a power of two; both arguments are evaluated more than
     * once, so they must be free of side effects.
     *
     * The mask is computed as uintptr_t rather than in the type of `align`:
     * with a 32-bit unsigned `align`, ~(align - 1) would be zero-extended and
     * would wipe out the upper half of a 64-bit address.
     */
    #define ALIGN(ptr, align) (((ptr) + (align) - 1) & ~(static_cast<uintptr_t>(align) - 1))
    
    /*
     * memcpy_avx - copy n bytes from src to dest, moving the 32-byte-aligned
     * middle of the buffer with 256-bit non-temporal loads and stores.
     *
     * dest  destination buffer, at least n bytes
     * src   source buffer, at least n bytes; must not overlap dest (the head,
     *       tail and fallback paths all go through memcpy())
     * n     number of bytes to copy; 0 is allowed
     *
     * Returns dest, like memcpy().
     *
     * How the copy is split:
     *   1. If dest and src sit at different offsets inside a 32-byte block
     *      they can never be aligned at the same time, so memcpy() does the
     *      whole copy.
     *   2. Otherwise up to 31 leading bytes are copied with memcpy() to reach
     *      a 32-byte boundary.
     *   3. Whole 64-byte chunks are streamed as two 256-bit vectors each.
     *   4. The remaining (< 64) bytes are copied with memcpy().
     *
     * Despite the name, _mm256_stream_load_si256 is an AVX2 intrinsic, so the
     * CPU must support AVX2. On GCC/Clang the target attribute lets this one
     * function be compiled for AVX2 without a global -mavx2; callers that may
     * run on older CPUs must check for AVX2 before calling.
     */
    #if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx2")))
    #endif
    void *memcpy_avx(void *dest, const void *src, size_t n)
    {
        char *d = static_cast<char *>(dest);
        const char *s = static_cast<const char *>(src);

        const uintptr_t d_offset = reinterpret_cast<uintptr_t>(d) & 31;
        const uintptr_t s_offset = reinterpret_cast<uintptr_t>(s) & 31;

        /* fall back to memcpy() if the two buffers are misaligned relative to each other */
        if (d_offset != s_offset)
            return memcpy(dest, src, n);

        if (d_offset != 0) {
            /* 1..31 bytes separate us from the next 32-byte boundary; never
             * copy more than the caller asked for. */
            const size_t to_boundary = static_cast<size_t>(32 - d_offset);
            const size_t head = to_boundary < n ? to_boundary : n;

            memcpy(d, s, head);

            /* Advancing by the bytes actually copied lands on the boundary
             * whenever anything is left to copy (n > 0 afterwards). */
            d += head;
            s += head;
            n -= head;
        }

        const bool streamed = n >= 64;

        for (; n >= 64; s += 64, d += 64, n -= 64) {
            __m256i *dst_vec = reinterpret_cast<__m256i *>(d);
            const __m256i *src_vec = reinterpret_cast<const __m256i *>(s);

            const __m256i lo = _mm256_stream_load_si256(src_vec + 0);
            const __m256i hi = _mm256_stream_load_si256(src_vec + 1);

            _mm256_stream_si256(dst_vec + 0, lo);
            _mm256_stream_si256(dst_vec + 1, hi);
        }

        /* Non-temporal stores are weakly ordered: fence them so the copied
         * data is globally visible before any later store (for example a
         * "buffer ready" flag read by another thread). */
        if (streamed)
            _mm_sfence();

        if (n > 0)
            memcpy(d, s, n);

        return dest;
    }
    
    0 讨论(0)
提交回复
热议问题