We have recently purchased some new servers and are experiencing poor memcpy performance. The memcpy performance is 3x slower on the servers compared to our laptops.
The question was already answered above, but in any case, here is an implementation using AVX2 non-temporal (streaming) loads and stores that should be faster for large copies, if that's what you are worried about:
/*
 * Round the integer value `ptr` up to the next multiple of `align`; a value
 * that is already a multiple of `align` is returned unchanged.
 *
 * The mask arithmetic is only correct when `align` is a power of two.
 * `ptr` must have integer type (e.g. uintptr_t) because `&` is not defined
 * for pointers. `align` appears twice in the expansion, so do not pass an
 * expression with side effects.
 */
#define ALIGN(ptr, align) (((ptr) + (align) - 1) & ~((align) - 1))
/*
 * memcpy_avx - copy n bytes from src to dest, moving the bulk of the data
 * with AVX2 non-temporal (streaming) 32-byte loads and stores.
 *
 * dest: destination buffer, at least n bytes; must not overlap src.
 * src:  source buffer, at least n bytes.
 * n:    number of bytes to copy; 0 is allowed.
 *
 * Returns dest, like memcpy().
 *
 * The streaming instructions require 32-byte-aligned addresses. That can
 * only be arranged for both buffers at once when they have the same offset
 * within a 32-byte line, so any other combination is handed to memcpy().
 * Otherwise the copy is split into: an unaligned head (memcpy), 64-byte
 * blocks (streaming), and a tail shorter than 64 bytes (memcpy).
 *
 * The function is built for AVX2 even when the rest of the translation
 * unit is not; the caller is responsible for only calling it on a CPU that
 * supports AVX2.
 */
#if defined(__GNUC__) && !defined(__AVX2__)
__attribute__((target("avx2")))
#endif
void *memcpy_avx(void *dest, const void *src, size_t n)
{
    char *d = (char *)dest;
    const char *s = (const char *)src;
    const uintptr_t d_off = (uintptr_t)d & 31;
    const uintptr_t s_off = (uintptr_t)s & 31;

    /* Different offsets can never both be brought to a 32-byte boundary. */
    if (d_off != s_off) {
        return memcpy(dest, src, n);
    }

    if (d_off != 0) {
        /* Bytes needed to reach the next 32-byte boundary, clamped to n so
         * a short copy never touches memory past the end of the buffers. */
        size_t head = (size_t)(32 - d_off);
        if (head > n) {
            head = n;
        }
        memcpy(d, s, head);
        d += head;
        s += head;
        n -= head;
    }

    /* If anything is left, d and s are now both 32-byte aligned. */
    if (n >= 64) {
        do {
            const __m256i *src_line = (const __m256i *)s;
            __m256i *dst_line = (__m256i *)d;

            /* Issue both loads before the stores, one 64-byte block per
             * iteration. */
            __m256i lo = _mm256_stream_load_si256(src_line + 0);
            __m256i hi = _mm256_stream_load_si256(src_line + 1);

            _mm256_stream_si256(dst_line + 0, lo);
            _mm256_stream_si256(dst_line + 1, hi);

            s += 64;
            d += 64;
            n -= 64;
        } while (n >= 64);

        /* Non-temporal stores are weakly ordered: fence so the copied data
         * is globally visible before any store the caller issues next
         * (e.g. a "buffer ready" flag read by another thread). */
        _mm_sfence();
    }

    if (n > 0) {
        memcpy(d, s, n);
    }

    return dest;
}