问题
I'm trying to implement multiplication of 128 unsigned int on two 64 unsigned integers by Intel AVX. The problem is that non vectorised version works faster than hand-vectorised version.
Here is my test benchmark. On my laptop i got next results:
- AVX - 5200
- NonAVX - 1600
#include <iostream>
#include <chrono>
#include <emmintrin.h>
#include <immintrin.h>
constexpr uint32_t N = 28u;
constexpr uint32_t X = 64u;
constexpr uint32_t Y = 32u;
constexpr uint32_t Z = 128u;
constexpr uint32_t LOW[4] = { 4294967295u, 0u, 4294967295u, 0u };
__m128i L = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( LOW ) );
typedef union
{
__m128i x;
uint32_t u[4];
}
__u128i;
static inline __attribute__((always_inline)) void multiply128x128( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
__m128i IN = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( EFGH ) );
__m128i A = _mm_set1_epi32( ABCD[0] );
__m128i B = _mm_set1_epi32( ABCD[1] );
__m128i C = _mm_set1_epi32( ABCD[2] );
__m128i D = _mm_set1_epi32( ABCD[3] );
__m128i ED = _mm_mul_epu32( IN, D );
__m128i EC = _mm_mul_epu32( IN, C );
__m128i EB = _mm_mul_epu32( IN, B );
__m128i EA = _mm_mul_epu32( IN, A );
IN = _mm_srli_epi64( IN, 32 );
__m128i FD = _mm_mul_epu32( IN, D );
__m128i FC = _mm_mul_epu32( IN, C );
__m128i FB = _mm_mul_epu32( IN, B );
__m128i FA = _mm_mul_epu32( IN, A );
__m128i FD_H = _mm_srli_epi64( FD, 32 );
__m128i FD_L = _mm_and_si128 ( L, FD );
__m128i FC_H = _mm_srli_epi64( FC, 32 );
__m128i FC_L = _mm_and_si128 ( L, FC );
__m128i FB_H = _mm_srli_epi64( FB, 32 );
__m128i FB_L = _mm_and_si128 ( L, FB );
__m128i FA_H = _mm_srli_epi64( FA, 32 );
__m128i FA_L = _mm_and_si128 ( L, FA );
__m128i ED_H = _mm_srli_epi64( ED, 32 );
__m128i ED_L = _mm_and_si128 ( L, ED );
__m128i EC_H = _mm_srli_epi64( EC, 32 );
__m128i EC_L = _mm_and_si128 ( L, EC );
__m128i EB_H = _mm_srli_epi64( EB, 32 );
__m128i EB_L = _mm_and_si128 ( L, EB );
__m128i EA_H = _mm_srli_epi64( EA, 32 );
__m128i EA_L = _mm_and_si128 ( L, EA );
__m128i SUM_FC_L_FD_H = _mm_add_epi64( FC_L, FD_H );
__m128i SUM_FB_L_FC_H = _mm_add_epi64( FB_L, FC_H );
__m128i SUM_FA_L_FB_H = _mm_add_epi64( FA_L, FB_H );
__m128i SUM_EC_L_ED_H = _mm_add_epi64( EC_L, ED_H );
__m128i SUM_EB_L_EC_H = _mm_add_epi64( EB_L, EC_H );
__m128i SUM_EA_L_EB_H = _mm_add_epi64( EA_L, EB_H );
__m128i SUM_FC_L_FD_H_ED_L = _mm_add_epi64( SUM_FC_L_FD_H, ED_L );
__m128i SUM_FB_L_FC_H_EC_L_ED_H = _mm_add_epi64( SUM_FB_L_FC_H, SUM_EC_L_ED_H );
__m128i SUM_FA_L_FB_H_EB_L_EC_H = _mm_add_epi64( SUM_FA_L_FB_H, SUM_EB_L_EC_H );
__m128i SUM_FA_H_EA_L_EB_H = _mm_add_epi64( FA_H, SUM_EA_L_EB_H );
__u128i SUM_FC_L_FD_H_ED_L_L;
SUM_FC_L_FD_H_ED_L_L.x = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L, 32 );
SUM_FC_L_FD_H_ED_L_L.x = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L.x, SUM_FB_L_FC_H_EC_L_ED_H );
__u128i SUM_FC_L_FD_H_ED_L_L_L;
SUM_FC_L_FD_H_ED_L_L_L.x = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L.x, 32 );
SUM_FC_L_FD_H_ED_L_L_L.x = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L.x, SUM_FA_L_FB_H_EB_L_EC_H );
__u128i SUM_FC_L_FD_H_ED_L_L_L_L;
SUM_FC_L_FD_H_ED_L_L_L_L.x = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L.x, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L.x = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L.x, SUM_FA_H_EA_L_EB_H );
__u128i SUM_FC_L_FD_H_ED_L_L_L_L_L;
SUM_FC_L_FD_H_ED_L_L_L_L_L.x = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L_L.x, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L_L.x = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L_L.x, EA_H );
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.u[0];
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L.u[0];
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L.u[0];
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L.u[0];
OUT[1][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.u[2];
OUT[1][1] = SUM_FC_L_FD_H_ED_L_L_L_L.u[2];
OUT[1][2] = SUM_FC_L_FD_H_ED_L_L_L.u[2];
OUT[1][3] = SUM_FC_L_FD_H_ED_L_L.u[2];
}
static inline void multiply128x128_1( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
uint64_t ED = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t FD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t SUM_FC_L_FD_H = ( FC & 0xFFFFFFFF ) + ( FD >> 32u );
uint64_t SUM_FB_L_FC_H = ( FB & 0xFFFFFFFF ) + ( FC >> 32u );
uint64_t SUM_FA_L_FB_H = ( FA & 0xFFFFFFFF ) + ( FB >> 32u );
uint64_t SUM_EC_L_ED_H = ( EC & 0xFFFFFFFF ) + ( ED >> 32u );
uint64_t SUM_EB_L_EC_H = ( EB & 0xFFFFFFFF ) + ( EC >> 32u );
uint64_t SUM_EA_L_EB_H = ( EA & 0xFFFFFFFF ) + ( EB >> 32u );
uint64_t SUM_FC_L_FD_H_ED_L = SUM_FC_L_FD_H + ( ED & 0xFFFFFFFF );
uint64_t SUM_FB_L_FC_H_EC_L_ED_H = SUM_FB_L_FC_H + SUM_EC_L_ED_H;
uint64_t SUM_FA_L_FB_H_EB_L_EC_H = SUM_FA_L_FB_H + SUM_EB_L_EC_H;
uint64_t SUM_FA_H_EA_L_EB_H = SUM_EA_L_EB_H + ( FA >> 32u );
uint64_t SUM_FC_L_FD_H_ED_L_L = ( SUM_FC_L_FD_H_ED_L >> 32u ) + SUM_FB_L_FC_H_EC_L_ED_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L = ( SUM_FC_L_FD_H_ED_L_L >> 32u ) + SUM_FA_L_FB_H_EB_L_EC_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L >> 32u ) + SUM_FA_H_EA_L_EB_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L_L >> 32u ) + ( EA >> 32u );
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L;
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L;
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L;
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L;
uint64_t GD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t HD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t SUM_HC_L_HD_H = ( HC & 0xFFFFFFFF ) + ( HD >> 32u );
uint64_t SUM_HB_L_HC_H = ( HB & 0xFFFFFFFF ) + ( HC >> 32u );
uint64_t SUM_HA_L_HB_H = ( HA & 0xFFFFFFFF ) + ( HB >> 32u );
uint64_t SUM_GC_L_GD_H = ( GC & 0xFFFFFFFF ) + ( GD >> 32u );
uint64_t SUM_GB_L_GC_H = ( GB & 0xFFFFFFFF ) + ( GC >> 32u );
uint64_t SUM_GA_L_GB_H = ( GA & 0xFFFFFFFF ) + ( GB >> 32u );
uint64_t SUM_HC_L_HD_H_GD_L = SUM_HC_L_HD_H + ( GD & 0xFFFFFFFF );
uint64_t SUM_HB_L_HC_H_GC_L_GD_H = SUM_HB_L_HC_H + SUM_GC_L_GD_H;
uint64_t SUM_HA_L_HB_H_GB_L_GC_H = SUM_HA_L_HB_H + SUM_GB_L_GC_H;
uint64_t SUM_HA_H_GA_L_GB_H = SUM_GA_L_GB_H + ( HA >> 32u );
uint64_t SUM_HC_L_HD_H_GD_L_L = ( SUM_HC_L_HD_H_GD_L >> 32u ) + SUM_HB_L_HC_H_GC_L_GD_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L = ( SUM_HC_L_HD_H_GD_L_L >> 32u ) + SUM_HA_L_HB_H_GB_L_GC_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L >> 32u ) + SUM_HA_H_GA_L_GB_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L_L >> 32u ) + ( GA >> 32u );
OUT[1][0] = SUM_HC_L_HD_H_GD_L_L_L_L_L;
OUT[1][1] = SUM_HC_L_HD_H_GD_L_L_L_L;
OUT[1][2] = SUM_HC_L_HD_H_GD_L_L_L;
OUT[1][3] = SUM_HC_L_HD_H_GD_L_L;
}
int main()
{
uint32_t OUT[2][4];
uint32_t ABCD[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
uint32_t EFGH[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
multiply128x128_1( ABCD, EFGH, OUT );
uint64_t S_1 = 0u;
uint64_t S_2 = 0u;
uint64_t S_3 = 0u;
uint64_t S_4 = 0u;
uint64_t S_5 = 0u;
auto start_1 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < ( 1 << N ); ++i )
{
EFGH[0] = i;
EFGH[1] = i + X;
EFGH[2] = i + Y;
EFGH[3] = i + Z;
ABCD[0] = i;
ABCD[1] = i + X;
ABCD[2] = i + Y;
ABCD[3] = i + Z;
multiply128x128( ABCD, EFGH, OUT );
S_1 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_4 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_1 = std::chrono::high_resolution_clock::now();
std::cout << "Test A: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_1 - start_1 ).count() << '\n';
auto start_3 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < ( 1 << N ); ++i )
{
EFGH[0] = i;
EFGH[1] = i + X;
EFGH[2] = i + Y;
EFGH[3] = i + Z;
ABCD[0] = i;
ABCD[1] = i + X;
ABCD[2] = i + Y;
ABCD[3] = i + Z;
multiply128x128_1( ABCD, EFGH, OUT );
S_3 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_5 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_3 = std::chrono::high_resolution_clock::now();
std::cout << "Test C: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_3 - start_3 ).count() << '\n';
std::cout << S_1 << " " << S_3 << " " << S_4 << " " << S_5 << '\n';
return 0;
}
How can i optimise my SIMD - AVX code?
来源:https://stackoverflow.com/questions/57691466/optimize-128x128-to-256-bit-multiply-for-intel-avxsimd