问题
I have __m256i vectors that contain 10-bit words inside 16-bit integers (so 16 × 16-bit lanes containing only 16 × 10 useful bits). What is the best/fastest way to extract only those 10 bits and pack them to produce an output bitstream of 10-bit values?
回答1:
Here’s my attempt.
Have not benchmarked, but I think it should work pretty fast overall: not too many instructions, all of them have 1 cycle of latency on modern processors. Also the stores are efficient, 2 store instructions for 20 bytes of data.
The code only uses 3 constants. If you call this function in a loop, good compilers should load all three outside of the loop and keep them in registers.
// Bitwise select between two vectors, controlled by a mask:
// vec = ( vec & lowMask ) | ( high & ~lowMask ).
// Bits set in `lowMask` are kept from `vec`; all other bits come from `high`.
// The result is written back through `vec`.
inline void combineHigh( __m256i& vec, __m256i high, const __m256i lowMask )
{
    const __m256i keptLow = _mm256_and_si256( vec, lowMask );
    // andnot negates its FIRST operand: ( ~lowMask ) & high
    const __m256i keptHigh = _mm256_andnot_si256( lowMask, high );
    vec = _mm256_or_si256( keptLow, keptHigh );
}
// Packs the low 10 bits of each of the sixteen 16-bit lanes of `v` into a
// contiguous bit stream and writes it to `rdi`.
// 16 lanes * 10 bits = 160 bits, so exactly 20 bytes are written.
// The upper 6 bits of every lane are discarded (truncation, not saturation).
inline void store_10x16_avx2( __m256i v, uint8_t* rdi )
{
    // Stage 1: merge each pair of 10-bit values into 20 bits at the bottom of a 32-bit lane.
    // Shifting the 32-bit lane right by 6 moves bit 0 of the odd element to bit 10.
    const __m256i mask10 = _mm256_set1_epi32( ( 1 << 10 ) - 1 );
    combineHigh( v, _mm256_srli_epi32( v, 16 - 10 ), mask10 );

    // Stage 2: merge each pair of 20-bit values into 40 bits at the bottom of a 64-bit lane.
    // Shifting the 64-bit lane right by 12 moves bit 0 of the upper dword to bit 20.
    // Leftover high bits of the source elements may remain above bit 39 of each qword;
    // those bytes are never selected below, so they don't reach the output.
    const __m256i mask20 = _mm256_set1_epi64x( ( 1 << 20 ) - 1 );
    combineHigh( v, _mm256_srli_epi64( v, 32 - 20 ), mask20 );

    // Each 64-bit lane now carries 5 payload bytes: bytes [ 0 .. 4 ] and [ 8 .. 12 ] of every 16-byte half.
    // Bytes 0..3 of the output are already in place, write them directly.
    _mm_storeu_si32( rdi, _mm256_castsi256_si128( v ) );

    // Arrange the other 16 output bytes so that OR-ing the two halves yields them in order.
    // _mm256_shuffle_epi8 cannot cross 16-byte halves; index -1 produces a zero byte.
    const __m256i gatherTail = _mm256_setr_epi8(
        // Lower half: its 6 not-yet-stored payload bytes go first...
        4, 8, 9, 10, 11, 12,
        // ...followed by 10 zero bytes reserved for the upper half's payload
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // Upper half: 6 zero bytes reserved for the lower half's payload...
        -1, -1, -1, -1, -1, -1,
        // ...followed by all 10 of its payload bytes
        0, 1, 2, 3, 4,
        8, 9, 10, 11, 12
    );
    v = _mm256_shuffle_epi8( v, gatherTail );

    // Merge the two halves and write output bytes 4..19
    const __m128i tail = _mm_or_si128( _mm256_castsi256_si128( v ), _mm256_extracti128_si256( v, 1 ) );
    _mm_storeu_si128( ( __m128i* )( rdi + 4 ), tail );
}
This code truncates unused higher 6 bits of the values.
If you want to saturate instead, you’ll need one more instruction, _mm256_min_epu16
.
Also, if you do that, the first step of the function can use pmaddwd
. Here’s the complete function which saturates the source numbers, with a couple of extra adjustments.
// Packs sixteen 16-bit lanes of `v` into a contiguous stream of 10-bit values
// and writes it to `rdi`. Lanes larger than 1023 are saturated to 1023.
// 16 lanes * 10 bits = 160 bits, so exactly 20 bytes are written.
inline void store_10x16_avx2( __m256i v, uint8_t* rdi )
{
    const __m256i max10 = _mm256_set1_epi16( ( 1 << 10 ) - 1 );
#if 0
    // Alternative: drop the upper 6 bits. Some form of clean-up is mandatory,
    // because pmaddwd below does not truncate and needs zeros in the unused upper bits.
    v = _mm256_and_si256( v, max10 );
#else
    // Clamp every unsigned 16-bit lane to the largest 10-bit value
    v = _mm256_min_epu16( v, max10 );
#endif

    // Stage 1: merge each pair of 10-bit values into 20 bits of a 32-bit lane.
    // pmaddwd yields a[ 0 ] * b[ 0 ] + a[ 1 ] * b[ 1 ] per pair of 16-bit lanes;
    // with weights [ 1, 2^10 ] that equals even + ( odd << 10 ).
    const __m256i weights = _mm256_set1_epi32( 1 | ( 1 << ( 10 + 16 ) ) );
    v = _mm256_madd_epi16( v, weights );

    // Stage 2: merge each pair of 20-bit values into 40 bits of a 64-bit lane.
    // Even dwords are moved up by 12 bits so they sit directly below the odd dword's
    // payload, then the whole qword is shifted back down by 12.
    const __m256i evenRaised = _mm256_slli_epi32( v, 12 );
    v = _mm256_srli_epi64( _mm256_blend_epi32( v, evenRaised, 0x55 ), 12 );

    // Each 64-bit lane now carries 5 payload bytes; output bytes 0..3 are already in place
    _mm_storeu_si32( rdi, _mm256_castsi256_si128( v ) );

    // Arrange the other 16 output bytes so that OR-ing the two halves yields them in order.
    // Index -1 produces a zero byte.
    const __m256i gatherTail = _mm256_setr_epi8(
        // Lower half: 6 remaining payload bytes, then 10 zeros
        4, 8, 9, 10, 11, 12,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // Upper half: 6 zeros, then its 10 payload bytes
        -1, -1, -1, -1, -1, -1,
        0, 1, 2, 3, 4,
        8, 9, 10, 11, 12
    );
    v = _mm256_shuffle_epi8( v, gatherTail );

    // Merge the two halves and write output bytes 4..19
    const __m128i tail = _mm_or_si128( _mm256_castsi256_si128( v ), _mm256_extracti128_si256( v, 1 ) );
    _mm_storeu_si128( ( __m128i* )( rdi + 4 ), tail );
}
This may be slightly faster or slower overall depending on the processor, compiler, and the code calling the function, but definitely helps with code size. No one cares about binary size anymore, but CPUs have limited L1I and µop caches.
For completeness here’s another one that uses SSE2 and optionally SSSE3 instead of AVX2, only slightly slower in practice.
// Bitwise select applied to 256 bits of data held in two 128-bit registers:
// v = ( v & lowMask ) | ( h & ~lowMask ), for the pairs ( v1, h1 ) and ( v2, h2 ).
// Results are written back through `v1` and `v2`.
inline void combineHigh( __m128i& v1, __m128i& v2, __m128i h1, __m128i h2, const __m128i lowMask )
{
    // andnot negates its FIRST operand: ( ~lowMask ) & h
    v1 = _mm_or_si128( _mm_and_si128( v1, lowMask ), _mm_andnot_si128( lowMask, h1 ) );
    v2 = _mm_or_si128( _mm_and_si128( v2, lowMask ), _mm_andnot_si128( lowMask, h2 ) );
}
// SSE counterpart of the AVX2 packer: `v1` holds the first eight 16-bit lanes and
// `v2` the next eight. The low 10 bits of every lane are written to `rdi` as a
// contiguous bit stream, 20 bytes in total. Upper 6 bits of each lane are discarded.
inline void store_10x16_sse( __m128i v1, __m128i v2, uint8_t* rdi )
{
    // Stage 1: 2 x 10 bits -> 20 bits at the bottom of each 32-bit lane
    const __m128i mask10 = _mm_set1_epi32( ( 1 << 10 ) - 1 );
    combineHigh( v1, v2, _mm_srli_epi32( v1, 16 - 10 ), _mm_srli_epi32( v2, 16 - 10 ), mask10 );

    // Stage 2: 2 x 20 bits -> 40 bits at the bottom of each 64-bit lane
    const __m128i mask20 = _mm_set1_epi64x( ( 1 << 20 ) - 1 );
    combineHigh( v1, v2, _mm_srli_epi64( v1, 32 - 20 ), _mm_srli_epi64( v2, 32 - 20 ), mask20 );

#if 1
    // 40 bits = 5 bytes per 64-bit lane. Squeeze the two 5-byte groups together with
    // pshufb (SSSE3); index -1 zeroes the 6 unused top bytes.
    // Without SSSE3, switch to the SSE2-only branch under `#else`.
    const __m128i squeeze = _mm_setr_epi8(
        0, 1, 2, 3, 4,
        8, 9, 10, 11, 12,
        -1, -1, -1, -1, -1, -1 );
    v1 = _mm_shuffle_epi8( v1, squeeze );
    v2 = _mm_shuffle_epi8( v2, squeeze );
#else
    // SSE2-only emulation of the shuffle above: 8 instructions + 2 constants
    // instead of 2 instructions + 1 constant.
    // Two separate masks are needed so the unused top 6 bytes end up zero.
    const __m128i firstFive = _mm_setr_epi8( -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 );
    const __m128i secondFive = _mm_setr_epi8( 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 );
    // A 3-byte right shift moves source bytes 8..12 down to positions 5..9
    const __m128i moved1 = _mm_and_si128( _mm_srli_si128( v1, 3 ), secondFive );
    const __m128i moved2 = _mm_and_si128( _mm_srli_si128( v2, 3 ), secondFive );
    v1 = _mm_or_si128( moved1, _mm_and_si128( v1, firstFive ) );
    v2 = _mm_or_si128( moved2, _mm_and_si128( v2, firstFive ) );
#endif

    // v1 and v2 now hold 10 densely packed bytes each.
    // Output bytes 0..15: all of v1's payload followed by the first 6 bytes of v2's.
    const __m128i head16 = _mm_or_si128( v1, _mm_slli_si128( v2, 10 ) );
    // Output bytes 16..19: the last 4 bytes of v2's payload
    const __m128i tail4 = _mm_srli_si128( v2, 6 );

    // 20 bytes written with 2 store instructions
    _mm_storeu_si128( ( __m128i* )rdi, head16 );
    _mm_storeu_si32( rdi + 16, tail4 );
}
回答2:
In a loop, you may want to use partially-overlapping stores that write past the end of the 20 byte destination for each vector of source data. That saves the work of shuffling data across the 16-byte boundary to set up for 16 + 4 byte stores.
(@Soonts' updated answer with one vmovd
and one vmovdqu
store is very good and only has 2 total shuffle uops including vpshufb
and vextracti128
. When I initially wrote this, we hadn't yet thought of a good way to avoid storing outside the 20 bytes without spending more shuffle uops which would create a bottleneck worse than the front-end. But vmovdqu
+ vextracti128 mem, ymm, 1
(2 uops not micro-fused) is still slightly cheaper: 3 uops after the vpshufb
instead of 4.)
Or unrolling could be good for large arrays, LCM(20,16) = 80, so with a large unroll (and different shuffle-control vectors for each position within that) you could be doing only aligned 16-byte stores. But that might take a lot of shuffling, including between source chunks probably with palignr
.
Example of two overlapping 16-byte stores
Use this as a loop body where overwriting past 20 bytes is ok.
#include <immintrin.h>
#include <stdint.h>
// Store 10-bit pieces from each of the 16-bit lanes of the AVX2 vector.
// The function writes 20 useful bytes to the pointer
// but actually steps on data out to 26 bytes from dst:
// it issues two 16-byte stores, at dst and at dst+10, and the last 6 bytes
// of the second store (dst+20 .. dst+25) are zero filler.
// Intended as a loop body where the next iteration overwrites that filler.
// NOTE: this is a sketch; the packing step below is a placeholder, not compilable code.
void pack10bit_avx2_store26( __m256i v, uint8_t* dst)
{
// clear high garbage if elements aren't already zero-extended
//v = _mm256_and_si256(v, _mm256_set1_epi16( (1<<10)-1) );
// Placeholder: whatever is substituted here must leave 40 payload bits
// at the bottom of each 64-bit lane, as the next comment states.
... prep data somehow; pmaddwd + a couple shifts is good for throughput
// Now the vector contains 64-bit lanes with 40 payload bits / each; 40 bits = 5 bytes.
// Shuffle these bytes into a very special order.
// Note _mm256_shuffle_epi8 can only move data within 16-byte lanes.
const __m256i shuffleIndices = _mm256_setr_epi8(
// Pack the two 5-byte chunks into the bottom of each 16-byte lane
0, 1, 2, 3, 4,
8, 9, 10, 11, 12,
// 6 bytes gap with zeros (index -1 yields a zero byte)
-1, -1, -1, -1, -1, -1,
// Same layout for the upper 16-byte lane
0, 1, 2, 3, 4,
8, 9, 10, 11, 12,
-1, -1, -1, -1, -1, -1);
v = _mm256_shuffle_epi8(v, shuffleIndices );
// Split the vector into halves
// Low half: 10 payload bytes + 6 zero bytes go to dst .. dst+15
__m128i low16 = _mm256_castsi256_si128( v );
_mm_storeu_si128( ( __m128i* )dst, low16 ); // vmovdqu mem, xmm
// High half: stored at dst+10, overwriting the low half's 6 filler bytes with payload
__m128i high16 = _mm256_extracti128_si256( v, 1 );
_mm_storeu_si128( ( __m128i* )(dst+10), high16 ); // vextracti128 mem, ymm, 1
// An AVX-512 masked store could avoid writing past the end
}
We can see how it might inline into a loop by compiling it to a stand-alone function (https://godbolt.org/z/8T7KhT).
# clang -O3 -march=skylake
pack10bit_avx2(long long __vector(4), unsigned char*):
# vpand commented out
vpmaddwd ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
... # work in progress, original PMADDWD idea ignored some limitations! See Soonts' answer
vpshufb ymm0, ymm0, ymmword ptr [rip + .LCPI0_1] # ymm0 = ymm0[0,1,2,3,4,8,9,10,11,12],zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,24,25,26,27,28],zero,zero,zero,zero,zero,zero
vmovdqu xmmword ptr [rdi], xmm0
vextracti128 xmmword ptr [rdi + 10], ymm0, 1
vzeroupper # overhead that goes away when inlining into a loop
ret
In a loop, compilers would load those 2 vector constants into registers, hopefully using broadcast-loads.
Unlike some wider integer multiplies or horizontal add, vpmaddwd
is handled efficiently, as a single uop with 5 cycle latency. https://uops.info/
The vextracti128
store can't micro-fuse on Intel, but unlike vpextrd
there's no shuffle uop involved. Just store-address and store-data. Zen2 also runs it as 2 uops, with throughput of one per 2 cycles unfortunately. (Worse than Zen1).
Before Ice Lake, both Intel and AMD can run 1 store per clock.
If you do actually want the packed data back in registers, you might want @Soonts' original shuffles using palignr
, or you could do a block of this and then some reloads. Latency would be higher (especially because of store-forwarding stalls on the reloads), but if your block is several registers worth of data then that should overlap or even hide the latency, maybe giving the stores time to even commit to L1d and not cause a stall when reloaded.
BMI2 pext
uint64_t packed = _pext_u64(x, 0x03FF03FF03FF03FF);
Maybe good for scalar cleanup or a short chunk of 4 pixels or whatever. This leaves you with the problem of doing a 5-byte store (or 8-byte store with trailing zeros). Beware of strict-aliasing and alignment if using this, e.g. use memcpy
to get unaligned may-alias data into a uint64_t, or make an __attribute__((aligned(1),may_alias))
typedef.
pext
is very efficient on Intel (1 uop, 3c latency), but very bad on AMD, much worse than just using the low part of one SIMD step.
AVX-512
AVX512VBMI (Ice Lake) would give you vpermb
(lane crossing) instead of vpshufb
. (AVX512BW for vpermw
on Skylake-X / Cascade Lake would require you to have already combined into an even number of bytes, and it's 2 uops even on Ice Lake where vpermb
is 1, so it's pretty bad.) vpermb
could set up for a single unaligned 32-byte store (with 20 useful bytes), which you overlap in a loop.
AVX-512 stores can be efficiently masked to not actually overwrite past the end, e.g. using dword masking. vmovdqu32 [rdi]{k}, ymm0
is 1 uop on Skylake-X. But AVX2 vmaskmovd
is a few uops even on Intel, and extremely expensive on AMD, so you don't want to do that. And dword masking only works if you have all 20 bytes ready for one store, otherwise you need at least 16-bit granularity.
Other AVX-512 instructions: VBMI vpmultishiftqb, a parallel bitfield extract, seems like it might be useful, but it can only write aligned 8-bit destination chunks from unaligned but contiguous source chunks. I don't think that's better than what we can do with variable-shifts and rotates. vpmultishiftqb
would let us unpack this format (inverse of this function) in probably 2 instructions: 1 shuffle (such as vpexpandb
or vpermb
) to put the needed data into each qword in the vector, and one multishift to grab the right 10-bit field for the bottom of each word.
AVX-512 has variable-count shifts and rotates, including with word (16-bit) granularity, so that would be an option instead of vpmaddwd
for the first step. Using shifts ignores high garbage for free. It has lower latency, and merge-masking for the immediate version can replace the need for a control vector. (But then you need a mask constant).
With masking the latency is 3 cycles, vs 1 without, and AVX-512 makes it about as efficient to broadcast a control vector from an immediate as to mov reg,imm
/ kmov kreg, reg
. e.g. mov reg,imm
/ vpbroadcastd ymm, reg
(1 uop). Merge-masking also constrains the optimizer to overwrite the destination register instead of copy-and-shift, although that shouldn't matter here if the optimizer is smart. Neither way lets the load of the data fold into a memory source operand for the shift: sllvw
can only take the counts from memory, and sllw
needs to merge into the original in a register.
Shifts can run on ports 0 or 1 on Intel (and AMD doesn't support AVX-512). Or only port 0 for 512-bit uops, shutting down port 1 for any vector-ALU uop while any 512-bit uops are in flight. So there's a potential throughput bottleneck on port 0 for a __m512i
version of this, but for 256-bit there are enough other uops (shuffle and store, and presumably loop overhead if doing this for an array of data) that this should be fairly evenly distributed.
This shift part (before _mm256_permutexvar_epi8
) only requires AVX-512BW (+VL), and will work on Skylake-X. It leaves the data in the same place as other methods, so is a drop-in replacement you can mix and match with various strategies.
// Packs the low 10 bits of each of the sixteen 16-bit lanes of `v` into a contiguous
// bit stream and writes exactly 20 bytes to `dst` (masked store, nothing past the end).
// High garbage bits in the lanes are ignored; no pre-masking is needed.
//
// Requires AVX512_VBMI for the byte permute (Ice Lake), plus AVX-512BW + AVX-512VL
// for the 16-bit variable shift and the 256-bit rotate / masked store.
// Could work on __m512i but then shifts could only run on p0, not p0/p1,
// and almost every store would be a cache line split.
inline void store_10x16_avx512vbmi( __m256i v, uint8_t* dst )
{
    // no _mm256_and_si256 needed, we safely ignore high bits
    // v = [ ?(6) ... B[9:0] | ?(6) ... A[9:0] ] repeated

    // Shift only the even 16-bit elements left by 6 (counts alternate 6, 0).
    v = _mm256_sllv_epi16( v, _mm256_set1_epi32( ( 0 << 16 ) | 6 ) ); // simple repeated-pattern control vector
    // Alternative with merge-masking, updating only elements 0, 2, etc.:
    // v = _mm256_mask_slli_epi16( v, 0x5555, v, 6 );
    // v = [ ?(6) ... B[9:0] | A[9:0] ... 0(6) ] repeated

    // Rotate the low dword of each qword left by 6 and the high dword left by 26
    // (i.e. right by 6), so the two 20-bit groups become adjacent across the dword boundary.
    v = _mm256_rolv_epi32( v, _mm256_set1_epi64x( ( ( 32ULL - 6 ) << 32 ) | 6 ) ); // top half right, bottom half left
    // v = [ 0(6) .. ?(6) .. D[9:0] | C[9:0] | B[9:0] | A[9:0] ... 12 don't-care bits ] repeated

    v = _mm256_srli_epi64( v, 12 ); // 40 bit chunks at the bottom of each qword

    // Byte indices gathering the four 5-byte chunks into the low 20 bytes.
    // The last index is repeated as filler: vpermb can't zero (except by maskz),
    // but the filler is never written thanks to the masked store below.
    const __m256i permb = _mm256_setr_epi8( 0, 1, 2, 3, 4, 8, 9, 10, 11, 12,
                                            16, 17, 18, 19, 20, 24, 25, 26, 27, 28,
                                            28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28 );
    // _mm256_permutexvar_epi8 takes the INDEX vector first and the data second.
    // The previous call passed ( v, permb ), which used the packed data as indices
    // into the constant table instead of the other way round.
    v = _mm256_permutexvar_epi8( permb, v ); // AVX512_VBMI

    // 32-bit masking granularity in case that's cheaper for HW. 20 bytes = 5 dwords.
    _mm256_mask_storeu_epi32( dst, 0x1F, v );
}
Compiles like so (Godbolt):
# clang -O3 -march=icelake-client. GCC is essentially the same.
store_10x16_avx512vbmi(long long __vector(4), unsigned char*):
vpsllvw ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
vprolvd ymm0, ymm0, ymmword ptr [rip + .LCPI0_1]
vpsrlq ymm0, ymm0, 12
vpermb ymm0, ymm0, ymmword ptr [rip + .LCPI0_2]
mov al, 31 # what the heck, clang? partial register false dependency for no reason!
kmovd k1, eax
vmovdqu32 ymmword ptr [rdi] {k1}, ymm0
# vzeroupper not needed because the caller was using __m256i args. GCC omits it.
ret
Even if you use the same shift constant vector twice to make the compiler keep it around in a register (instead of use directly from a memory source operand), it still chooses to load it from memory instead of mov eax,6
/ vpbroadcast ymm1, eax
or something. This saves 1 uop at the cost of needing the constant in .rodata. To be fair, we do need other constants probably in the same cache line, but the way GCC wastes space they don't all fit in one cache line! clang notices the pattern and uses a vpbroadcastd
or q
load, gcc wastefully loads a full 32 bytes. (kmov k1, [mem]
is 3 front-end uops so it wouldn't save a uop to load mask constants from memory.)
Using _mm256_mask_slli_epi16(v, 0x5555, v, 6)
, clang optimizes it back into vpsllvw ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
with the same 6,0 repeating constant. So I guess that's a good sign I got it right. But GCC compiles as written:
store_10x16_avx512vbmi(long long __vector(4), unsigned char*):
mov eax, 21845
kmovw k1, eax
vpsllw ymm0{k1}, ymm0, 6
vprolvd ymm0, ymm0, YMMWORD PTR .LC0[rip]
mov eax, 31
kmovb k2, eax
vpsrlq ymm0, ymm0, 12
vpermb ymm0, ymm0, YMMWORD PTR .LC1[rip]
vmovdqu32 YMMWORD PTR [rdi]{k2}, ymm0
ret
_mm256_sllv_epi16
requires AVX-512BW and AVX-512VL. rolv_epi32 only requires AVX-512VL. (Or just AVX-512F for the 512-bit version.) Rotates only come in 32- and 64-bit element sizes, not 16, but AVX-512 does extend variable-shift granularity down to 16 (from 32 or 64 in AVX2).
vpcompressb [rdi]{k1}, ymm0
(AVX512VBMI = Ice Lake and later) would be an alternative to vpermb + store to pack bytes at the bottom of a register (like BMI2 pext
but for vector elements instead of bits in a scalar register). But it's actually more expensive: 6 uops on Ice Lake, with one per 6c throughput. (vpcompressd
is not as bad).
Even vpcompressb
into a vector register is 2 uops, so for a constant shuffle-control it's better to load a vector constant for vpermb
, unless cache misses for control vectors are a problem, e.g. if you're only doing this once every so often then let the HW process a k mask instead of a load.
AVX-512 without VBMI: 2x 16-byte stores without exceeding the 20-byte range
// Fragment (not a complete function): `v` and `dst` come from the surrounding code.
// It stores 20 packed bytes with two stores and never writes past dst+19.
... // same setup as usual, leaving 40-bit chunks at the bottom of each qword
const __m256i shuffleIndices = _mm256_setr_epi8(
// Pack the two 5-byte chunks into the bottom of each 16-byte lane
0, 1, 2, 3, 4,
8, 9, 10, 11, 12,
// 6 bytes gap with zeros (index -1 yields a zero byte)
-1, -1, -1, -1, -1, -1,
// Same layout for the upper 16-byte lane
0, 1, 2, 3, 4,
8, 9, 10, 11, 12,
-1, -1, -1, -1, -1, -1);
v = _mm256_shuffle_epi8(v, shuffleIndices );
// Split the vector into halves
// Low half: full 16-byte store to dst .. dst+15; its 6 zero filler bytes are overwritten below
__m128i low16 = _mm256_castsi256_si128( v );
_mm_storeu_si128( ( __m128i* )dst, low16 ); // vmovdqu mem, xmm no masking
// An AVX-512BW masked store avoiding writing past the end costs more instructions (and back-end uops), same front-end uops
__m128i high16 = _mm256_extracti128_si256( v, 1 ); // vextracti128 xmm, ymm, 1
// Mask 0x3FF enables only the low 10 bytes, so just dst+10 .. dst+19 are written
_mm_mask_storeu_epi8( dst+10, 0x3FF, high16 ); // vmovdqu8 [mem]{k}, xmm
This needs vextracti128 xmm, ymm, 1
to set up for vmovdqu8
. Unlike with writing 26 bytes, we can't extract directly to memory. There is no vextracti8x16
, only vextracti32x4
and 64x2
(and 32x8 / 64x4 256-bit extracts). We need byte-granularity masking but can't get it with an instruction that extracts directly to memory, only via a shuffle (vextract
into a register) and then vmovdqu8
.
So the asm we get is
# clang
... vpshufb result in YMM0
vmovdqu [rdi], xmm0 # same as before
vextracti128 xmm0, ymm0, 1 # 1 shuffle uop
mov ax, 1023
kmovd k1, eax # will be hoisted
vmovdqu8 [rdi + 10] {k1}, xmm0 # 1 micro-fused uop
Since vextracti128 [mem], ymm, 1
was 2 front-end uops anyway, this doesn't hurt front-end throughput. (It does create more pressure on back-end execution ports, thanks to the shuffle uop).
来源:https://stackoverflow.com/questions/66091979/keep-only-the-10-useful-bits-in-16-bit-words