Efficient way to set first N or last N bits of __m256i to 1, the rest to 0

后端 未结 1 534
再見小時候
再見小時候 2021-01-04 21:34

How to set to 1 efficiently with AVX2

  1. first N bits
  2. last N bits

of __m256i, setting

相关标签:
1条回答
  • 2021-01-04 22:15

    The AVX2 shift instructions vpsllvd and vpsrlvd have the nice property that shift counts greater than or equal to 32 lead to zero integers within the ymm register. In other words: the shift counts are not masked, in contrast to the shift counts for the x86 scalar shift instructions.

    Therefore the code is fairly simple:

    /*
    gcc -O3 -m64 -Wall -mavx2 -march=broadwell avx2_bit_mask.c
    */
    #include <immintrin.h>
    #include <stdio.h>
    
    __m256i bit_mask_avx2_msb(unsigned int n)      
    {           
        __m256i ones       = _mm256_set1_epi32(-1);
        __m256i cnst32_256 = _mm256_set_epi32(32,64,96,128, 160,192,224,256);
    
        __m256i shift      = _mm256_set1_epi32(n);   
                shift      = _mm256_subs_epu16(cnst32_256,shift);  
                      return _mm256_sllv_epi32(ones,shift);         
    }
    
    
    __m256i bit_mask_avx2_lsb(unsigned int n)               
    {           
        __m256i ones       = _mm256_set1_epi32(-1);
        __m256i cnst32_256 = _mm256_set_epi32(256,224,192,160, 128,96,64,32);
    
        __m256i shift      = _mm256_set1_epi32(n);   
                shift      = _mm256_subs_epu16(cnst32_256,shift);  
                      return _mm256_srlv_epi32(ones,shift);
    }
    
    
    int print_avx2_hex(__m256i ymm)
    {
        long unsigned int x[4];
            _mm256_storeu_si256((__m256i*)x,ymm);
            printf("%016lX %016lX %016lX %016lX\n", x[3],x[2],x[1],x[0]);
    
        return 0;
    }
    
    
    int main()
    {
        unsigned int i;
    
        for (i=0;i<259;i++){
            printf("bit_mask_avx2_lsb(%3d) ",i);
            print_avx2_hex(bit_mask_avx2_lsb(i));
        }
        printf("\n");
    
        for (i=0;i<259;i++){
            printf("bit_mask_avx2_msb(%3d) ",i);
            print_avx2_hex(bit_mask_avx2_msb(i));
        }
        printf("\n");
    
    
        return 0;
    }
    

    The results are:

    $ ./a.out
    bit_mask_avx2_lsb(  0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_lsb(  1) 0000000000000000 0000000000000000 0000000000000000 0000000000000001
    bit_mask_avx2_lsb(  2) 0000000000000000 0000000000000000 0000000000000000 0000000000000003
    bit_mask_avx2_lsb(  3) 0000000000000000 0000000000000000 0000000000000000 0000000000000007
    bit_mask_avx2_lsb(  4) 0000000000000000 0000000000000000 0000000000000000 000000000000000F
    bit_mask_avx2_lsb(  5) 0000000000000000 0000000000000000 0000000000000000 000000000000001F
    bit_mask_avx2_lsb(  6) 0000000000000000 0000000000000000 0000000000000000 000000000000003F
    bit_mask_avx2_lsb(  7) 0000000000000000 0000000000000000 0000000000000000 000000000000007F
    bit_mask_avx2_lsb(  8) 0000000000000000 0000000000000000 0000000000000000 00000000000000FF
    bit_mask_avx2_lsb(  9) 0000000000000000 0000000000000000 0000000000000000 00000000000001FF
    bit_mask_avx2_lsb( 10) 0000000000000000 0000000000000000 0000000000000000 00000000000003FF
    bit_mask_avx2_lsb( 11) 0000000000000000 0000000000000000 0000000000000000 00000000000007FF
    ...
    bit_mask_avx2_lsb(124) 0000000000000000 0000000000000000 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(125) 0000000000000000 0000000000000000 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(126) 0000000000000000 0000000000000000 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(127) 0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(128) 0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(129) 0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(130) 0000000000000000 0000000000000003 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(131) 0000000000000000 0000000000000007 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(132) 0000000000000000 000000000000000F FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    ...
    bit_mask_avx2_lsb(248) 00FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(249) 01FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(250) 03FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(251) 07FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(252) 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(253) 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(254) 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(255) 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_lsb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    
    
    
    bit_mask_avx2_msb(  0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  1) 8000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  2) C000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  3) E000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  4) F000000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  5) F800000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  6) FC00000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  7) FE00000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  8) FF00000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb(  9) FF80000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb( 10) FFC0000000000000 0000000000000000 0000000000000000 0000000000000000
    bit_mask_avx2_msb( 11) FFE0000000000000 0000000000000000 0000000000000000 0000000000000000
    ...
    bit_mask_avx2_msb(124) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0 0000000000000000 0000000000000000
    bit_mask_avx2_msb(125) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8 0000000000000000 0000000000000000
    bit_mask_avx2_msb(126) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC 0000000000000000 0000000000000000
    bit_mask_avx2_msb(127) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE 0000000000000000 0000000000000000
    bit_mask_avx2_msb(128) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 0000000000000000 0000000000000000
    bit_mask_avx2_msb(129) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 8000000000000000 0000000000000000
    bit_mask_avx2_msb(130) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF C000000000000000 0000000000000000
    bit_mask_avx2_msb(131) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF E000000000000000 0000000000000000
    bit_mask_avx2_msb(132) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF F000000000000000 0000000000000000
    ...
    bit_mask_avx2_msb(248) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF00
    bit_mask_avx2_msb(249) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF80
    bit_mask_avx2_msb(250) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFC0
    bit_mask_avx2_msb(251) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFE0
    bit_mask_avx2_msb(252) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0
    bit_mask_avx2_msb(253) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8
    bit_mask_avx2_msb(254) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC
    bit_mask_avx2_msb(255) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE
    bit_mask_avx2_msb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_msb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    bit_mask_avx2_msb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
    

    For a value n, with 256<=n<=65535, all bits are set to one, as one might expect. The upper limit of 65535 is due to the 16-bit saturated arithmetic of _mm256_subs_epu16(). With n=65536 the bitmask (the output value) is zero. It is possible to modify the code such that all bits are set to one for the range of 256<=n<=INT_MAX. This can be achieved by replacing shift = _mm256_subs_epu16(cnst32_256,shift); with

        __m256i mask       = _mm256_cmpgt_epi32(cnst32_256,shift);
                shift      = _mm256_sub_epi32(cnst32_256,shift);
                shift      = _mm256_and_si256(shift,mask);
    

    These three intrinsics more or less emulate _mm256_subs_epu32(cnst32_256,shift), which doesn't exist.

    0 讨论(0)
提交回复
热议问题