SSE 4 popcount for 16 8-bit values?

I have the following code which compiles with GCC using the flag -msse4 but the problem is that the pop count only gets the last four 8-bits of the converted __m128i type. Basically what I want is to count all 16 numbers inside the __m128i type but I'm not sure what intrinsic function call to make after creating the variable popA. Somehow popA has to be converted into an integer that contains all the 128-bits of information? I suppose theres _mm_cvtsi128_si64 and using a few shuffle few operations but my OS is 32-bit. Is there only the shuffle method and using _mm_cvtsi128_si32?

EDIT: If the shuffle method is the only option I need help implementing it for my 32-bit OS, please.

Heres the code.

#include <stdio.h>
#include <smmintrin.h>
#include <emmintrin.h>

int main(void)
{
    int A = 1;
    __m128i popA = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);

    unsigned int integer = _mm_cvtsi128_si32(popA);
    //long long LONG = _mm_cvtsi128_si64(popA);//my OS is 32-bits so no luck here

    printf("integer = %d\n", integer);
    int pop = _mm_popcnt_u32(integer);
    //int popLONG = _mm_popcnt_u64(LONG);
    printf("popcount = %d\n", pop);
    //printf("popcount LONG = %d\n", popLONG);

    return 0;
}

EDIT 2: This one finally runs (with GCC compiler flags -msse -msse2 -msse3 -msse4) although I'm not sure if the output for pop_count1() is correct.

Output: pop_count1(): 1799 1799 1799 1799 1799 1799 1799 1799

pop_count2():population count for each byte: 1 1 1 1 1 1 1 1 0 1 2 3 4 5 6 7

  #include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>
#include <tmmintrin.h>

void print128_num(__m128i var)
{
    uint16_t *val = (uint16_t*) &var;
    printf("pop_count1(): %i %i %i %i %i %i %i %i \n",
           val[0], val[1], val[2], val[3], val[4], val[5],
           val[6], val[7]);
}
static __m128i parallelPopcnt16bytes (__m128i xmm)//for pop_count2
{
    const __m128i mask4 = _mm_set1_epi8 (0x0F);
    const __m128i lookup = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   __m128i low, high, count;

   low = _mm_and_si128 (mask4, xmm);
   high = _mm_and_si128 (mask4, _mm_srli_epi16 (xmm, 4));
   count = _mm_add_epi8 (_mm_shuffle_epi8 (lookup, low), _mm_shuffle_epi8 (lookup, high));
   return count;
}
void pop_count1()
{
    int A = 1;
    __m128i in = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
    __m128i bit0 = _mm_set1_epi8( 0x80 );
    __m128i mask0 = _mm_and_si128( in, bit0 );
    __m128i sum = _mm_cmpeq_epi8( mask0, _mm_setzero_si128() );

/* general pattern */
    __m128i bit1 = _mm_set1_epi8( 0x40 );
    __m128i mask1 = _mm_and_si128( in, bit1 );
    mask1 = _mm_cmpeq_epi8( mask1, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask1 );

/* next bit */
    __m128i bit2 = _mm_set1_epi8( 0x20 );
    __m128i mask2 = _mm_and_si128( in, bit2 );
    mask2 = _mm_cmpeq_epi8( mask2, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask2 );

    __m128i bit3 = _mm_set1_epi8( 0x10 );
    __m128i mask3 = _mm_and_si128( in, bit3 );
    mask3 = _mm_cmpeq_epi8( mask3, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask3 );

    __m128i bit4 = _mm_set1_epi8( 0x08 );
    __m128i mask4 = _mm_and_si128( in, bit4 );
    mask4 = _mm_cmpeq_epi8( mask4, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask4 );

    __m128i bit5 = _mm_set1_epi8( 0x04 );
    __m128i mask5 = _mm_and_si128( in, bit5 );
    mask5 = _mm_cmpeq_epi8( mask5, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask5 );

    __m128i bit6 = _mm_set1_epi8( 0x02 );
    __m128i mask6 = _mm_and_si128( in, bit6 );
    mask6 = _mm_cmpeq_epi8( mask6, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask6 );

    __m128i bit7 = _mm_set1_epi8( 0x01 );
    __m128i mask7 = _mm_and_si128( in, bit7 );
    mask7 = _mm_cmpeq_epi8( mask7, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask7 );

/* finish up */
    sum = _mm_sub_epi8( _mm_setzero_si128(), sum );

    print128_num(sum);
}
void pop_count2()
{
    int index;
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);

    printf ("pop_count2():population count for each byte:");
    for (index = 15; index >= 0; index--)
        {
        uint8_t *bytes = (void *) &counts;
        printf (" %d", bytes [index]);
        }
    printf ("\n");
}
int main(void)
{
    pop_count1();
    pop_count2();

    return 0;
}

SSE 4 popcount for 16 8-bit values can be done in parallel this way:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

//----------------------------------------------------------------------------
//
// parallelPopcnt16bytes - find population count for 8-bit groups in xmm (16 groups)
//                         each byte of xmm result contains a value ranging from 0 to 8
//
static __m128i parallelPopcnt16bytes (__m128i xmm)
   {
    const __m128i mask4 = _mm_set1_epi8 (0x0F);
    const __m128i lookup = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   __m128i low, high, count;

   low = _mm_and_si128 (mask4, xmm);
   high = _mm_and_si128 (mask4, _mm_srli_epi16 (xmm, 4));
   count = _mm_add_epi8 (_mm_shuffle_epi8 (lookup, low), _mm_shuffle_epi8 (lookup, high));
   return count;
   }

//----------------------------------------------------------------------------

int main (void)
    {
    int index;
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);

    printf ("population count for each byte:");
    for (index = 15; index >= 0; index--)
        {
        uint8_t *bytes = (void *) &counts;
        printf (" %d", bytes [index]);
        }
    printf ("\n");
    return 0;
    }

//----------------------------------------------------------------------------

popcnt was introduced simultaneously with the SSE4.2 ISA extension but does not operate on SSE vector registers. You will need a separate instruction for each individual result.

Furthermore it's not defined for 8-bit operands. You will need to pad to 16 bits if you need a count for each individual byte.

You could sum 8 bytes at a time in 64-bit registers, but that doesn't sound like what you're after.

Reference: The SSE4 manual.

SSE2 solution.

I haven't tested this, but you could AND the SSE register with 0x80808080… to get a 16-byte mask of all 1's or all 0's. Repeat for all 8 bits in a byte, and sum the masks. Since all 1's represents -1 in two's complement, negate the 16 bytes, and you have all the results.

The AND and comparison operations should be able to run in parallel. The chain of additions is dependent but it should still run plenty fast, and it fits in 32 instructions. (Only 7 additions needed.)

/* init */
__m128i bit0 = _mm_set1_epi8( 0x80 );
__m128i mask0 = _mm_and_si128( in, bit0 );
__m128i sum = _mm_cmpeq_epi8( mask0, _mm_setzero_si128() );

/* general pattern */
__m128i bit1 = _mm_set1_epi8( 0x40 );
__m128i mask1 = _mm_and_si128( in, bit1 );
mask1 = _mm_cmpeq_epi8( mask1, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask1 );

/* next bit */
__m128i bit2 = _mm_set1_epi8( 0x20 );
__m128i mask2 = _mm_and_si128( in, bit2 );
mask2 = _mm_cmpeq_epi8( mask2, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask2 );

...

/* finish up */
sum = _mm_sub_epi8( _mm_setzero_si128(), sum );

来源：https://stackoverflow.com/questions/17518774/sse-4-popcount-for-16-8-bit-values

标签

gcc

counter

sse

intrinsics

population