Matrix transpose and population count

前端 未结 3 1467
旧巷少年郎
旧巷少年郎 2021-01-22 18:03

I have a square boolean matrix M of size N, stored by rows and I want to count the number of bits set to 1 for each column.

For instance for n=4:

1101
01         


        
相关标签:
3条回答
  • 2021-01-22 18:41

    I made some benchmark between the two approaches:

    1. transpose + popcount
    2. update row by row

    I wrote a naive version and an AVX2 one for both approaches. I used some functions (found on stackoverflow or elsewhere) for the AVX2 "transpose+popcount" approach.

    In my test, I make the assumption that the input is a nbRowsx32 matrix in a bits packed format (nbRows itself being a multiple of 32); the matrix is therefore stored as an array of uint32_t.

    The code is the following:

    #include <cinttypes>
    #include <cstdio>
    #include <cstring>
    #include <cmath>
    #include <cassert>
    #include <chrono>
    #include <immintrin.h>
    #include <asmlib.h>
    
    using namespace std;
    using namespace std::chrono;
    
    // see https://stackoverflow.com/questions/24225786/fastest-way-to-unpack-32-bits-to-a-32-byte-simd-vector
    static __m256i expand_bits_to_bytes (uint32_t x);
    
    // see https://mischasan.wordpress.com/2011/10/03/the-full-sse2-bit-matrix-transpose-routine/
    static void sse_trans(char const *inp, char *out);
    
    static double deviation (double n, double sum2, double sum);
    
    ////////////////////////////////////////////////////////////////////////////////
    // Naive approach (matrix transposition)
    ////////////////////////////////////////////////////////////////////////////////
    void test_transpose_popcnt_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        assert (nbRows%32==0);
    
        uint8_t transpo[32][32];  memset (transpo, 0, sizeof(transpo));
    
        for (uint64_t k=0; k<nbRows; k+=32)
        {
            // We unpack and transpose the input into a 32x32 bytes matrix
            for (size_t row=0; row<32; row++)
            {
                for (size_t col=0; col<32; col++)  {  transpo[col][row] = (bitmap[k+row] >> col) & 1 ;  }
            }
    
            for (size_t row=0; row<32; row++)
            {
                // We popcount the current row
                u_int8_t sum=0;
                for (size_t col=0; col<32; col++)  {  sum += transpo[row][col];  }
    
                // We update the corresponding global sum
                globalSums[row] += sum;
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Naive approach (row by row)
    ////////////////////////////////////////////////////////////////////////////////
    void test_update_row_by_row_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        for (uint64_t row=0; row<nbRows; row++)
        {
            for (size_t col=0; col<32; col++)
            {
                globalSums[col] += (bitmap[row] >> col) & 1;
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // AVX2 (matrix transposition + popcount)
    ////////////////////////////////////////////////////////////////////////////////
    void test_transpose_popcnt_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        assert (nbRows%32==0);
    
        uint32_t transpo[32];
    
        const uint32_t* loop = bitmap;
        for (uint64_t k=0; k<nbRows; loop+=32, k+=32)
        {
            // We transpose the input as a 32x32 bytes matrix
            sse_trans ((const char*)loop, (char*)transpo);
    
            // We update the global sums
            for (size_t i=0; i<32; i++)
            {
                globalSums[i] += __builtin_popcount (transpo[i]);
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // AVX2 approach (update totals row by row)
    ////////////////////////////////////////////////////////////////////////////////
    
    // Note: we use template specialization to unroll some portions of a loop
    template<int N>
    void UpdateLocalSums (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
    {
        // We update the local sums with the current row
        localSums = _mm256_sub_epi8 (localSums, expand_bits_to_bytes (bitmap[k++]));
    
        // Go recursively
        UpdateLocalSums<N-1>(localSums, bitmap, k);
    }
    
    template<>
    void UpdateLocalSums<0> (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
    {
    }
    
    // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
    #define USE_AVX2_FOR_GRAND_TOTALS 1
    
    void test_update_row_by_row_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        union U256i {  __m256i v;   uint8_t a[32];  uint32_t b[8];  };
    
        // We use 1 register for updating local totals
        __m256i   localSums = _mm256_setzero_si256();
    
    #ifdef USE_AVX2_FOR_GRAND_TOTALS
        // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
        __m256i   globalSumsReg[4];  for (size_t r=0; r<4; r++)  {   globalSumsReg[r] = _mm256_setzero_si256(); }
    #endif
    
        uint64_t steps = nbRows / 255;
        uint64_t k=0;
    
        const int divisorOf255 = 5;
    
        // We iterate over all rows
        for (uint64_t i=0; i<steps; i++)
        {
            // we update the local totals (255*32=8160 additions)
            for (int j=0; j<255/divisorOf255; j++)
            {
                // unroll some portion of the 255 loop through template specialization
                UpdateLocalSums<divisorOf255>(localSums, bitmap, k);
            }
    
    #ifdef USE_AVX2_FOR_GRAND_TOTALS
            // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums
    
            // We take the 128 high bits of the local sums
            __m256i   localSums2 = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(localSums,1));
    
            globalSumsReg[0] = _mm256_add_epi32 (globalSumsReg[0],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 0)))
            );
            globalSumsReg[1] = _mm256_add_epi32 (globalSumsReg[1],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 8)))
            );
            globalSumsReg[2] = _mm256_add_epi32 (globalSumsReg[2],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 0)))
            );
            globalSumsReg[3] = _mm256_add_epi32 (globalSumsReg[3],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 8)))
            );
    #else
            // we update the global totals
            U256i tmp = { localSums };
            for (size_t k=0; k<32; k++)  {  globalSums[k] += tmp.a[k];  }
    #endif
            // we reset the local totals
            localSums = _mm256_setzero_si256();
        }
    
    #ifdef USE_AVX2_FOR_GRAND_TOTALS
        // We update the global totals into the final uint32_t array
        for (size_t r=0; r<4; r++)
        {
            U256i tmp = { globalSumsReg[r] };
            for (size_t k=0; k<8; k++)  {  globalSums[r*8+k] += tmp.b[k];  }
        }
    #endif
    
        // we update the remaining local totals
        for (uint64_t i=steps*255; i<nbRows; i++)
        {
            UpdateLocalSums<1>(localSums, bitmap, k);
        }
    
        // we update the global totals
        U256i tmp = { localSums };
        for (size_t k=0; k<32; k++)  {  globalSums[k] += tmp.a[k];  }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    void execute (
        const char* name,
        void (*fct)(uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums),
        size_t nbRuns,
        uint64_t nbRows,
        u_int32_t* bitmap
    )
    {
        uint64_t  sums[32];
    
        double timeTotal=0;
        double cycleTotal=0;
        double timeTotal2=0;
        double cycleTotal2=0;
        uint64_t check=0;
    
        for (size_t n=0; n<nbRuns; n++)
        {
            memset(sums,0,sizeof(sums));
    
            // We want both time and cpu cycles information
            milliseconds t0 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
            uint64_t c0 = ReadTSC();
    
            // We run the test
            (*fct) (nbRows, bitmap, sums);
    
            uint64_t c1 = ReadTSC();
            milliseconds t1 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
    
            timeTotal  += (t1-t0).count();
            cycleTotal += (double)(c1-c0) / nbRows;
    
            timeTotal2  += (t1-t0).count() * (t1-t0).count();
            cycleTotal2 += ((double)(c1-c0) / nbRows) * ((double)(c1-c0) / nbRows);
    
            // We compute some dummy checksum
            for (size_t k=0; k<32; k++)  {  check += sums[k];  }
        }
    
        printf ("%-21s |  %5.0lf (%5.1lf)            |  %5.2lf (%4.2lf)          |  %.3lf           |  0x%lx\n",
            name,
            timeTotal / nbRuns,
            deviation (nbRuns, timeTotal2, timeTotal),
            cycleTotal/nbRuns,
            deviation (nbRuns, cycleTotal2, cycleTotal),
            check,
            nbRows * cycleTotal / timeTotal / 1000000.0
        );
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    int main(int argc, char **argv)
    {
        // We set rows number as 2^n where n is the provided argument
        // For simplification, we assume that the rows number is a multiple of 32
        uint64_t nbRows = 1ULL << (argc>1 ? atoi(argv[1]) : 28);
        size_t   nbRuns = argc>2 ? atoi(argv[2]) : 10;
    
        // We build an bitmap of size nbRows*32
        uint32_t* bitmap = new uint32_t[nbRows];
        if (bitmap==nullptr)
        {
            fprintf(stderr, "unable to allocate the bitmap\n");
            exit(1);
        }
    
        // We fill the bitmap with random values
        srand(time(nullptr));
        for (uint64_t i=0; i<nbRows; i++)    {  bitmap[i] = rand() & 0xFFFFFFFF;  }
    
        printf ("\n");
        printf ("nbRows=%ld  nbRuns=%ld\n", nbRows, nbRuns);
        printf ("------------------------------------------------------------------------------------------------------------\n");
        printf ("name                  | time in msec : mean (sd)  | cycles/row : mean (sd) | frequency in GHz | checksum\n");
        printf ("------------------------------------------------------------------------------------------------------------\n");
    
        // We launch the benchmark
        execute ("naive (transpo)   ", test_transpose_popcnt_naive,  nbRuns, nbRows, bitmap);
        execute ("naive (row by row)", test_update_row_by_row_naive, nbRuns, nbRows, bitmap);
        execute ("AVX2  (transpo)   ", test_transpose_popcnt_avx2,   nbRuns, nbRows, bitmap);
        execute ("AVX2  (row by row)", test_update_row_by_row_avx2,  nbRuns, nbRows, bitmap);
    
        printf ("\n");
    
        // Some clean up
        delete[] bitmap;
    
        return EXIT_SUCCESS;
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    __m256i expand_bits_to_bytes(uint32_t x)
    {
        __m256i xbcast = _mm256_set1_epi32(x);
    
        // Each byte gets the source byte containing the corresponding bit
        __m256i shufmask = _mm256_set_epi64x(
            0x0303030303030303, 0x0202020202020202,
            0x0101010101010101, 0x0000000000000000);
        __m256i shuf     = _mm256_shuffle_epi8(xbcast, shufmask);
        __m256i andmask  = _mm256_set1_epi64x(0x8040201008040201);  // every 8 bits -> 8 bytes, pattern repeats.
        __m256i isolated_inverted = _mm256_and_si256(shuf, andmask);
    
        // Avoid an _mm256_add_epi8 thanks to Peter Cordes's comment
        return _mm256_cmpeq_epi8(isolated_inverted, andmask);
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    void sse_trans(char const *inp, char *out)
    {
    #define INP(x,y) inp[(x)*4 + (y)/8]
    #define OUT(x,y) out[(y)*4 + (x)/8]
    
        int rr, cc, i, h;
        union { __m256i x; uint8_t b[32]; } tmp;
    
        for (cc = 0; cc < 32; cc += 8)
        {
            for (i = 0; i < 32; ++i)
                tmp.b[i] = INP(i, cc);
    
            for (i = 8; i--; tmp.x = _mm256_slli_epi64(tmp.x, 1))
                *(uint32_t*)&OUT(0, cc + i) = _mm256_movemask_epi8(tmp.x);
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    double deviation (double n, double sum2, double sum)  {  return sqrt (sum2/n - (sum/n)*(sum/n)); }
    

    Some remarks:

    • I used the Agner Fog's asmlib to have a function that returns CPU cycles
    • The compilation command is g++ -O3 -march=native ../Test.cpp -o ./Test -laelf64
    • The gcc version is 7.3.1
    • The CPU is Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
    • I compute some dummy checksum to compare the results of the different tests

    Now the results:

    ------------------------------------------------------------------------------------------------------------
    name                  | time in msec : mean (sd)  | cycles/row : mean (sd) | frequency in GHz | checksum
    ------------------------------------------------------------------------------------------------------------
    naive (transpo)       |   4548 ( 36.5)            |  43.91 (0.35)          |  2.592           |  0x9affeb5a6
    naive (row by row)    |   3033 ( 11.0)            |  29.29 (0.11)          |  2.592           |  0x9affeb5a6
    AVX2  (transpo)       |    767 ( 12.8)            |   7.40 (0.12)          |  2.592           |  0x9affeb5a6
    AVX2  (row by row)    |    130 (  4.0)            |   1.25 (0.04)          |  2.591           |  0x9affeb5a6
    

    So it seems that the "row by row" in AVX2 is the best so far.

    Note that when I saw this result (less than 2 cycles per row), I made no more effort to optimize the AVX2 "transpose+popcount" method, which should be feasable by computing several popcounts in parallel (I may test it later).

    0 讨论(0)
  • 2021-01-22 18:42

    I eventually wrote another implementation, following the high entropy SWAR approach proposed by Peter Cordes. This implementation is recursive and relies on C++ template specialization.

    The global idea is to fill N-bit accumulators to their maximum without carry overflow (this is where recursion is used). When these accumulators are filled, we update the grand totals and we start again with new N-bit accumulators to fill until all rows have been processed.

    Here is the code (see function test_SWAR_recursive):

    #include <immintrin.h>
    #include <cassert>
    #include <chrono>
    #include <cinttypes>
    #include <cmath>
    #include <cstdio>
    #include <cstring>
    
    using namespace std;
    using namespace std::chrono;
    
    // avoid the #include <asmlib.h>
    extern "C" u_int64_t ReadTSC();
    
    static double deviation (double n, double sum2, double sum)  {  return sqrt (sum2/n - (sum/n)*(sum/n)); }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Recursive SWAR approach (with template specialization)
    ////////////////////////////////////////////////////////////////////////////////
    
    template<int DEPTH>
    struct RecursiveSWAR
    {
        // Number of accumulators for current depth
        static const int N = 1<<DEPTH;
    
        // Array of N-bit accumulators
        typedef __m256i Array[N];
    
        // Magic numbers (0x55555555, 0x33333333, ...) computed recursively
        static const u_int32_t MAGIC_NUMBER =
            RecursiveSWAR<DEPTH-1>::MAGIC_NUMBER
                * (1 + (1<<(1<<(DEPTH-1))))
                / (1 + (1<<(1<<(DEPTH+0))));
    
        static void fillAccumulators (u_int32_t*& begin, const u_int32_t* end, Array accumulators)
        {
            // We reset the N-bit accumulators
            for (int i=0; i<N; i++)  {  accumulators[i] = _mm256_setzero_si256();  }
    
            // We check (only for depth big enough) that we have still rows to process
            if (DEPTH>=3)  if (begin>=end)  { return; }
    
            typename RecursiveSWAR<DEPTH-1>::Array accumulatorsMinusOne;
    
            // We load a register with the mask
            __m256i mask = _mm256_set1_epi32 (RecursiveSWAR<DEPTH-1>::MAGIC_NUMBER);
    
            // We fill the N-bit accumulators to their maximum capacity without carry overflow
            for (int i=0; i<N+1; i++)
            {
                // We fill (N-1)-bit accumulators recursively
                RecursiveSWAR<DEPTH-1>::fillAccumulators (begin, end, accumulatorsMinusOne);
    
                // We update the N-bit accumulators from the (N-1)-bit accumulators
                for (int j=0; j<RecursiveSWAR<DEPTH-1>::N; j++)
                {
                    // LOW part
                    accumulators[2*j+0] = _mm256_add_epi32 (
                        accumulators[2*j+0],
                        _mm256_and_si256 (
                            accumulatorsMinusOne[j],
                            mask
                        )
                    );
    
                    // HIGH part
                    accumulators[2*j+1] = _mm256_add_epi32 (
                        accumulators[2*j+1],
                        _mm256_and_si256 (
                            _mm256_srli_epi32 (
                                accumulatorsMinusOne[j],
                                RecursiveSWAR<DEPTH-1>::N
                            ),
                            mask
                        )
                    );
                }
            }
        }
    };
    
    // Template specialization for DEPTH=0
    template<>
    struct RecursiveSWAR<0>
    {
        static const int N = 1;
    
        typedef __m256i Array[N];
    
        static const u_int32_t MAGIC_NUMBER = 0x55555555;
    
        static void fillAccumulators (u_int32_t*& begin, const u_int32_t* end, Array result)
        {
            // We just load 8 rows in the AVX2 register
            result[0] = _mm256_loadu_si256 ((__m256i*)begin);
    
            // We update the iterator
            begin += 1*sizeof(__m256i)/sizeof(u_int32_t);
        }
    };
    
    template<int DEPTH> struct TypeInfo  { };
    template<> struct TypeInfo<3>  {  typedef u_int8_t  Type; };
    template<> struct TypeInfo<4>  {  typedef u_int16_t Type; };
    template<> struct TypeInfo<5>  {  typedef u_int32_t Type; };
    
    unsigned char reversebits (unsigned char b)
    {
        return ((b * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
    }
    
    void test_SWAR_recursive (uint64_t nbRows, const uint32_t* bitmap, uint32_t*  globalSums)
    {
        static const int DEPTH = 4;
    
        RecursiveSWAR<DEPTH>::Array accumulators;
    
              uint32_t* begin = (uint32_t*) bitmap;
        const uint32_t* end   = bitmap + nbRows;
    
        // We reset the grand totals
        for (int i=0; i<32; i++)  { globalSums[i] = 0; }
    
        while (begin < end)
        {
            // We fill the N-bit accumulators to the maximum without overflow
            RecursiveSWAR<DEPTH>::fillAccumulators (begin, end, accumulators);
    
            // We update grand totals from the filled N-bit accumulators
            for (int i=0; i<RecursiveSWAR<DEPTH>::N; i++)
            {
                int r = reversebits(i) >> (8-DEPTH);
                u_int32_t* sums   = globalSums+r;
                TypeInfo<DEPTH>::Type*  values = (TypeInfo<DEPTH>::Type*) (accumulators+i);
    
                for (int j=0; j<8*(1<<(5-DEPTH)); j++)
                {
                    sums[(j*RecursiveSWAR<DEPTH>::N) % 32] += values[j];
                }
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    void execute (
        const char* name,
        void (*fct)(uint64_t nbRows, const uint32_t* bitmap, uint32_t*  globalSums),
        size_t nbRuns,
        uint64_t nbRows,
        u_int32_t* bitmap
    )
    {
        uint32_t  sums[32];
    
        double timeTotal=0;
        double cycleTotal=0;
        double timeTotal2=0;
        double cycleTotal2=0;
        uint64_t check=0;
    
        for (size_t n=0; n<nbRuns; n++)
        {
            memset(sums,0,sizeof(sums));
    
            // We want both time and cpu cycles information
            milliseconds t0 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
            uint64_t c0 = ReadTSC();
    
            // We run the test
            (*fct) (nbRows, bitmap, sums);
    
            uint64_t c1 = ReadTSC();
            milliseconds t1 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
    
            timeTotal  += (t1-t0).count();
            cycleTotal += (double)(c1-c0) / nbRows;
    
            timeTotal2  += (t1-t0).count() * (t1-t0).count();
            cycleTotal2 += ((double)(c1-c0) / nbRows) * ((double)(c1-c0) / nbRows);
    
            // We compute some dummy checksum
            for (size_t k=0; k<32; k++)  {  check += (k+1)*sums[k];  }
        }
    
        printf ("%-21s |  %5.0lf (%5.1lf)            |  %5.2lf (%5.3lf)         |  %.3lf           |  0x%lx\n",
            name,
            timeTotal / nbRuns,
            deviation (nbRuns, timeTotal2, timeTotal),
            cycleTotal/nbRuns,
            deviation (nbRuns, cycleTotal2, cycleTotal),
            nbRows * cycleTotal / timeTotal / 1000000.0,
            check/nbRuns
        );
    }
    
    
    ////////////////////////////////////////////////////////////////////////////////
    int main(int argc, char **argv)
    {
        // We set rows number as 2^n where n is the provided argument
        // For simplification, we assume that the rows number is a multiple of 32
        uint64_t nbRows = 1ULL << (argc>1 ? atoi(argv[1]) : 28);
        size_t   nbRuns = argc>2 ? atoi(argv[2]) : 10;
    
        // We build an bitmap of size nbRows*32
        uint64_t actualNbRows = nbRows + 100000;
        uint32_t* bitmap = (uint32_t*)_mm_malloc(sizeof(uint32_t)*actualNbRows, 256);
        if (bitmap==nullptr)
        {
            fprintf(stderr, "unable to allocate the bitmap\n");
            exit(1);
        }
        memset (bitmap, 0, sizeof(u_int32_t)*actualNbRows);
    
        // We fill the bitmap with random values
        //    srand(time(nullptr));
        for (uint64_t i=0; i<nbRows; i++)    {  bitmap[i] = rand() & 0xFFFFFFFF;  }
    
    
        printf ("\n");
        printf ("nbRows=%ld  nbRuns=%ld\n", nbRows, nbRuns);
        printf ("------------------------------------------------------------------------------------------------------------\n");
        printf ("name                  | time in msec : mean (sd)  | cycles/row : mean (sd) | frequency in GHz | checksum\n");
        printf ("------------------------------------------------------------------------------------------------------------\n");
    
        // We launch the benchmark
        execute ("AVX2  (SWAR rec)  ", test_SWAR_recursive,          nbRuns, nbRows, bitmap);
    
        printf ("\n");
    
        // Some clean up
        _mm_free (bitmap);
    
        return EXIT_SUCCESS;
    }
    

    The size of the accumulators is 2DEPTH in this code. Note that this implementation is valid up to DEPTH=5. For DEPTH=4, here are the performance results compared to the implementation of Peter Cordes (named high entropy SWAR):

    The graph gives the number of cycles required to process a row (of 32 items) as a function of the number of rows of the matrix. As expected, the results are pretty similar since the main idea is the same. It is interesting to note the three parts of the graph:

    • constant value for log2(n)<=20
    • increasing value for log2(n) between 20 and 22
    • constant value for log2(n)>=22

    I guess that CPU caches properties can explain this behaviour.

    0 讨论(0)
  • 2021-01-22 18:52

    Related: Count each bit-position separately over many 64-bit bitmasks, with AVX but not AVX2 and https://github.com/mklarqvist/positional-popcount


    I had another idea which I haven't finished writing up nicely.

    Godbolt link to messy work-in-progress which doesn't have correct loop bounds / cleanup, but for large buffers runs ~3x faster than @edrezen's version on my Skylake i7-6700k, with g++7.3 -O3 -march=native. See the test_SWAR_avx2 function. (I know it doesn't compile on Godbolt; Agner Fog's asmlib.h isn't present.)

    I might have some columns in the wrong order, too, but from stepping through the asm I think it's doing the right amount of work. i.e. any necessary bugfixes won't slow it down.

    I used 16-bit accumulators, so another outer loop might be necessary if you care about inputs large enough to overflow 16-bit per-column counters.

    Interesting observation: An earlier buggy version of my loop used sum0123 twice in store_globalsums_from_vec16, leaving sum4567 unused, so it optimized away in the main loop. With less work, gcc fully unrolled the large for(int i=0 ; i<5 ; i++) loop, and the code ran slower, like about 1 cycle per byte instead of 0.5. The loop was probably too big for the uop cache or something (I didn't profile yet but a front-end decode bottleneck would explain it). For some reason @edrezen's version is only running at about 1.5c/B for me, not the ~1.25 reported in the answer. My CPU is actually running 3.9GHz, but Agner Fog's library detects it at 4.0, but that's not enough to explain it.

    Also, gcc spills sum4567_16bit to the stack, so we're already pushing the boundary of register pressure without AVX512. It's updated infrequently and isn't a problem, but needing more accumulators in the inner loop could be.


    Your data layout isn't clear about when the number of columns isn't 32.

    It seems that for each uint32_t chunk of 32 columns, you have all the rows stored contiguously in memory. i.e. looping over the rows for a column is efficient. If you had more than 32 columns, the rows for columns 32..63 will be contiguous and come after all the rows for columns 0..31.

    (If instead you have all the columns for a single row contiguous, you could still use this idea, but might need to spill/reload some accumulators to memory, or let the compiler do that for you if it makes good choices.)

    So loading a 32-byte (8 dword) vector gets 8 rows of data for one column chunk. That's extremely convenient, and allows widening from 1-bit (in memory) to 2-bit accumulators, then grab more data before we widen to 4-bit, and so on, summing along the way so we get significant work done while the data is still dense. (Rather than only adding 1 bit (0 or 1) per byte to vector accumulators.)

    The more we unroll, the more data we can grab from memory to make better use of the coding space in our vectors. i.e. our variables have higher entropy. Throwing around more data (in terms of bits of memory that contributed to it) per vpaddb/w/d/q or unpack/shuffle instruction is a Good Thing.

    Accumulators narrower than 1 byte within a SIMD vector is basically an https://en.wikipedia.org/wiki/SWAR technique, where you have to AND away bits that you shift past an element boundary, because we don't have SIMD element boundaries to do it for us. (And we avoid overflow anyway, so ADD carrying into the next element isn't a problem.)

    Each inner loop iteration:

    • take a vector of data from the same columns in each of 2 or 3 (groups of) rows. So you either have 3 * 8 rows from one chunk of 32 columns, or 3 rows of 256 columns.

    • mask them with set1(0b01010101) to get the even (low) bits, and with (vec>>1) & mask (_mm256_srli_epi32(v,1)) to get the odd (high) bits. Use _mm256_add_epi8 to accumulate within those 2-bit accumulators. They can't overflow with only 3 ones, so carry-propagation boundaries don't actually matter.

      Each byte of your vector has 4 separate vertical sums, and you have two vectors (odd/even).

    • Repeat the above again, to get another pair of vectors from 3 vectors of data from memory.

    • Combine again to get 4 vectors of 4-bit accumulators (with possible values 0..6). Still without mixing bits from within a single 32-bit element, of course, because we must never do that. Shifts only move bits for odd / high columns to the bottom of the 2-bit or 4-bit unit that contains them so they can be added with bits that were moved the same way in other vectors.

    • _mm256_unpacklo/hi_epi8 and mask or shift+mask to get 8-bit accumulators

    • Put the above in a loop that runs up to 5 times, so the 0..12 accumulator values go up to 0..60 (i.e. leaving 2 bits of headroom for unpacking the 8-bit accumulators, using all their coding space.)


    If you have the data layout from your answer, then we can add data from dword elements within the same vector. We can do that so we don't run out of registers when widening our accumulators up to 16-bit (because x86-64 only has 16 YMM registers, and we need some for constants.)

    • _mm256_unpacklo/hi_epi16 and add, to interleave pairs of 8-bit counters so a group of counters for the same column has expanded from a dword to a qword.

    Repeat this general idea to reduce the number of registers (or __m256i variables) your accumulators are spread over.

    Efficiently handling the lack of a lane-crossing 2-input byte or word shuffle is inconvenient, but it's a pretty small part of the total work. vextracti128 / vpaddb xmm -> vpmovzxbw worked well enough.

    0 讨论(0)
提交回复
热议问题