Matrix transpose and population count

前端 未结 3 1464
旧巷少年郎
旧巷少年郎 2021-01-22 18:03

I have a square boolean matrix M of size N, stored by rows and I want to count the number of bits set to 1 for each column.

For instance for n=4:

1101
01         


        
3条回答
  •  一整个雨季
    2021-01-22 18:42

    I eventually wrote another implementation, following the high entropy SWAR approach proposed by Peter Cordes. This implementation is recursive and relies on C++ template specialization.

    The global idea is to fill N-bit accumulators to their maximum without carry overflow (this is where recursion is used). When these accumulators are filled, we update the grand totals and we start again with new N-bit accumulators to fill until all rows have been processed.

    Here is the code (see function test_SWAR_recursive):

    #include 
    #include 
    #include 
    #include 
    #include 
    #include 
    #include 
    
    using namespace std;
    using namespace std::chrono;
    
    // ReadTSC is declared by hand so that the header which would normally
    // provide it does not have to be included (the header name did not survive
    // in this listing). The function has C linkage and is defined elsewhere.
    // NOTE(review): presumably returns the CPU time-stamp counter -- confirm
    // against the actual definition.
    extern "C" u_int64_t ReadTSC();
    
    // Standard deviation of a series summarised by three numbers:
    //   n    : how many values were accumulated
    //   sum2 : sum of the squared values
    //   sum  : sum of the values
    // Computed as sqrt( mean(x^2) - mean(x)^2 ).
    static double deviation (double n, double sum2, double sum)
    {
        const double mean          = sum  / n;
        const double meanOfSquares = sum2 / n;

        return sqrt (meanOfSquares - mean*mean);
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Recursive SWAR approach (with template specialization)
    ////////////////////////////////////////////////////////////////////////////////
    
    /**
     * Bank of SWAR counters of a given DEPTH.
     *
     * At depth DEPTH every 32-bit lane of an accumulator is split into counter
     * fields of N = 2^DEPTH bits, and N accumulators are needed to cover the
     * 32 columns. A bank is filled by repeatedly filling the bank of depth
     * DEPTH-1 and spreading each of its registers over two registers of this
     * depth (even fields go to index 2*j, odd fields to index 2*j+1).
     * The recursion stops at the RecursiveSWAR<0> specialization.
     *
     * NOTE(review): the template parameter list, the declarations of N, Array
     * and MAGIC_NUMBER, the reset loop and the header of the fill loop were
     * lost from this listing and have been reconstructed from the surviving
     * fragments and from the DEPTH=0 specialization. The fill-loop bound in
     * particular is derived (see below), not recovered -- confirm against the
     * original source.
     */
    template<int DEPTH>
    struct RecursiveSWAR
    {
        // Number of accumulators for current depth (also the width in bits of one counter field)
        static const int N = 1<<DEPTH;

        // The N accumulator registers of this depth
        typedef __m256i Array[N];

        // Mask keeping every other N-bit field (0x33333333, 0x0F0F0F0F, ...),
        // obtained from the mask of the previous depth
        static const u_int32_t MAGIC_NUMBER = RecursiveSWAR<DEPTH-1>::MAGIC_NUMBER
                * (1 + (1<<(1<<(DEPTH-1))))
                / (1 + (1<<(1<<(DEPTH+0))));

        /**
         * Resets the accumulators and fills them from the rows at `begin`.
         *
         * @param begin        cursor on the next row to read; advanced past the rows consumed
         * @param end          one past the last valid row
         * @param accumulators the N registers of this depth (overwritten)
         *
         * `end` is only tested when DEPTH>=3; shallower levels read without
         * any bound check, so the input must stay readable past `end`.
         */
        static void fillAccumulators (u_int32_t*& begin, const u_int32_t* end, Array accumulators)
        {
            // We reset the N-bit accumulators
            for (int i=0; i<N; i++)  {  accumulators[i] = _mm256_setzero_si256();  }

            // We check (only for depths big enough) that some rows remain to be processed
            if (DEPTH>=3)  if (begin>=end)  { return; }

            typename RecursiveSWAR<DEPTH-1>::Array accumulatorsMinusOne;

            // We load a register with the mask
            __m256i mask = _mm256_set1_epi32 (RecursiveSWAR<DEPTH-1>::MAGIC_NUMBER);

            // A field of depth DEPTH-1 is N/2 bits wide and holds at most 2^(N/2)-1,
            // while a field of this depth holds up to 2^N-1 = (2^(N/2)-1)*(2^(N/2)+1).
            // Hence 2^(N/2)+1 additions fill a field exactly, without carry overflow.
            const int nbFills = 1 + (1<<(1<<(DEPTH-1)));

            // We fill the N-bit accumulators to their maximum capacity without carry overflow
            for (int i=0; i<nbFills; i++)
            {
                // We fill the (N-1)-bit accumulators recursively
                RecursiveSWAR<DEPTH-1>::fillAccumulators (begin, end, accumulatorsMinusOne);

                // We update the N-bit accumulators from the (N-1)-bit accumulators
                for (int j=0; j<RecursiveSWAR<DEPTH-1>::N; j++)
                {
                    // LOW part
                    accumulators[2*j+0] = _mm256_add_epi32 (
                        accumulators[2*j+0],
                        _mm256_and_si256 (
                            accumulatorsMinusOne[j],
                            mask
                        )
                    );

                    // HIGH part: bring the odd fields down onto the even positions first
                    accumulators[2*j+1] = _mm256_add_epi32 (
                        accumulators[2*j+1],
                        _mm256_and_si256 (
                            _mm256_srli_epi32 (
                                accumulatorsMinusOne[j],
                                RecursiveSWAR<DEPTH-1>::N
                            ),
                            mask
                        )
                    );
                }
            }
        }
    };
    
    // Template specialization for DEPTH=0
    // End of the recursion (DEPTH=0): the single "accumulator" is nothing more
    // than the raw bits of the next rows, i.e. one 1-bit counter per column.
    template<>
    struct RecursiveSWAR<0>
    {
        // One register is enough at this level
        static const int N = 1;

        using Array = __m256i[N];

        // Binary 0101...01: keeps every other bit
        static const u_int32_t MAGIC_NUMBER = 0x55555555;

        // Loads one register worth of rows into result[0] and moves `begin`
        // past them. `end` is not consulted here.
        static void fillAccumulators (u_int32_t*& begin, const u_int32_t* end, Array result)
        {
            // How many 32-bit rows fit in one AVX2 register (8)
            const size_t rowsPerRegister = sizeof(__m256i) / sizeof(u_int32_t);

            // Unaligned load of the rows under the cursor
            const __m256i* source = reinterpret_cast<const __m256i*>(begin);
            result[0] = _mm256_loadu_si256 (source);

            // Advance the cursor
            begin += rowsPerRegister;
        }
    };
    
    // Maps a SWAR depth to the unsigned integer type that is exactly as wide as
    // one counter field at that depth (2^DEPTH bits). Only depths 3, 4 and 5
    // have a mapping; the primary template is deliberately empty.
    // The template parameter list was missing from the listing; an int
    // non-type parameter is what the specializations below require.
    template<int DEPTH> struct TypeInfo  { };
    template<> struct TypeInfo<3>  {  typedef u_int8_t  Type; };
    template<> struct TypeInfo<4>  {  typedef u_int16_t Type; };
    template<> struct TypeInfo<5>  {  typedef u_int32_t Type; };
    
    // Returns the byte whose bit i is bit (7-i) of b, i.e. b read backwards.
    unsigned char reversebits (unsigned char b)
    {
        unsigned int v = b;

        // Swap the two nibbles, then the bit pairs inside each nibble,
        // then the two bits inside each pair.
        v = ((v & 0xF0u) >> 4) | ((v & 0x0Fu) << 4);
        v = ((v & 0xCCu) >> 2) | ((v & 0x33u) << 2);
        v = ((v & 0xAAu) >> 1) | ((v & 0x55u) << 1);

        return static_cast<unsigned char>(v);
    }
    
    void test_SWAR_recursive (uint64_t nbRows, const uint32_t* bitmap, uint32_t*  globalSums)
    {
        static const int DEPTH = 4;
    
        RecursiveSWAR::Array accumulators;
    
              uint32_t* begin = (uint32_t*) bitmap;
        const uint32_t* end   = bitmap + nbRows;
    
        // We reset the grand totals
        for (int i=0; i<32; i++)  { globalSums[i] = 0; }
    
        while (begin < end)
        {
            // We fill the N-bit accumulators to the maximum without overflow
            RecursiveSWAR::fillAccumulators (begin, end, accumulators);
    
            // We update grand totals from the filled N-bit accumulators
            for (int i=0; i::N; i++)
            {
                int r = reversebits(i) >> (8-DEPTH);
                u_int32_t* sums   = globalSums+r;
                TypeInfo::Type*  values = (TypeInfo::Type*) (accumulators+i);
    
                for (int j=0; j<8*(1<<(5-DEPTH)); j++)
                {
                    sums[(j*RecursiveSWAR::N) % 32] += values[j];
                }
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    void execute (
        const char* name,
        void (*fct)(uint64_t nbRows, const uint32_t* bitmap, uint32_t*  globalSums),
        size_t nbRuns,
        uint64_t nbRows,
        u_int32_t* bitmap
    )
    {
        uint32_t  sums[32];
    
        double timeTotal=0;
        double cycleTotal=0;
        double timeTotal2=0;
        double cycleTotal2=0;
        uint64_t check=0;
    
        for (size_t n=0; n(system_clock::now().time_since_epoch());
            uint64_t c0 = ReadTSC();
    
            // We run the test
            (*fct) (nbRows, bitmap, sums);
    
            uint64_t c1 = ReadTSC();
            milliseconds t1 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
    
            timeTotal  += (t1-t0).count();
            cycleTotal += (double)(c1-c0) / nbRows;
    
            timeTotal2  += (t1-t0).count() * (t1-t0).count();
            cycleTotal2 += ((double)(c1-c0) / nbRows) * ((double)(c1-c0) / nbRows);
    
            // We compute some dummy checksum
            for (size_t k=0; k<32; k++)  {  check += (k+1)*sums[k];  }
        }
    
        printf ("%-21s |  %5.0lf (%5.1lf)            |  %5.2lf (%5.3lf)         |  %.3lf           |  0x%lx\n",
            name,
            timeTotal / nbRuns,
            deviation (nbRuns, timeTotal2, timeTotal),
            cycleTotal/nbRuns,
            deviation (nbRuns, cycleTotal2, cycleTotal),
            nbRows * cycleTotal / timeTotal / 1000000.0,
            check/nbRuns
        );
    }
    
    
    ////////////////////////////////////////////////////////////////////////////////
    int main(int argc, char **argv)
    {
        // We set rows number as 2^n where n is the provided argument
        // For simplification, we assume that the rows number is a multiple of 32
        uint64_t nbRows = 1ULL << (argc>1 ? atoi(argv[1]) : 28);
        size_t   nbRuns = argc>2 ? atoi(argv[2]) : 10;
    
        // We build an bitmap of size nbRows*32
        uint64_t actualNbRows = nbRows + 100000;
        uint32_t* bitmap = (uint32_t*)_mm_malloc(sizeof(uint32_t)*actualNbRows, 256);
        if (bitmap==nullptr)
        {
            fprintf(stderr, "unable to allocate the bitmap\n");
            exit(1);
        }
        memset (bitmap, 0, sizeof(u_int32_t)*actualNbRows);
    
        // We fill the bitmap with random values
        //    srand(time(nullptr));
        for (uint64_t i=0; i

    The size of the accumulators is 2^DEPTH bits in this code. Note that this implementation is valid up to DEPTH=5. For DEPTH=4, here are the performance results compared to the implementation of Peter Cordes (named high entropy SWAR):

    The graph gives the number of cycles required to process a row (of 32 items) as a function of the number of rows of the matrix. As expected, the results are pretty similar since the main idea is the same. It is interesting to note the three parts of the graph:

    • constant value for log2(n)<=20
    • increasing value for log2(n) between 20 and 22
    • constant value for log2(n)>=22

    I guess that CPU cache properties can explain this behaviour.

提交回复
热议问题