Matrix transpose and population count

前端未结

关注

 3  1464

旧巷少年郎 2021-01-22 18:03

I have a square boolean matrix M of size N, stored by rows, and I want to count the number of bits set to 1 in each column.

For instance for n=4:

1101
01


      
      
        
          3条回答        

        
                    
            
            
                         
                
              
              
                
                   一整个雨季
                                             
                
                
                (楼主)
            
              
              
                2021-01-22 18:42
              

            
            
                        
I eventually wrote another implementation, following the high entropy SWAR approach proposed by Peter Cordes. This implementation is recursive and relies on C++ template specialization. 

The global idea is to fill N-bit accumulators to their maximum without carry overflow (this is where recursion is used). When these accumulators are filled, we update the grand totals and we start again with new N-bit accumulators to fill until all rows have been processed.

Here is the code (see function test_SWAR_recursive):

#include 
#include 
#include 
#include 
#include 
#include 
#include 

using namespace std;
using namespace std::chrono;

// Forward declaration with C linkage; no definition is visible in this listing.
// NOTE(review): presumably returns the CPU time-stamp counter and is declared by
// hand to avoid including a header (the header name is missing from the original
// comment) — confirm where it is defined.
extern "C" u_int64_t ReadTSC();

/**
 * Standard deviation of a sample set, computed from its running sums.
 *
 * @param n     number of samples
 * @param sum2  sum of the squared samples
 * @param sum   sum of the samples
 * @return sqrt(sum2/n - (sum/n)^2), or 0 when there are no samples
 */
static double deviation (double n, double sum2, double sum)
{
    // Without samples the quotient below would be 0/0
    if (n <= 0)  { return 0.0; }

    const double mean     = sum / n;
    const double variance = sum2/n - mean*mean;

    // When all samples are (nearly) equal, rounding can push the difference
    // slightly below zero; sqrt of a negative value would yield NaN.
    // (A NaN variance fails the comparison and still propagates through sqrt.)
    return variance < 0.0 ? 0.0 : sqrt (variance);
}

////////////////////////////////////////////////////////////////////////////////
// Recursive SWAR approach (with template specialization)
////////////////////////////////////////////////////////////////////////////////

template
struct RecursiveSWAR
{
    // Number of accumulators for current depth
    static const int N = 1<::MAGIC_NUMBER
            * (1 + (1<<(1<<(DEPTH-1))))
            / (1 + (1<<(1<<(DEPTH+0))));

    static void fillAccumulators (u_int32_t*& begin, const u_int32_t* end, Array accumulators)
    {
        // We reset the N-bit accumulators
        for (int i=0; i=3)  if (begin>=end)  { return; }

        typename RecursiveSWAR::Array accumulatorsMinusOne;

        // We load a register with the mask
        __m256i mask = _mm256_set1_epi32 (RecursiveSWAR::MAGIC_NUMBER);

        // We fill the N-bit accumulators to their maximum capacity without carry overflow
        for (int i=0; i::fillAccumulators (begin, end, accumulatorsMinusOne);

            // We update the N-bit accumulators from the (N-1)-bit accumulators
            for (int j=0; j::N; j++)
            {
                // LOW part
                accumulators[2*j+0] = _mm256_add_epi32 (
                    accumulators[2*j+0],
                    _mm256_and_si256 (
                        accumulatorsMinusOne[j],
                        mask
                    )
                );

                // HIGH part
                accumulators[2*j+1] = _mm256_add_epi32 (
                    accumulators[2*j+1],
                    _mm256_and_si256 (
                        _mm256_srli_epi32 (
                            accumulatorsMinusOne[j],
                            RecursiveSWAR::N
                        ),
                        mask
                    )
                );
            }
        }
    }
};

// Template specialization for DEPTH=0
template<>
struct RecursiveSWAR<0>
{
    static const int N = 1;

    typedef __m256i Array[N];

    static const u_int32_t MAGIC_NUMBER = 0x55555555;

    static void fillAccumulators (u_int32_t*& begin, const u_int32_t* end, Array result)
    {
        // We just load 8 rows in the AVX2 register
        result[0] = _mm256_loadu_si256 ((__m256i*)begin);

        // We update the iterator
        begin += 1*sizeof(__m256i)/sizeof(u_int32_t);
    }
};

// Maps a depth to the unsigned integer type that is 2^DEPTH bits wide
// (8 bits for depth 3, 16 for depth 4, 32 for depth 5).
// Other depths have no Type member, so using them fails to compile.
template<int DEPTH> struct TypeInfo  { };
template<> struct TypeInfo<3>  {  typedef uint8_t  Type; };
template<> struct TypeInfo<4>  {  typedef uint16_t Type; };
template<> struct TypeInfo<5>  {  typedef uint32_t Type; };

// Returns b with the order of its 8 bits reversed (bit 0 <-> bit 7, bit 1 <-> bit 6, ...)
unsigned char reversebits (unsigned char b)
{
    unsigned int v = b;

    // Swap the two nibbles, then the bit pairs inside each nibble, then adjacent bits
    v = ((v & 0xF0u) >> 4) | ((v & 0x0Fu) << 4);
    v = ((v & 0xCCu) >> 2) | ((v & 0x33u) << 2);
    v = ((v & 0xAAu) >> 1) | ((v & 0x55u) << 1);

    return static_cast<unsigned char> (v);
}

void test_SWAR_recursive (uint64_t nbRows, const uint32_t* bitmap, uint32_t*  globalSums)
{
    static const int DEPTH = 4;

    RecursiveSWAR::Array accumulators;

          uint32_t* begin = (uint32_t*) bitmap;
    const uint32_t* end   = bitmap + nbRows;

    // We reset the grand totals
    for (int i=0; i<32; i++)  { globalSums[i] = 0; }

    while (begin < end)
    {
        // We fill the N-bit accumulators to the maximum without overflow
        RecursiveSWAR::fillAccumulators (begin, end, accumulators);

        // We update grand totals from the filled N-bit accumulators
        for (int i=0; i::N; i++)
        {
            int r = reversebits(i) >> (8-DEPTH);
            u_int32_t* sums   = globalSums+r;
            TypeInfo::Type*  values = (TypeInfo::Type*) (accumulators+i);

            for (int j=0; j<8*(1<<(5-DEPTH)); j++)
            {
                sums[(j*RecursiveSWAR::N) % 32] += values[j];
            }
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
void execute (
    const char* name,
    void (*fct)(uint64_t nbRows, const uint32_t* bitmap, uint32_t*  globalSums),
    size_t nbRuns,
    uint64_t nbRows,
    u_int32_t* bitmap
)
{
    uint32_t  sums[32];

    double timeTotal=0;
    double cycleTotal=0;
    double timeTotal2=0;
    double cycleTotal2=0;
    uint64_t check=0;

    for (size_t n=0; n(system_clock::now().time_since_epoch());
        uint64_t c0 = ReadTSC();

        // We run the test
        (*fct) (nbRows, bitmap, sums);

        uint64_t c1 = ReadTSC();
        milliseconds t1 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());

        timeTotal  += (t1-t0).count();
        cycleTotal += (double)(c1-c0) / nbRows;

        timeTotal2  += (t1-t0).count() * (t1-t0).count();
        cycleTotal2 += ((double)(c1-c0) / nbRows) * ((double)(c1-c0) / nbRows);

        // We compute some dummy checksum
        for (size_t k=0; k<32; k++)  {  check += (k+1)*sums[k];  }
    }

    printf ("%-21s |  %5.0lf (%5.1lf)            |  %5.2lf (%5.3lf)         |  %.3lf           |  0x%lx\n",
        name,
        timeTotal / nbRuns,
        deviation (nbRuns, timeTotal2, timeTotal),
        cycleTotal/nbRuns,
        deviation (nbRuns, cycleTotal2, cycleTotal),
        nbRows * cycleTotal / timeTotal / 1000000.0,
        check/nbRuns
    );
}


////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // We set rows number as 2^n where n is the provided argument
    // For simplification, we assume that the rows number is a multiple of 32
    uint64_t nbRows = 1ULL << (argc>1 ? atoi(argv[1]) : 28);
    size_t   nbRuns = argc>2 ? atoi(argv[2]) : 10;

    // We build an bitmap of size nbRows*32
    uint64_t actualNbRows = nbRows + 100000;
    uint32_t* bitmap = (uint32_t*)_mm_malloc(sizeof(uint32_t)*actualNbRows, 256);
    if (bitmap==nullptr)
    {
        fprintf(stderr, "unable to allocate the bitmap\n");
        exit(1);
    }
    memset (bitmap, 0, sizeof(u_int32_t)*actualNbRows);

    // We fill the bitmap with random values
    //    srand(time(nullptr));
    for (uint64_t i=0; i


The size of the accumulators is 2^DEPTH bits in this code. Note that this implementation is valid up to DEPTH=5. For DEPTH=4, here are the performance results compared to the implementation of Peter Cordes (named high entropy SWAR):



The graph gives the number of cycles required to process a row (of 32 items) as a function of the number of rows of the matrix. As expected, the results are pretty similar since the main idea is the same. It is interesting to note the three parts of the graph:


constant value for log2(n)<=20
increasing value for log2(n) between 20 and 22
constant value for log2(n)>=22


I guess that the properties of the CPU caches can explain this behaviour.
    
             
                                                        
            

            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它3个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          

                              			
        

        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复