Is there a more efficient way of expanding a char to an uint64_t?

前端未结

关注

 8  626

星月不相逢 2021-01-18 06:32

I want to inflate an unsigned char to an uint64_t by repeating each bit 8 times. E.g.

char -> uint64_t
0x00 -> 0x00
0x01 ->


      
      
        
          8条回答        

        
                    
            
            
                         
                
              
              
                
                   天涯浪人
                                             
                
                
                (楼主)
            
              
              
                2021-01-18 07:00
              

            
            
                        
Here is one more method using only simple arithmetics:

uint64_t inflate_chqrlie(uint8_t value) {
    uint64_t x = value;
    x = (x | (x << 28));
    x = (x | (x << 14));
    x = (x | (x <<  7)) & 0x0101010101010101ULL;
    x = (x << 8) - x;
    return x;
}


Another very efficient and concise one by phuclv using multiplication and mask:

static uint64_t inflate_phuclv(uint8_t b) {
    uint64_t MAGIC = 0x8040201008040201ULL;
    uint64_t MASK  = 0x8080808080808080ULL;
    return ((MAGIC * b) & MASK) >> 7;
}


And another with a small lookup table:

static uint32_t const lut_4_32[16] = {
    0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 
    0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 
    0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 
    0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, 
};

static uint64_t inflate_lut32(uint8_t b) {
    return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
}


I wrote a benchmarking program to determine relative performance of the different approaches on my system (x86_64-apple-darwin16.7.0, Apple LLVM version 9.0.0 (clang-900.0.39.2, clang -O3).

The results show that my function inflate_chqrlie is faster than naive approaches but slower than other elaborate versions, all of which are beaten hands down by inflate_lut64 using a 2KB the lookup table in cache optimal situations.

The function inflate_lut32, using a much smaller lookup table (64 bytes instead of 2KB) is not as fast as inflate_lut64, but seems a good compromise for 32-bit architectures as it is still much faster than all other alternatives.

64-bit benchmark:

             inflate: 0, 848.316ms
        inflate_Curd: 0, 845.424ms
     inflate_chqrlie: 0, 371.502ms
 fast_inflate_njuffa: 0, 288.669ms
   inflate_parallel1: 0, 242.827ms
   inflate_parallel2: 0, 315.105ms
   inflate_parallel3: 0, 363.379ms
   inflate_parallel4: 0, 304.051ms
   inflate_parallel5: 0, 301.205ms
      inflate_phuclv: 0, 109.130ms
       inflate_lut32: 0, 197.178ms
       inflate_lut64: 0, 25.160ms


32-bit benchmark:

             inflate: 0, 1451.464ms
        inflate_Curd: 0, 955.509ms
     inflate_chqrlie: 0, 385.036ms
 fast_inflate_njuffa: 0, 463.212ms
   inflate_parallel1: 0, 468.070ms
   inflate_parallel2: 0, 570.107ms
   inflate_parallel3: 0, 511.741ms
   inflate_parallel4: 0, 601.892ms
   inflate_parallel5: 0, 506.695ms
      inflate_phuclv: 0, 192.431ms
       inflate_lut32: 0, 140.968ms
       inflate_lut64: 0, 28.776ms


Here is the code:

#include 
#include 
#include 

static uint64_t inflate(unsigned char a) {
#define BIT_SET(var, pos) ((var) & (1 << (pos)))
    uint64_t MASK = 0xFF;
    uint64_t result = 0;
    for (int i = 0; i < 8; i++) {
        if (BIT_SET(a, i))
            result |= (MASK << (8 * i));
    }

    return result;
}

static uint64_t inflate_Curd(unsigned char a) {
    uint64_t mask = 0xFF;
    uint64_t result = 0;
    for (int i = 0; i < 8; i++) {
        if (a & 1)
            result |= mask;
        mask <<= 8;
        a >>= 1;
    }
    return result;
}

uint64_t inflate_chqrlie(uint8_t value) {
    uint64_t x = value;
    x = (x | (x << 28));
    x = (x | (x << 14));
    x = (x | (x <<  7)) & 0x0101010101010101ULL;
    x = (x << 8) - x;
    return x;
}

uint64_t fast_inflate_njuffa(uint8_t a) {
    const uint64_t spread7 = (1ULL << 42) | (1ULL << 35) | (1ULL << 28) | (1ULL << 21) |
        (1ULL << 14) | (1ULL <<  7) | (1UL <<   0);
    const uint64_t byte_lsb = (1ULL << 56) | (1ULL << 48) | (1ULL << 40) | (1ULL << 32) |
        (1ULL << 24) | (1ULL << 16) | (1ULL <<  8) | (1ULL <<  0);
    uint64_t r;
    /* spread bits to lsbs of each byte */
    r = (((uint64_t)(a & 0x7f) * spread7) + ((uint64_t)a << 49));
    /* extract the lsbs of all bytes */
    r = r & byte_lsb;
    /* fill each byte with its lsb */
    r = r * 0xff;
    return r;
}

// Aki Suuihkonen: 1.265
static uint64_t inflate_parallel1(unsigned char a) {
    uint64_t vector = a * 0x0101010101010101ULL;
    // replicate the word all over qword
    // A5 becomes A5 A5 A5 A5 A5 A5 A5 A5
    vector &= 0x8040201008040201;  // becomes 80 00 20 00 00 04 00 01 <--
    vector += 0x00406070787c7e7f;  // becomes 80 40 80 70 78 80 7e 80
    // MSB is correct
    vector = (vector >> 7) & 0x0101010101010101ULL;  // LSB is correct
    return vector * 255;                             // all bits correct
}

// By seizet and then combine: 1.583
static uint64_t inflate_parallel2(unsigned char a) {
    uint64_t vector1 = a * 0x0002000800200080ULL;
    uint64_t vector2 = a * 0x0000040010004001ULL;
    uint64_t vector = (vector1 & 0x0100010001000100ULL) | (vector2 & 0x0001000100010001ULL);
    return vector * 255;
}

// Stay in 32 bits as much as possible: 1.006
static uint64_t inflate_parallel3(unsigned char a) {
    uint32_t vector1 = (( (a & 0x0F)       * 0x00204081) & 0x01010101) * 255;
    uint32_t vector2 = ((((a & 0xF0) >> 4) * 0x00204081) & 0x01010101) * 255;
    return (((uint64_t)vector2) << 32) | vector1;
}

// Do the common computation in 64 bits: 0.915
static uint64_t inflate_parallel4(unsigned char a) {
    uint32_t vector1 =  (a & 0x0F)       * 0x00204081;
    uint32_t vector2 = ((a & 0xF0) >> 4) * 0x00204081;
    uint64_t vector = (vector1 | (((uint64_t)vector2) << 32)) & 0x0101010101010101ULL;
    return vector * 255;
}

// Some computation is done in 64 bits a little sooner: 0.806
static uint64_t inflate_parallel5(unsigned char a) {
    uint32_t vector1 = (a & 0x0F) * 0x00204081;
    uint64_t vector2 = (a & 0xF0) * 0x002040810000000ULL;
    uint64_t vector = (vector1 | vector2) & 0x0101010101010101ULL;
    return vector * 255;
}

static uint64_t inflate_phuclv(uint8_t b) {
    uint64_t MAGIC = 0x8040201008040201ULL;
    uint64_t MASK  = 0x8080808080808080ULL;
    return ((MAGIC * b) & MASK) >> 7;
}

static uint32_t const lut_4_32[16] = {
    0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 
    0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 
    0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 
    0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, 
};

static uint64_t inflate_lut32(uint8_t b) {
    return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
}

static uint64_t lut_8_64[256];

static uint64_t inflate_lut64(uint8_t b) {
    return lut_8_64[b];
}

#define ITER  1000000

int main() {
    clock_t t;
    uint64_t x;

    for (int b = 0; b < 256; b++)
        lut_8_64[b] = inflate((uint8_t)b);

#define TEST(func)  do {                                \
        t = clock();                                    \
        x = 0;                                          \
        for (int i = 0; i < ITER; i++) {                \
            for (int b = 0; b < 256; b++)               \
                x ^= func((uint8_t)b);                  \
        }                                               \
        t = clock() - t;                                \
        printf("%20s: %llu, %.3fms\n",                  \
               #func, x, t * 1000.0 / CLOCKS_PER_SEC);  \
       } while (0)

    TEST(inflate);
    TEST(inflate_Curd);
    TEST(inflate_chqrlie);
    TEST(fast_inflate_njuffa);
    TEST(inflate_parallel1);
    TEST(inflate_parallel2);
    TEST(inflate_parallel3);
    TEST(inflate_parallel4);
    TEST(inflate_parallel5);
    TEST(inflate_phuclv);
    TEST(inflate_lut32);
    TEST(inflate_lut64);

    return 0;
}

    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它8个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复