Is there a more efficient way of expanding a char to an uint64_t?

前端 未结 8 626
星月不相逢
星月不相逢 2021-01-18 06:32

I want to inflate an unsigned char to an uint64_t by repeating each bit 8 times. E.g.

char -> uint64_t
0x00 -> 0x00
0x01 ->         


        
8条回答
  •  天涯浪人
    2021-01-18 07:00

    Here is one more method using only simple arithmetics:

    uint64_t inflate_chqrlie(uint8_t value) {
        uint64_t x = value;
        x = (x | (x << 28));
        x = (x | (x << 14));
        x = (x | (x <<  7)) & 0x0101010101010101ULL;
        x = (x << 8) - x;
        return x;
    }
    

    Another very efficient and concise one by phuclv using multiplication and mask:

    static uint64_t inflate_phuclv(uint8_t b) {
        uint64_t MAGIC = 0x8040201008040201ULL;
        uint64_t MASK  = 0x8080808080808080ULL;
        return ((MAGIC * b) & MASK) >> 7;
    }
    

    And another with a small lookup table:

    static uint32_t const lut_4_32[16] = {
        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 
        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 
        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 
        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, 
    };
    
    static uint64_t inflate_lut32(uint8_t b) {
        return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
    }
    

    I wrote a benchmarking program to determine relative performance of the different approaches on my system (x86_64-apple-darwin16.7.0, Apple LLVM version 9.0.0 (clang-900.0.39.2, clang -O3).

    The results show that my function inflate_chqrlie is faster than naive approaches but slower than other elaborate versions, all of which are beaten hands down by inflate_lut64 using a 2KB the lookup table in cache optimal situations.

    The function inflate_lut32, using a much smaller lookup table (64 bytes instead of 2KB) is not as fast as inflate_lut64, but seems a good compromise for 32-bit architectures as it is still much faster than all other alternatives.

    64-bit benchmark:

                 inflate: 0, 848.316ms
            inflate_Curd: 0, 845.424ms
         inflate_chqrlie: 0, 371.502ms
     fast_inflate_njuffa: 0, 288.669ms
       inflate_parallel1: 0, 242.827ms
       inflate_parallel2: 0, 315.105ms
       inflate_parallel3: 0, 363.379ms
       inflate_parallel4: 0, 304.051ms
       inflate_parallel5: 0, 301.205ms
          inflate_phuclv: 0, 109.130ms
           inflate_lut32: 0, 197.178ms
           inflate_lut64: 0, 25.160ms
    

    32-bit benchmark:

                 inflate: 0, 1451.464ms
            inflate_Curd: 0, 955.509ms
         inflate_chqrlie: 0, 385.036ms
     fast_inflate_njuffa: 0, 463.212ms
       inflate_parallel1: 0, 468.070ms
       inflate_parallel2: 0, 570.107ms
       inflate_parallel3: 0, 511.741ms
       inflate_parallel4: 0, 601.892ms
       inflate_parallel5: 0, 506.695ms
          inflate_phuclv: 0, 192.431ms
           inflate_lut32: 0, 140.968ms
           inflate_lut64: 0, 28.776ms
    

    Here is the code:

    #include 
    #include 
    #include 
    
    static uint64_t inflate(unsigned char a) {
    #define BIT_SET(var, pos) ((var) & (1 << (pos)))
        uint64_t MASK = 0xFF;
        uint64_t result = 0;
        for (int i = 0; i < 8; i++) {
            if (BIT_SET(a, i))
                result |= (MASK << (8 * i));
        }
    
        return result;
    }
    
    static uint64_t inflate_Curd(unsigned char a) {
        uint64_t mask = 0xFF;
        uint64_t result = 0;
        for (int i = 0; i < 8; i++) {
            if (a & 1)
                result |= mask;
            mask <<= 8;
            a >>= 1;
        }
        return result;
    }
    
    uint64_t inflate_chqrlie(uint8_t value) {
        uint64_t x = value;
        x = (x | (x << 28));
        x = (x | (x << 14));
        x = (x | (x <<  7)) & 0x0101010101010101ULL;
        x = (x << 8) - x;
        return x;
    }
    
    uint64_t fast_inflate_njuffa(uint8_t a) {
        const uint64_t spread7 = (1ULL << 42) | (1ULL << 35) | (1ULL << 28) | (1ULL << 21) |
            (1ULL << 14) | (1ULL <<  7) | (1UL <<   0);
        const uint64_t byte_lsb = (1ULL << 56) | (1ULL << 48) | (1ULL << 40) | (1ULL << 32) |
            (1ULL << 24) | (1ULL << 16) | (1ULL <<  8) | (1ULL <<  0);
        uint64_t r;
        /* spread bits to lsbs of each byte */
        r = (((uint64_t)(a & 0x7f) * spread7) + ((uint64_t)a << 49));
        /* extract the lsbs of all bytes */
        r = r & byte_lsb;
        /* fill each byte with its lsb */
        r = r * 0xff;
        return r;
    }
    
    // Aki Suuihkonen: 1.265
    static uint64_t inflate_parallel1(unsigned char a) {
        uint64_t vector = a * 0x0101010101010101ULL;
        // replicate the word all over qword
        // A5 becomes A5 A5 A5 A5 A5 A5 A5 A5
        vector &= 0x8040201008040201;  // becomes 80 00 20 00 00 04 00 01 <--
        vector += 0x00406070787c7e7f;  // becomes 80 40 80 70 78 80 7e 80
        // MSB is correct
        vector = (vector >> 7) & 0x0101010101010101ULL;  // LSB is correct
        return vector * 255;                             // all bits correct
    }
    
    // By seizet and then combine: 1.583
    static uint64_t inflate_parallel2(unsigned char a) {
        uint64_t vector1 = a * 0x0002000800200080ULL;
        uint64_t vector2 = a * 0x0000040010004001ULL;
        uint64_t vector = (vector1 & 0x0100010001000100ULL) | (vector2 & 0x0001000100010001ULL);
        return vector * 255;
    }
    
    // Stay in 32 bits as much as possible: 1.006
    static uint64_t inflate_parallel3(unsigned char a) {
        uint32_t vector1 = (( (a & 0x0F)       * 0x00204081) & 0x01010101) * 255;
        uint32_t vector2 = ((((a & 0xF0) >> 4) * 0x00204081) & 0x01010101) * 255;
        return (((uint64_t)vector2) << 32) | vector1;
    }
    
    // Do the common computation in 64 bits: 0.915
    static uint64_t inflate_parallel4(unsigned char a) {
        uint32_t vector1 =  (a & 0x0F)       * 0x00204081;
        uint32_t vector2 = ((a & 0xF0) >> 4) * 0x00204081;
        uint64_t vector = (vector1 | (((uint64_t)vector2) << 32)) & 0x0101010101010101ULL;
        return vector * 255;
    }
    
    // Some computation is done in 64 bits a little sooner: 0.806
    static uint64_t inflate_parallel5(unsigned char a) {
        uint32_t vector1 = (a & 0x0F) * 0x00204081;
        uint64_t vector2 = (a & 0xF0) * 0x002040810000000ULL;
        uint64_t vector = (vector1 | vector2) & 0x0101010101010101ULL;
        return vector * 255;
    }
    
    static uint64_t inflate_phuclv(uint8_t b) {
        uint64_t MAGIC = 0x8040201008040201ULL;
        uint64_t MASK  = 0x8080808080808080ULL;
        return ((MAGIC * b) & MASK) >> 7;
    }
    
    static uint32_t const lut_4_32[16] = {
        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 
        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 
        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 
        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, 
    };
    
    static uint64_t inflate_lut32(uint8_t b) {
        return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
    }
    
    static uint64_t lut_8_64[256];
    
    static uint64_t inflate_lut64(uint8_t b) {
        return lut_8_64[b];
    }
    
    #define ITER  1000000
    
    int main() {
        clock_t t;
        uint64_t x;
    
        for (int b = 0; b < 256; b++)
            lut_8_64[b] = inflate((uint8_t)b);
    
    #define TEST(func)  do {                                \
            t = clock();                                    \
            x = 0;                                          \
            for (int i = 0; i < ITER; i++) {                \
                for (int b = 0; b < 256; b++)               \
                    x ^= func((uint8_t)b);                  \
            }                                               \
            t = clock() - t;                                \
            printf("%20s: %llu, %.3fms\n",                  \
                   #func, x, t * 1000.0 / CLOCKS_PER_SEC);  \
           } while (0)
    
        TEST(inflate);
        TEST(inflate_Curd);
        TEST(inflate_chqrlie);
        TEST(fast_inflate_njuffa);
        TEST(inflate_parallel1);
        TEST(inflate_parallel2);
        TEST(inflate_parallel3);
        TEST(inflate_parallel4);
        TEST(inflate_parallel5);
        TEST(inflate_phuclv);
        TEST(inflate_lut32);
        TEST(inflate_lut64);
    
        return 0;
    }
    

提交回复
热议问题