Is there a more efficient way of expanding a char to an uint64_t?

前端 未结 8 627
星月不相逢
星月不相逢 2021-01-18 06:32

I want to inflate an unsigned char to a uint64_t by repeating each bit 8 times. E.g.

char -> uint64_t
0x00 -> 0x00
0x01 -> 0xFF
0x02 -> 0xFF00
0xAA -> 0xFF00FF00FF00FF00


        
8条回答
  •  感情败类
    2021-01-18 07:06

    The desired functionality can be achieved by moving each bit of the source into the lsb of the appropriate target byte (0 → 0, 1 → 8, 2 → 16, ...., 7 → 56), then expanding each lsb to cover the whole byte, which is easily done by multiplying with 0xff (255). Instead of moving bits into place individually using shifts, then combining the results, we can use an integer multiply to shift multiple bits in parallel. To prevent self-overlap, we can move only the least-significant seven source bits in this fashion, but need to move the source msb separately with a shift.

    This leads to the following ISO-C99 implementation:

    #include <stdint.h>
    
    /*
     * Replicate every bit of the input byte eight times, so that source bit i
     * becomes byte i of the result (0x00 or 0xff).
     */
    uint64_t fast_inflate (uint8_t a)
    {
        /* A 1 at bits 0, 7, 14, ..., 42: multiplying by this moves source
           bit i (i = 0..6) onto bit 8*i, the lsb of byte i. */
        const uint64_t every_7th_bit = 0x0000040810204081ULL;
        /* A 1 at the lsb of each of the eight bytes. */
        const uint64_t lsb_of_each_byte = 0x0101010101010101ULL;

        /* Low seven source bits, spread in parallel by one multiply. The
           partial products occupy distinct bits 0..48, so nothing collides. */
        const uint64_t low7_spread = (uint64_t)(a & 0x7f) * every_7th_bit;
        /* Source bit 7 goes to bit 56; the other shifted bits land on
           49..55 and are discarded by the mask below. */
        const uint64_t msb_moved = (uint64_t)a << 49;

        /* Keep only the lsb of every byte ... */
        const uint64_t lsbs = (low7_spread + msb_moved) & lsb_of_each_byte;

        /* ... and stretch each lsb to fill its whole byte. */
        return lsbs * 0xff;
    }
    
    /* Non-zero when bit `pos` of `var` is set (yields the masked bit, not 0/1). */
    #define BIT_SET(var, pos) ((var) & (1 << (pos)))

    /*
     * Reference implementation: for every set bit i of `a`, set all eight
     * bits of byte i in the result.
     */
    static uint64_t inflate(unsigned char a)
    {
        uint64_t expanded = 0;
        uint64_t byte_mask = 0xFF;   /* covers byte `bit`; slides up 8 bits per step */

        for (int bit = 0; bit < 8; bit++, byte_mask <<= 8) {
            if (BIT_SET(a, bit)) {
                expanded |= byte_mask;
            }
        }
        return expanded;
    }
    
    #include <stdio.h>
    #include <stdlib.h>
    
    /*
     * Exhaustively compare fast_inflate() against the reference inflate()
     * for all 256 possible input bytes.
     *
     * Returns EXIT_SUCCESS when every input agrees; prints the first
     * mismatch and returns EXIT_FAILURE otherwise.
     */
    int main (void)
    {
        uint8_t a = 0;
        do {
            uint64_t res = fast_inflate (a);
            uint64_t ref = inflate (a);
            if (res != ref) {
                /* uint64_t is not necessarily unsigned long long (it is
                   unsigned long on LP64 targets), and a uint8_t argument is
                   promoted to int; cast so each argument matches its
                   conversion specifier exactly. */
                printf ("error @ %02x: fast_inflate = %016llx  inflate = %016llx\n", 
                        (unsigned int)a,
                        (unsigned long long)res,
                        (unsigned long long)ref);
                return EXIT_FAILURE;
            }
            a++;
        } while (a);   /* a wraps from 0xff back to 0 after the 256th input */
        printf ("test passed\n");
        return EXIT_SUCCESS;
    }
    

    Most x64 compilers will compile fast_inflate() in a straightforward manner. For example, my Intel compiler Version 13.1.3.198, when building with /Ox, generates the 11-instruction sequence below. Note that the final multiply with 0xff is actually implemented as a shift and subtract sequence.

    ; Compiler output for the C fast_inflate(): the input byte is read from cl
    ; and the 64-bit result is left in rax.
    fast_inflate    PROC 
            mov       rdx, 040810204081H        ; rdx = spread7 (a 1 at bits 0, 7, ..., 42)
            movzx     r9d, cl                   ; r9  = a, zero-extended
            and       ecx, 127                  ; ecx = a & 0x7f (low seven source bits)
            mov       r8, 0101010101010101H     ; r8  = byte_lsb mask
            imul      rdx, rcx                  ; rdx = (a & 0x7f) * spread7
            shl       r9, 49                    ; r9  = a << 49 (source msb -> bit 56)
            add       r9, rdx                   ; combine both partial results
            and       r9, r8                    ; keep only the lsb of each byte
            mov       rax, r9
            shl       rax, 8                    ; rax = r << 8
            sub       rax, r9                   ; rax = (r << 8) - r, i.e. r * 0xff
            ret
    

提交回复
热议问题