Efficient Algorithm for Bit Reversal (from MSB->LSB to LSB->MSB) in C

后端 未结 26 1370
情深已故
情深已故 2020-11-22 06:08

What is the most efficient algorithm to achieve the following:

0010 0000 => 0000 0100

The conversion is from MSB->LSB to LSB->MSB. All bits

相关标签:
26条回答
  • 2020-11-22 06:48

    Bit reversal in pseudo code

    source -> byte to be reversed b00101100 destination -> reversed, also needs to be of unsigned type so sign bit is not propogated down

    copy into temp so original is unaffected, also needs to be of unsigned type so that sign bit is not shifted in automaticaly

    bytecopy = b0010110
    

    LOOP8: //do this 8 times test if bytecopy is < 0 (negative)

        set bit8 (msb) of reversed = reversed | b10000000 
    
    else do not set bit8
    
    shift bytecopy left 1 place
    bytecopy = bytecopy << 1 = b0101100 result
    
    shift result right 1 place
    reversed = reversed >> 1 = b00000000
    8 times no then up^ LOOP8
    8 times yes then done.
    
    0 讨论(0)
  • 2020-11-22 06:49

    Of course the obvious source of bit-twiddling hacks is here: http://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious

    0 讨论(0)
  • 2020-11-22 06:50

    This ain't no job for a human! ... but perfect for a machine

    This is 2015, 6 years from when this question was first asked. Compilers have since become our masters, and our job as humans is only to help them. So what's the best way to give our intentions to the machine?

    Bit-reversal is so common that you have to wonder why the x86's ever growing ISA doesn't include an instruction to do it one go.

    The reason: if you give your true concise intent to the compiler, bit reversal should only take ~20 CPU cycles. Let me show you how to craft reverse() and use it:

    #include <inttypes.h>
    #include <stdio.h>
    
    uint64_t reverse(const uint64_t n,
                     const uint64_t k)
    {
            uint64_t r, i;
            for (r = 0, i = 0; i < k; ++i)
                    r |= ((n >> i) & 1) << (k - i - 1);
            return r;
    }
    
    int main()
    {
            const uint64_t size = 64;
            uint64_t sum = 0;
            uint64_t a;
            for (a = 0; a < (uint64_t)1 << 30; ++a)
                    sum += reverse(a, size);
            printf("%" PRIu64 "\n", sum);
            return 0;
    }
    

    Compiling this sample program with Clang version >= 3.6, -O3, -march=native (tested with Haswell), gives artwork-quality code using the new AVX2 instructions, with a runtime of 11 seconds processing ~1 billion reverse()s. That's ~10 ns per reverse(), with .5 ns CPU cycle assuming 2 GHz puts us at the sweet 20 CPU cycles.

    • You can fit 10 reverse()s in the time it takes to access RAM once for a single large array!
    • You can fit 1 reverse() in the time it takes to access an L2 cache LUT twice.

    Caveat: this sample code should hold as a decent benchmark for a few years, but it will eventually start to show its age once compilers are smart enough to optimize main() to just printf the final result instead of really computing anything. But for now it works in showcasing reverse().

    0 讨论(0)
  • 2020-11-22 06:51

    Generic

    C code. Using 1 byte input data num as example.

        unsigned char num = 0xaa;   // 1010 1010 (aa) -> 0101 0101 (55)
        int s = sizeof(num) * 8;    // get number of bits
        int i, x, y, p;
        int var = 0;                // make var data type to be equal or larger than num
    
        for (i = 0; i < (s / 2); i++) {
            // extract bit on the left, from MSB
            p = s - i - 1;
            x = num & (1 << p);
            x = x >> p;
            printf("x: %d\n", x);
    
            // extract bit on the right, from LSB
            y = num & (1 << i);
            y = y >> i;
            printf("y: %d\n", y);
    
            var = var | (x << i);       // apply x
            var = var | (y << p);       // apply y
        }
    
        printf("new: 0x%x\n", new);
    
    0 讨论(0)
  • 2020-11-22 06:52
    // Purpose: to reverse bits in an unsigned short integer 
    // Input: an unsigned short integer whose bits are to be reversed
    // Output: an unsigned short integer with the reversed bits of the input one
    unsigned short ReverseBits( unsigned short a )
    {
         // declare and initialize number of bits in the unsigned short integer
         const char num_bits = sizeof(a) * CHAR_BIT;
    
         // declare and initialize bitset representation of integer a
         bitset<num_bits> bitset_a(a);          
    
         // declare and initialize bitset representation of integer b (0000000000000000)
         bitset<num_bits> bitset_b(0);                  
    
         // declare and initialize bitset representation of mask (0000000000000001)
         bitset<num_bits> mask(1);          
    
         for ( char i = 0; i < num_bits; ++i )
         {
              bitset_b = (bitset_b << 1) | bitset_a & mask;
              bitset_a >>= 1;
         }
    
         return (unsigned short) bitset_b.to_ulong();
    }
    
    void PrintBits( unsigned short a )
    {
         // declare and initialize bitset representation of a
         bitset<sizeof(a) * CHAR_BIT> bitset(a);
    
         // print out bits
         cout << bitset << endl;
    }
    
    
    // Testing the functionality of the code
    
    int main ()
    {
         unsigned short a = 17, b;
    
         cout << "Original: "; 
         PrintBits(a);
    
         b = ReverseBits( a );
    
         cout << "Reversed: ";
         PrintBits(b);
    }
    
    // Output:
    Original: 0000000000010001
    Reversed: 1000100000000000
    
    0 讨论(0)
  • 2020-11-22 06:53

    NOTE: All algorithms below are in C, but should be portable to your language of choice (just don't look at me when they're not as fast :)

    Options

    Low Memory (32-bit int, 32-bit machine)(from here):

    unsigned int
    reverse(register unsigned int x)
    {
        x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
        x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
        x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
        x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
        return((x >> 16) | (x << 16));
    
    }
    

    From the famous Bit Twiddling Hacks page:

    Fastest (lookup table):

    static const unsigned char BitReverseTable256[] = 
    {
      0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 
      0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 
      0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 
      0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 
      0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 
      0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
      0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 
      0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
      0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
      0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 
      0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
      0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
      0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 
      0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
      0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 
      0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
    };
    
    unsigned int v; // reverse 32-bit value, 8 bits at time
    unsigned int c; // c will get v reversed
    
    // Option 1:
    c = (BitReverseTable256[v & 0xff] << 24) | 
        (BitReverseTable256[(v >> 8) & 0xff] << 16) | 
        (BitReverseTable256[(v >> 16) & 0xff] << 8) |
        (BitReverseTable256[(v >> 24) & 0xff]);
    
    // Option 2:
    unsigned char * p = (unsigned char *) &v;
    unsigned char * q = (unsigned char *) &c;
    q[3] = BitReverseTable256[p[0]]; 
    q[2] = BitReverseTable256[p[1]]; 
    q[1] = BitReverseTable256[p[2]]; 
    q[0] = BitReverseTable256[p[3]];
    

    You can extend this idea to 64-bit ints, or trade off memory for speed (assuming your L1 Data Cache is large enough), and reverse 16 bits at a time with a 64K-entry lookup table.


    Others

    Simple

    unsigned int v;     // input bits to be reversed
    unsigned int r = v & 1; // r will be reversed bits of v; first get LSB of v
    int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end
    
    for (v >>= 1; v; v >>= 1)
    {   
      r <<= 1;
      r |= v & 1;
      s--;
    }
    r <<= s; // shift when v's highest bits are zero
    

    Faster (32-bit processor)

    unsigned char b = x;
    b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16; 
    

    Faster (64-bit processor)

    unsigned char b; // reverse this (8-bit) byte
    b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
    

    If you want to do this on a 32-bit int, just reverse the bits in each byte, and reverse the order of the bytes. That is:

    unsigned int toReverse;
    unsigned int reversed;
    unsigned char inByte0 = (toReverse & 0xFF);
    unsigned char inByte1 = (toReverse & 0xFF00) >> 8;
    unsigned char inByte2 = (toReverse & 0xFF0000) >> 16;
    unsigned char inByte3 = (toReverse & 0xFF000000) >> 24;
    reversed = (reverseBits(inByte0) << 24) | (reverseBits(inByte1) << 16) | (reverseBits(inByte2) << 8) | (reverseBits(inByte3);
    

    Results

    I benchmarked the two most promising solutions, the lookup table, and bitwise-AND (the first one). The test machine is a laptop w/ 4GB of DDR2-800 and a Core 2 Duo T7500 @ 2.4GHz, 4MB L2 Cache; YMMV. I used gcc 4.3.2 on 64-bit Linux. OpenMP (and the GCC bindings) were used for high-resolution timers.

    reverse.c

    #include <stdlib.h>
    #include <stdio.h>
    #include <omp.h>
    
    unsigned int
    reverse(register unsigned int x)
    {
        x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
        x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
        x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
        x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
        return((x >> 16) | (x << 16));
    
    }
    
    int main()
    {
        unsigned int *ints = malloc(100000000*sizeof(unsigned int));
        unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));
        for(unsigned int i = 0; i < 100000000; i++)
          ints[i] = rand();
    
        unsigned int *inptr = ints;
        unsigned int *outptr = ints2;
        unsigned int *endptr = ints + 100000000;
        // Starting the time measurement
        double start = omp_get_wtime();
        // Computations to be measured
        while(inptr != endptr)
        {
          (*outptr) = reverse(*inptr);
          inptr++;
          outptr++;
        }
        // Measuring the elapsed time
        double end = omp_get_wtime();
        // Time calculation (in seconds)
        printf("Time: %f seconds\n", end-start);
    
        free(ints);
        free(ints2);
    
        return 0;
    }
    

    reverse_lookup.c

    #include <stdlib.h>
    #include <stdio.h>
    #include <omp.h>
    
    static const unsigned char BitReverseTable256[] = 
    {
      0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 
      0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 
      0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 
      0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 
      0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 
      0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
      0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 
      0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
      0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
      0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 
      0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
      0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
      0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 
      0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
      0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 
      0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
    };
    
    int main()
    {
        unsigned int *ints = malloc(100000000*sizeof(unsigned int));
        unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));
        for(unsigned int i = 0; i < 100000000; i++)
          ints[i] = rand();
    
        unsigned int *inptr = ints;
        unsigned int *outptr = ints2;
        unsigned int *endptr = ints + 100000000;
        // Starting the time measurement
        double start = omp_get_wtime();
        // Computations to be measured
        while(inptr != endptr)
        {
        unsigned int in = *inptr;  
    
        // Option 1:
        //*outptr = (BitReverseTable256[in & 0xff] << 24) | 
        //    (BitReverseTable256[(in >> 8) & 0xff] << 16) | 
        //    (BitReverseTable256[(in >> 16) & 0xff] << 8) |
        //    (BitReverseTable256[(in >> 24) & 0xff]);
    
        // Option 2:
        unsigned char * p = (unsigned char *) &(*inptr);
        unsigned char * q = (unsigned char *) &(*outptr);
        q[3] = BitReverseTable256[p[0]]; 
        q[2] = BitReverseTable256[p[1]]; 
        q[1] = BitReverseTable256[p[2]]; 
        q[0] = BitReverseTable256[p[3]];
    
          inptr++;
          outptr++;
        }
        // Measuring the elapsed time
        double end = omp_get_wtime();
        // Time calculation (in seconds)
        printf("Time: %f seconds\n", end-start);
    
        free(ints);
        free(ints2);
    
        return 0;
    }
    

    I tried both approaches at several different optimizations, ran 3 trials at each level, and each trial reversed 100 million random unsigned ints. For the lookup table option, I tried both schemes (options 1 and 2) given on the bitwise hacks page. Results are shown below.

    Bitwise AND

    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse reverse.c
    mrj10@mjlap:~/code$ ./reverse
    Time: 2.000593 seconds
    mrj10@mjlap:~/code$ ./reverse
    Time: 1.938893 seconds
    mrj10@mjlap:~/code$ ./reverse
    Time: 1.936365 seconds
    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse reverse.c
    mrj10@mjlap:~/code$ ./reverse
    Time: 0.942709 seconds
    mrj10@mjlap:~/code$ ./reverse
    Time: 0.991104 seconds
    mrj10@mjlap:~/code$ ./reverse
    Time: 0.947203 seconds
    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse reverse.c
    mrj10@mjlap:~/code$ ./reverse
    Time: 0.922639 seconds
    mrj10@mjlap:~/code$ ./reverse
    Time: 0.892372 seconds
    mrj10@mjlap:~/code$ ./reverse
    Time: 0.891688 seconds
    

    Lookup Table (option 1)

    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.201127 seconds              
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.196129 seconds              
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.235972 seconds              
    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 0.633042 seconds              
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 0.655880 seconds              
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 0.633390 seconds              
    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 0.652322 seconds              
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 0.631739 seconds              
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 0.652431 seconds  
    

    Lookup Table (option 2)

    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.671537 seconds
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.688173 seconds
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.664662 seconds
    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.049851 seconds
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.048403 seconds
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.085086 seconds
    mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.082223 seconds
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.053431 seconds
    mrj10@mjlap:~/code$ ./reverse_lookup
    Time: 1.081224 seconds
    

    Conclusion

    Use the lookup table, with option 1 (byte addressing is unsurprisingly slow) if you're concerned about performance. If you need to squeeze every last byte of memory out of your system (and you might, if you care about the performance of bit reversal), the optimized versions of the bitwise-AND approach aren't too shabby either.

    Caveat

    Yes, I know the benchmark code is a complete hack. Suggestions on how to improve it are more than welcome. Things I know about:

    • I don't have access to ICC. This may be faster (please respond in a comment if you can test this out).
    • A 64K lookup table may do well on some modern microarchitectures with large L1D.
    • -mtune=native didn't work for -O2/-O3 (ld blew up with some crazy symbol redefinition error), so I don't believe the generated code is tuned for my microarchitecture.
    • There may be a way to do this slightly faster with SSE. I have no idea how, but with fast replication, packed bitwise AND, and swizzling instructions, there's got to be something there.
    • I know only enough x86 assembly to be dangerous; here's the code GCC generated on -O3 for option 1, so somebody more knowledgable than myself can check it out:

    32-bit

    .L3:
    movl    (%r12,%rsi), %ecx
    movzbl  %cl, %eax
    movzbl  BitReverseTable256(%rax), %edx
    movl    %ecx, %eax
    shrl    $24, %eax
    mov     %eax, %eax
    movzbl  BitReverseTable256(%rax), %eax
    sall    $24, %edx
    orl     %eax, %edx
    movzbl  %ch, %eax
    shrl    $16, %ecx
    movzbl  BitReverseTable256(%rax), %eax
    movzbl  %cl, %ecx
    sall    $16, %eax
    orl     %eax, %edx
    movzbl  BitReverseTable256(%rcx), %eax
    sall    $8, %eax
    orl     %eax, %edx
    movl    %edx, (%r13,%rsi)
    addq    $4, %rsi
    cmpq    $400000000, %rsi
    jne     .L3
    

    EDIT: I also tried using uint64_t types on my machine to see if there was any performance boost. Performance was about 10% faster than 32-bit, and was nearly identical whether you were just using 64-bit types to reverse bits on two 32-bit int types at a time, or whether you were actually reversing bits in half as many 64-bit values. The assembly code is shown below (for the former case, reversing bits for two 32-bit int types at a time):

    .L3:
    movq    (%r12,%rsi), %rdx
    movq    %rdx, %rax
    shrq    $24, %rax
    andl    $255, %eax
    movzbl  BitReverseTable256(%rax), %ecx
    movzbq  %dl,%rax
    movzbl  BitReverseTable256(%rax), %eax
    salq    $24, %rax
    orq     %rax, %rcx
    movq    %rdx, %rax
    shrq    $56, %rax
    movzbl  BitReverseTable256(%rax), %eax
    salq    $32, %rax
    orq     %rax, %rcx
    movzbl  %dh, %eax
    shrq    $16, %rdx
    movzbl  BitReverseTable256(%rax), %eax
    salq    $16, %rax
    orq     %rax, %rcx
    movzbq  %dl,%rax
    shrq    $16, %rdx
    movzbl  BitReverseTable256(%rax), %eax
    salq    $8, %rax
    orq     %rax, %rcx
    movzbq  %dl,%rax
    shrq    $8, %rdx
    movzbl  BitReverseTable256(%rax), %eax
    salq    $56, %rax
    orq     %rax, %rcx
    movzbq  %dl,%rax
    shrq    $8, %rdx
    movzbl  BitReverseTable256(%rax), %eax
    andl    $255, %edx
    salq    $48, %rax
    orq     %rax, %rcx
    movzbl  BitReverseTable256(%rdx), %eax
    salq    $40, %rax
    orq     %rax, %rcx
    movq    %rcx, (%r13,%rsi)
    addq    $8, %rsi
    cmpq    $400000000, %rsi
    jne     .L3
    
    0 讨论(0)
提交回复
热议问题