How to efficiently find the n-th set bit?

Submitted by ぐ巨炮叔叔 on 2020-01-12 14:09:15

Question


For code related to this question, I need to compute the following as fast as possible:

Given a 32-bit integer i, compute the position of the n-th least significant set bit. Both n and the result should be 0-indexed.

For example, given the number i = 11010110101₂ and n = 4, the desired result is 7, as the fourth set bit is at position 7: 110[1]0110101.

Using the pdep instruction from the BMI2 instruction set extension for x86 and the commonly available __builtin_ctz() intrinsic function, this can be computed easily:

#include <immintrin.h>  /* for _pdep_u32(); requires -mbmi2 */

j = _pdep_u32(1u << n, i);  /* scatter bit n of the source into the n-th set-bit position of i */
return (__builtin_ctz(j));

However, many computers do not have the pdep instruction (and it's slow on AMD CPUs that do have it), rendering this approach non-portable. You can also compute such bit positions without pdep like this:

j = i;
for (k = 0; k < n; k++)
    j &= j - 1;  /* clear the lowest set bit */

return (__builtin_ctz(j));

However, this is pretty slow.

I am targeting computers that provide at least __builtin_popcount() and __builtin_ctz(). How can I find such bit positions faster?


Answer 1:


The version from Bit Twiddling Hacks adapted to this case is, for example:

unsigned int nth_bit_set(uint32_t value, unsigned int n)
{
    const uint32_t  pop2  = (value & 0x55555555u) + ((value >> 1) & 0x55555555u);
    const uint32_t  pop4  = (pop2  & 0x33333333u) + ((pop2  >> 2) & 0x33333333u);
    const uint32_t  pop8  = (pop4  & 0x0f0f0f0fu) + ((pop4  >> 4) & 0x0f0f0f0fu);
    const uint32_t  pop16 = (pop8  & 0x00ff00ffu) + ((pop8  >> 8) & 0x00ff00ffu);
    const uint32_t  pop32 = (pop16 & 0x000000ffu) + ((pop16 >>16) & 0x000000ffu);
    unsigned int    rank  = 0;
    unsigned int    temp;

    if (n++ >= pop32)
        return 32;

    temp = pop16 & 0xffu;
    /* if (n > temp) { n -= temp; rank += 16; } */
    rank += ((temp - n) & 256) >> 4;
    n -= temp & ((temp - n) >> 8);

    temp = (pop8 >> rank) & 0xffu;
    /* if (n > temp) { n -= temp; rank += 8; } */
    rank += ((temp - n) & 256) >> 5;
    n -= temp & ((temp - n) >> 8);

    temp = (pop4 >> rank) & 0x0fu;
    /* if (n > temp) { n -= temp; rank += 4; } */
    rank += ((temp - n) & 256) >> 6;
    n -= temp & ((temp - n) >> 8);

    temp = (pop2 >> rank) & 0x03u;
    /* if (n > temp) { n -= temp; rank += 2; } */
    rank += ((temp - n) & 256) >> 7;
    n -= temp & ((temp - n) >> 8);

    temp = (value >> rank) & 0x01u;
    /* if (n > temp) rank += 1; */
    rank += ((temp - n) & 256) >> 8;

    return rank;
}

which, when compiled in a separate compilation unit with gcc-5.4.0 using -Wall -O3 -march=native -mtune=native on an Intel Core i5-4200U, yields

00400a40 <nth_bit_set>:
  400a40: 89 f9                   mov    %edi,%ecx
  400a42: 89 f8                   mov    %edi,%eax
  400a44: 55                      push   %rbp
  400a45: 40 0f b6 f6             movzbl %sil,%esi
  400a49: d1 e9                   shr    %ecx
  400a4b: 25 55 55 55 55          and    $0x55555555,%eax
  400a50: 53                      push   %rbx
  400a51: 81 e1 55 55 55 55       and    $0x55555555,%ecx
  400a57: 01 c1                   add    %eax,%ecx
  400a59: 41 89 c8                mov    %ecx,%r8d
  400a5c: 89 c8                   mov    %ecx,%eax
  400a5e: 41 c1 e8 02             shr    $0x2,%r8d
  400a62: 25 33 33 33 33          and    $0x33333333,%eax
  400a67: 41 81 e0 33 33 33 33    and    $0x33333333,%r8d
  400a6e: 41 01 c0                add    %eax,%r8d
  400a71: 45 89 c1                mov    %r8d,%r9d
  400a74: 44 89 c0                mov    %r8d,%eax
  400a77: 41 c1 e9 04             shr    $0x4,%r9d
  400a7b: 25 0f 0f 0f 0f          and    $0xf0f0f0f,%eax
  400a80: 41 81 e1 0f 0f 0f 0f    and    $0xf0f0f0f,%r9d
  400a87: 41 01 c1                add    %eax,%r9d
  400a8a: 44 89 c8                mov    %r9d,%eax
  400a8d: 44 89 ca                mov    %r9d,%edx
  400a90: c1 e8 08                shr    $0x8,%eax
  400a93: 81 e2 ff 00 ff 00       and    $0xff00ff,%edx
  400a99: 25 ff 00 ff 00          and    $0xff00ff,%eax
  400a9e: 01 d0                   add    %edx,%eax
  400aa0: 0f b6 d8                movzbl %al,%ebx
  400aa3: c1 e8 10                shr    $0x10,%eax
  400aa6: 0f b6 d0                movzbl %al,%edx
  400aa9: b8 20 00 00 00          mov    $0x20,%eax
  400aae: 01 da                   add    %ebx,%edx
  400ab0: 39 f2                   cmp    %esi,%edx
  400ab2: 77 0c                   ja     400ac0 <nth_bit_set+0x80>
  400ab4: 5b                      pop    %rbx
  400ab5: 5d                      pop    %rbp
  400ab6: c3                      retq   

  400ac0: 83 c6 01                add    $0x1,%esi
  400ac3: 89 dd                   mov    %ebx,%ebp
  400ac5: 29 f5                   sub    %esi,%ebp
  400ac7: 41 89 ea                mov    %ebp,%r10d
  400aca: c1 ed 08                shr    $0x8,%ebp
  400acd: 41 81 e2 00 01 00 00    and    $0x100,%r10d
  400ad4: 21 eb                   and    %ebp,%ebx
  400ad6: 41 c1 ea 04             shr    $0x4,%r10d
  400ada: 29 de                   sub    %ebx,%esi
  400adc: c4 42 2b f7 c9          shrx   %r10d,%r9d,%r9d
  400ae1: 41 0f b6 d9             movzbl %r9b,%ebx
  400ae5: 89 dd                   mov    %ebx,%ebp
  400ae7: 29 f5                   sub    %esi,%ebp
  400ae9: 41 89 e9                mov    %ebp,%r9d
  400aec: 41 81 e1 00 01 00 00    and    $0x100,%r9d
  400af3: 41 c1 e9 05             shr    $0x5,%r9d
  400af7: 47 8d 14 11             lea    (%r9,%r10,1),%r10d
  400afb: 41 89 e9                mov    %ebp,%r9d
  400afe: 41 c1 e9 08             shr    $0x8,%r9d
  400b02: c4 42 2b f7 c0          shrx   %r10d,%r8d,%r8d
  400b07: 41 83 e0 0f             and    $0xf,%r8d
  400b0b: 44 21 cb                and    %r9d,%ebx
  400b0e: 45 89 c3                mov    %r8d,%r11d
  400b11: 29 de                   sub    %ebx,%esi
  400b13: 5b                      pop    %rbx
  400b14: 41 29 f3                sub    %esi,%r11d
  400b17: 5d                      pop    %rbp
  400b18: 44 89 da                mov    %r11d,%edx
  400b1b: 41 c1 eb 08             shr    $0x8,%r11d
  400b1f: 81 e2 00 01 00 00       and    $0x100,%edx
  400b25: 45 21 d8                and    %r11d,%r8d
  400b28: c1 ea 06                shr    $0x6,%edx
  400b2b: 44 29 c6                sub    %r8d,%esi
  400b2e: 46 8d 0c 12             lea    (%rdx,%r10,1),%r9d
  400b32: c4 e2 33 f7 c9          shrx   %r9d,%ecx,%ecx
  400b37: 83 e1 03                and    $0x3,%ecx
  400b3a: 41 89 c8                mov    %ecx,%r8d
  400b3d: 41 29 f0                sub    %esi,%r8d
  400b40: 44 89 c0                mov    %r8d,%eax
  400b43: 41 c1 e8 08             shr    $0x8,%r8d
  400b47: 25 00 01 00 00          and    $0x100,%eax
  400b4c: 44 21 c1                and    %r8d,%ecx
  400b4f: c1 e8 07                shr    $0x7,%eax
  400b52: 29 ce                   sub    %ecx,%esi
  400b54: 42 8d 14 08             lea    (%rax,%r9,1),%edx
  400b58: c4 e2 6b f7 c7          shrx   %edx,%edi,%eax
  400b5d: 83 e0 01                and    $0x1,%eax
  400b60: 29 f0                   sub    %esi,%eax
  400b62: 25 00 01 00 00          and    $0x100,%eax
  400b67: c1 e8 08                shr    $0x8,%eax
  400b6a: 01 d0                   add    %edx,%eax
  400b6c: c3                      retq

When it is compiled as a separate compilation unit, timing on this machine is difficult, because the actual operation is as fast as calling a do-nothing function (also compiled in a separate compilation unit); essentially, the calculation is done during the latencies associated with the function call.

It seems to be slightly faster than my suggestion of a binary search,

unsigned int nth_bit_set(uint32_t value, unsigned int n)
{
    uint32_t      mask = 0x0000FFFFu;
    unsigned int  size = 16u;
    unsigned int  base = 0u;

    if (n++ >= __builtin_popcount(value))
        return 32;

    while (size > 0) {
        const unsigned int  count = __builtin_popcount(value & mask);
        if (n > count) {
            base += size;
            size >>= 1;
            mask |= mask << size;
        } else {
            size >>= 1;
            mask >>= size;
        }
    }

    return base;
}

where the loop is executed exactly five times, compiling to

00400ba0 <nth_bit_set>:
  400ba0: 83 c6 01                add    $0x1,%esi
  400ba3: 31 c0                   xor    %eax,%eax
  400ba5: b9 10 00 00 00          mov    $0x10,%ecx
  400baa: ba ff ff 00 00          mov    $0xffff,%edx
  400baf: 45 31 db                xor    %r11d,%r11d
  400bb2: 66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  400bb8: 41 89 c9                mov    %ecx,%r9d
  400bbb: 41 89 f8                mov    %edi,%r8d
  400bbe: 41 d0 e9                shr    %r9b
  400bc1: 41 21 d0                and    %edx,%r8d
  400bc4: c4 62 31 f7 d2          shlx   %r9d,%edx,%r10d
  400bc9: f3 45 0f b8 c0          popcnt %r8d,%r8d
  400bce: 41 09 d2                or     %edx,%r10d
  400bd1: 44 38 c6                cmp    %r8b,%sil
  400bd4: 41 0f 46 cb             cmovbe %r11d,%ecx
  400bd8: c4 e2 33 f7 d2          shrx   %r9d,%edx,%edx
  400bdd: 41 0f 47 d2             cmova  %r10d,%edx
  400be1: 01 c8                   add    %ecx,%eax
  400be3: 44 89 c9                mov    %r9d,%ecx
  400be6: 45 84 c9                test   %r9b,%r9b
  400be9: 75 cd                   jne    400bb8 <nth_bit_set+0x18>
  400beb: c3                      retq   

That is, the binary-search version completes within 31 cycles in 95% of calls, compared to 28 cycles for the bit-hack version; both run within 28 cycles in 50% of the cases. (The clear-lowest-bit loop version takes up to 56 cycles in 95% of calls, and up to 37 cycles in the median case.)

To determine which one is better in actual real-world code, one would have to do a proper benchmark within the real-world task; at least with current x86-64 architecture processors, the work done is easily hidden in latencies incurred elsewhere (like function calls).
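For what it is worth, a minimal sketch of a harness that could collect such per-call cycle counts is shown below. This is an illustrative assumption, not the harness behind the numbers above; __rdtsc() is x86-specific, its own overhead is included in every sample, and serializing instructions (or __rdtscp()) would be needed for truly precise per-call measurements.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>   /* __rdtsc(); x86-specific */

extern unsigned int nth_bit_set(uint32_t value, unsigned int n);

static int cmp_u64(const void *a, const void *b)
{
    const uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
    return (x > y) - (x < y);
}

int main(void)
{
    enum { CALLS = 1000000 };
    static uint64_t cycles[CALLS];

    /* sanity check against the question's example: i = 0x6B5, n = 4 -> 7 */
    if (nth_bit_set(0x6B5u, 4) != 7)
        return 1;

    for (size_t k = 0; k < CALLS; k++) {
        const uint32_t value = (uint32_t)rand() | 1u;  /* arbitrary nonzero input */
        const unsigned n = (unsigned)rand() % 32;
        const uint64_t start = __rdtsc();
        volatile unsigned r = nth_bit_set(value, n);   /* volatile keeps the call */
        cycles[k] = __rdtsc() - start;
        (void)r;
    }

    /* sort the samples and report the median and the 95th percentile */
    qsort(cycles, CALLS, sizeof cycles[0], cmp_u64);
    printf("median: %llu cycles, 95th percentile: %llu cycles\n",
           (unsigned long long)cycles[CALLS / 2],
           (unsigned long long)cycles[CALLS * 95 / 100]);
    return 0;
}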




Answer 2:


My answer is mostly based on this implementation of a 64-bit word select method (hint: look only at the MARISA_USE_POPCNT, MARISA_X64, and MARISA_USE_SSE3 code paths).

It works in two steps, first selecting the byte containing the n-th set bit and then using a lookup table inside the byte:

  • Extract the lower and higher nibbles for every byte (bitmasks 0xF, 0xF0, shift the higher nibbles down)
  • Replace the nibble values by their popcount (_mm_shuffle_epi8 with A000120)
  • Sum the popcounts of the lower and upper nibbles (Normal SSE addition) to get byte popcounts
  • Compute the prefix sum over all byte popcounts (multiplication with 0x01010101...)
  • Propagate the position n to all bytes (SSE broadcast or again multiplication with 0x01010101...)
  • Do a bytewise comparison (_mm_cmpgt_epi8 leaves 0xFF in every byte smaller than n)
  • Compute the byte offset by doing a popcount on the result

Now we know which byte contains the bit and a simple byte lookup table like in grek40's answer suffices to get the result.
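A rough C sketch of these steps for a 64-bit word might look as follows. This is only an illustration of the idea, not the MARISA code itself: it requires SSSE3, assumes n is less than the popcount of value, uses the made-up name nth_bit_set64, and for brevity finishes with a clear-lowest-bit loop where the in-byte lookup table would go.

#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

unsigned int nth_bit_set64(uint64_t value, unsigned int n)
{
    /* per-nibble popcounts of the values 0..15 (OEIS A000120) */
    const __m128i nibble_pop = _mm_setr_epi8(0,1,1,2, 1,2,2,3,
                                             1,2,2,3, 2,3,3,4);
    const __m128i low_nib = _mm_set1_epi8(0x0f);

    /* extract lower and higher nibbles, replace them by their popcounts,
       and sum them to get per-byte popcounts */
    __m128i v   = _mm_cvtsi64_si128((long long)value);
    __m128i lo  = _mm_and_si128(v, low_nib);
    __m128i hi  = _mm_and_si128(_mm_srli_epi64(v, 4), low_nib);
    __m128i pop = _mm_add_epi8(_mm_shuffle_epi8(nibble_pop, lo),
                               _mm_shuffle_epi8(nibble_pop, hi));

    /* prefix sums of the byte popcounts via multiplication with 0x0101... */
    uint64_t prefix = (uint64_t)_mm_cvtsi128_si64(pop) * 0x0101010101010101ull;

    /* broadcast n to all bytes and compare: the target byte is the first
       whose prefix sum exceeds n, so count the bytes below it */
    __m128i gt = _mm_cmpgt_epi8(_mm_cvtsi64_si128((long long)prefix),
                                _mm_set1_epi8((char)n));
    unsigned byte = (unsigned)__builtin_popcount(~_mm_movemask_epi8(gt) & 0xff);

    /* set bits below the target byte, then select inside the byte
       (a lookup table as in grek40's answer would replace this loop) */
    unsigned before = byte ? (unsigned)((prefix >> (8 * byte - 8)) & 0xff) : 0;
    unsigned rem = n - before;
    uint8_t  b = (uint8_t)(value >> (8 * byte));
    while (rem--)
        b &= (uint8_t)(b - 1);            /* clear the lowest set bit */
    return 8 * byte + (unsigned)__builtin_ctz(b);
}

The signed _mm_cmpgt_epi8 comparison is safe here because both the prefix sums (at most 64) and n (at most 63) stay well below 128.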

Note, however, that I have not actually benchmarked this against the other implementations; I have only observed it to be quite efficient (and branchless).




Answer 3:


Edit

After giving it some thought and using the __builtin_popcount function, I figured it might be better to decide on the relevant byte and then compute the whole result instead of incrementally adding/subtracting numbers. Here is an updated version:

int GetBitAtPosition(unsigned i, unsigned n)
{
    unsigned bitCount;

    bitCount = __builtin_popcount(i & 0x00ffffff);
    if (bitCount <= n)
    {
        return (24 + LUT_BitPosition[i >> 24][n - bitCount]);
    }

    bitCount = __builtin_popcount(i & 0x0000ffff);
    if (bitCount <= n)
    {
        return (16 + LUT_BitPosition[(i >> 16) & 0xff][n - bitCount]);
    }

    bitCount = __builtin_popcount(i & 0x000000ff);
    if (bitCount <= n)
    {
        return (8 + LUT_BitPosition[(i >> 8) & 0xff][n - bitCount]);
    }

    return LUT_BitPosition[i & 0xff][n];
}

I felt like creating a LUT-based solution where the number is inspected in byte chunks; however, the LUT for the n-th bit position grew quite large (256×8 entries), and the LUT-free version discussed in the comments might be better.

Generally the algorithm would look like this:

unsigned i = 0x000006B5;
unsigned n = 4;
unsigned result = 0;
unsigned bitCount;
while (i)
{
    bitCount = LUT_BitCount[i & 0xff];
    if (n < bitCount)
    {
        result += LUT_BitPosition[i & 0xff][n];
        break; // found
    }
    else
    {
        n -= bitCount;
        result += 8;
        i >>= 8;
    }
}

It might be worth unrolling the loop into its (at most) four iterations to get the best performance on 32-bit numbers, as sketched below.
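For illustration, such an unrolled variant might look like the following sketch (GetBitPositionUnrolled is a hypothetical name; it assumes n is less than the popcount of i and uses the LUTs defined below):

unsigned GetBitPositionUnrolled(unsigned i, unsigned n)
{
    unsigned result = 0, bitCount;

    /* four byte-sized steps instead of a loop */
    bitCount = LUT_BitCount[i & 0xff];
    if (n < bitCount)
        return result + LUT_BitPosition[i & 0xff][n];
    n -= bitCount; result += 8; i >>= 8;

    bitCount = LUT_BitCount[i & 0xff];
    if (n < bitCount)
        return result + LUT_BitPosition[i & 0xff][n];
    n -= bitCount; result += 8; i >>= 8;

    bitCount = LUT_BitCount[i & 0xff];
    if (n < bitCount)
        return result + LUT_BitPosition[i & 0xff][n];
    n -= bitCount; result += 8; i >>= 8;

    /* last byte; assumes n < popcount(i), so the lookup is in range */
    return result + LUT_BitPosition[i & 0xff][n];
}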

The LUT for bitcount (could be replaced by __builtin_popcount):

unsigned LUT_BitCount[] = {
    0, 1, 1, 2, 1, 2, 2, 3, // 0-7
    1, 2, 2, 3, 2, 3, 3, 4, // 8-15
    1, 2, 2, 3, 2, 3, 3, 4, // 16-23
    2, 3, 3, 4, 3, 4, 4, 5, // 24-31
    1, 2, 2, 3, 2, 3, 3, 4, // 32-39
    2, 3, 3, 4, 3, 4, 4, 5, // 40-47
    2, 3, 3, 4, 3, 4, 4, 5, // 48-55
    3, 4, 4, 5, 4, 5, 5, 6, // 56-63
    1, 2, 2, 3, 2, 3, 3, 4, // 64-71
    2, 3, 3, 4, 3, 4, 4, 5, // 72-79
    2, 3, 3, 4, 3, 4, 4, 5, // 80-87
    3, 4, 4, 5, 4, 5, 5, 6, // 88-95
    2, 3, 3, 4, 3, 4, 4, 5, // 96-103
    3, 4, 4, 5, 4, 5, 5, 6, // 104-111
    3, 4, 4, 5, 4, 5, 5, 6, // 112-119
    4, 5, 5, 6, 5, 6, 6, 7, // 120-127
    1, 2, 2, 3, 2, 3, 3, 4, // 128-135
    2, 3, 3, 4, 3, 4, 4, 5, // 136-143
    2, 3, 3, 4, 3, 4, 4, 5, // 144-151
    3, 4, 4, 5, 4, 5, 5, 6, // 152-159
    2, 3, 3, 4, 3, 4, 4, 5, // 160-167
    3, 4, 4, 5, 4, 5, 5, 6, // 168-175
    3, 4, 4, 5, 4, 5, 5, 6, // 176-183
    4, 5, 5, 6, 5, 6, 6, 7, // 184-191
    2, 3, 3, 4, 3, 4, 4, 5, // 192-199
    3, 4, 4, 5, 4, 5, 5, 6, // 200-207
    3, 4, 4, 5, 4, 5, 5, 6, // 208-215
    4, 5, 5, 6, 5, 6, 6, 7, // 216-223
    3, 4, 4, 5, 4, 5, 5, 6, // 224-231
    4, 5, 5, 6, 5, 6, 6, 7, // 232-239
    4, 5, 5, 6, 5, 6, 6, 7, // 240-247
    5, 6, 6, 7, 6, 7, 7, 8, // 248-255
};

The LUT for bit position within a byte:

unsigned LUT_BitPosition[][8] = {
    // 0-7
    {UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},

    // 8-15
    {3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},

    // 16-31
    {4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,4,UINT_MAX,UINT_MAX,UINT_MAX},

    // 32-63
    {5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,5,UINT_MAX,UINT_MAX,UINT_MAX},
    {4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,4,5,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,4,5,UINT_MAX,UINT_MAX},

    // 64-127
    {6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,4,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,4,6,UINT_MAX,UINT_MAX},
    {5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,5,6,UINT_MAX,UINT_MAX},
    {4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,4,5,6,UINT_MAX,UINT_MAX},
    {3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,4,5,6,UINT_MAX,UINT_MAX},
    {2,3,4,5,6,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,4,5,6,UINT_MAX,UINT_MAX},
    {1,2,3,4,5,6,UINT_MAX,UINT_MAX},
    {0,1,2,3,4,5,6,UINT_MAX},

    // 128-255
    {7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,4,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,4,7,UINT_MAX,UINT_MAX},
    {5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,5,7,UINT_MAX,UINT_MAX},
    {4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,4,5,7,UINT_MAX,UINT_MAX},
    {3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,4,5,7,UINT_MAX,UINT_MAX},
    {2,3,4,5,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,4,5,7,UINT_MAX,UINT_MAX},
    {1,2,3,4,5,7,UINT_MAX,UINT_MAX},
    {0,1,2,3,4,5,7,UINT_MAX},
    {6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,3,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,3,6,7,UINT_MAX,UINT_MAX},
    {4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,4,6,7,UINT_MAX,UINT_MAX},
    {3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,4,6,7,UINT_MAX,UINT_MAX},
    {2,3,4,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,4,6,7,UINT_MAX,UINT_MAX},
    {1,2,3,4,6,7,UINT_MAX,UINT_MAX},
    {0,1,2,3,4,6,7,UINT_MAX},
    {5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,2,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,2,5,6,7,UINT_MAX,UINT_MAX},
    {3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,3,5,6,7,UINT_MAX,UINT_MAX},
    {2,3,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,3,5,6,7,UINT_MAX,UINT_MAX},
    {1,2,3,5,6,7,UINT_MAX,UINT_MAX},
    {0,1,2,3,5,6,7,UINT_MAX},
    {4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {1,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,1,4,5,6,7,UINT_MAX,UINT_MAX},
    {2,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,2,4,5,6,7,UINT_MAX,UINT_MAX},
    {1,2,4,5,6,7,UINT_MAX,UINT_MAX},
    {0,1,2,4,5,6,7,UINT_MAX},
    {3,4,5,6,7,UINT_MAX,UINT_MAX,UINT_MAX},
    {0,3,4,5,6,7,UINT_MAX,UINT_MAX},
    {1,3,4,5,6,7,UINT_MAX,UINT_MAX},
    {0,1,3,4,5,6,7,UINT_MAX},
    {2,3,4,5,6,7,UINT_MAX,UINT_MAX},
    {0,2,3,4,5,6,7,UINT_MAX},
    {1,2,3,4,5,6,7,UINT_MAX},
    {0,1,2,3,4,5,6,7},
};
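Writing these 256-entry tables by hand is tedious and error-prone; a small throwaway generator along the following lines (a hypothetical helper matching the table format above) can emit LUT_BitPosition instead:

#include <stdio.h>

/* print the body of LUT_BitPosition in the format used above */
int main(void)
{
    for (unsigned byte = 0; byte < 256; byte++) {
        unsigned pos[8], count = 0;

        for (unsigned bit = 0; bit < 8; bit++)
            if (byte & (1u << bit))
                pos[count++] = bit;       /* positions of set bits, LSB first */

        printf("    {");
        for (unsigned k = 0; k < 8; k++) {
            if (k < count)
                printf("%u", pos[k]);
            else
                printf("UINT_MAX");       /* padding for "no such bit" */
            if (k < 7)
                printf(",");
        }
        printf("},\n");
    }
    return 0;
}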



Answer 4:


My approach is to calculate the population count of each 8-bit quarter of the 32-bit integer in parallel, then find which quarter contains the n-th bit. The population counts of the quarters below the found one are summed to form the starting value of the later calculation.

After that, set bits are counted one by one until n is reached. Branch-free, and using a partial implementation of the population-count algorithm, my example is the following:

#include <stdio.h>
#include <stdint.h>

int main() {
    uint32_t n = 10, test = 3124375902u; /* 10111010001110100011000101011110 */
    uint32_t index, popcnt, prefix, quarter = 0;

    /* count set bits of each quarter (byte) of the 32-bit integer in parallel */
    popcnt = test - ((test >> 1) & 0x55555555);
    popcnt = (popcnt & 0x33333333) + ((popcnt >> 2) & 0x33333333);
    popcnt = (popcnt + (popcnt >> 4)) & 0x0F0F0F0F;

    /* prefix sums via multiplication: byte k of prefix holds the popcount of bytes 0..k */
    prefix = popcnt * 0x01010101;

    /* find which quarter holds the nth set bit (branchless) */
    quarter += (n >= (prefix & 0xff));
    quarter += (n >= ((prefix >> 8) & 0xff));
    quarter += (n >= ((prefix >> 16) & 0xff));

    /* number of set bits below that quarter */
    popcnt = ((prefix << 8) >> (8 * quarter)) & 0xff;

    /* find the index of nth bit in quarter where it should be */
    index = 8 * quarter;
    index += ((popcnt += (test >> index) & 1) <= n);
    index += ((popcnt += (test >> index) & 1) <= n);
    index += ((popcnt += (test >> index) & 1) <= n);
    index += ((popcnt += (test >> index) & 1) <= n);
    index += ((popcnt += (test >> index) & 1) <= n);
    index += ((popcnt += (test >> index) & 1) <= n);
    index += ((popcnt += (test >> index) & 1) <= n);
    index += ((popcnt += (test >> index) & 1) <= n);

    printf("index = %u\n", index);
    return 0;
}

A simple approach using a loop and conditionals could be the following as well:

#include <stdio.h>
#include <stdint.h>

int main() {
    uint32_t n = 11, test = 3124375902u; /* 10111010001110100011000101011110 */
    uint32_t popcnt = 0, index = 0;
    while(popcnt += ((test >> index) & 1), popcnt <= n && ++index < 32); /* advance until n+1 set bits have been seen */

    printf("index = %u\n", index);
    return 0;
}



Answer 5:


Based on a method by Juha Järvi published in the famous Bit Twiddling Hacks, I tested this implementation where n and i are used as in the question:

unsigned int nth_bit_set(uint32_t i, unsigned int n)
{
    uint32_t a, b, c, r, s, t;

    /* per-2-bit, per-nibble, and per-byte popcounts of i */
    a = i - (i >> 1 & 0x55555555);
    b = (a & 0x33333333) + (a >> 2 & 0x33333333);
    c = b + (b >> 4) & 0x0f0f0f0f;

    r = n + 1;                 /* the rank (1-based) of the bit to find */
    s = 0;
    t = c + (c >> 8) & 0xff;   /* set bits in the lower 16 bits */

    if (r > t) {
        s += 16;
        r -= t;
    }

    t = c >> s & 0xf;          /* set bits in the lower 8 bits of the half */

    if (r > t) {
        s += 8;
        r -= t;
    }

    t = b >> s & 0x7;          /* set bits in the lower 4 bits of the byte */

    if (r > t) {
        s += 4;
        r -= t;
    }

    t = a >> s & 0x3;          /* set bits in the lower 2 bits of the nibble */

    if (r > t) {
        s += 2;
        r -= t;
    }

    t = i >> s & 0x1;          /* the lower bit of the final pair */

    if (r > t)
        s++;

    return (s);
}

Based on my own tests, this is about as fast as the loop on x86, whereas it is 20% faster on arm64 and probably a lot faster on arm due to the fast conditional instructions, but I can't test this right now.



Source: https://stackoverflow.com/questions/45482787/how-to-efficiently-find-the-n-th-set-bit
