Find nth SET bit in an int

Asked by 栀梦 · 2020-12-14 18:14

Instead of just the lowest set bit, I want to find the position of the nth lowest set bit. (I'm NOT talking about the value in the nth bit position.)

11 Answers
  •  时光说笑
    2020-12-14 18:46

    The version from Bit Twiddling Hacks adapted to this case is, for example,

    #include <stdint.h>

    /* Return the position of the n'th (0-based) set bit in value,
     * or 32 if fewer than n+1 bits are set.  pop2..pop16 hold the
     * popcounts of each 2-, 4-, 8- and 16-bit field; pop32 is the
     * total popcount. */
    unsigned int nth_bit_set(uint32_t value, unsigned int n)
    {
        const uint32_t  pop2  = (value & 0x55555555u) + ((value >> 1) & 0x55555555u);
        const uint32_t  pop4  = (pop2  & 0x33333333u) + ((pop2  >> 2) & 0x33333333u);
        const uint32_t  pop8  = (pop4  & 0x0f0f0f0fu) + ((pop4  >> 4) & 0x0f0f0f0fu);
        const uint32_t  pop16 = (pop8  & 0x00ff00ffu) + ((pop8  >> 8) & 0x00ff00ffu);
        const uint32_t  pop32 = (pop16 & 0x000000ffu) + ((pop16 >>16) & 0x000000ffu);
        unsigned int    rank  = 0;
        unsigned int    temp;
    
        if (n++ >= pop32)
            return 32;
    
        temp = pop16 & 0xffu;
        /* if (n > temp) { n -= temp; rank += 16; } */
        rank += ((temp - n) & 256) >> 4;
        n -= temp & ((temp - n) >> 8);
    
        temp = (pop8 >> rank) & 0xffu;
        /* if (n > temp) { n -= temp; rank += 8; } */
        rank += ((temp - n) & 256) >> 5;
        n -= temp & ((temp - n) >> 8);
    
        temp = (pop4 >> rank) & 0x0fu;
        /* if (n > temp) { n -= temp; rank += 4; } */
        rank += ((temp - n) & 256) >> 6;
        n -= temp & ((temp - n) >> 8);
    
        temp = (pop2 >> rank) & 0x03u;
        /* if (n > temp) { n -= temp; rank += 2; } */
        rank += ((temp - n) & 256) >> 7;
        n -= temp & ((temp - n) >> 8);
    
        temp = (value >> rank) & 0x01u;
        /* if (n > temp) rank += 1; */
        rank += ((temp - n) & 256) >> 8;
    
        return rank;
    }
    

    which, when compiled in a separate compilation unit with gcc-5.4.0 using -Wall -O3 -march=native -mtune=native on an Intel Core i5-4200U, yields

    00400a40 <nth_bit_set>:
      400a40: 89 f9                   mov    %edi,%ecx
      400a42: 89 f8                   mov    %edi,%eax
      400a44: 55                      push   %rbp
      400a45: 40 0f b6 f6             movzbl %sil,%esi
      400a49: d1 e9                   shr    %ecx
      400a4b: 25 55 55 55 55          and    $0x55555555,%eax
      400a50: 53                      push   %rbx
      400a51: 81 e1 55 55 55 55       and    $0x55555555,%ecx
      400a57: 01 c1                   add    %eax,%ecx
      400a59: 41 89 c8                mov    %ecx,%r8d
      400a5c: 89 c8                   mov    %ecx,%eax
      400a5e: 41 c1 e8 02             shr    $0x2,%r8d
      400a62: 25 33 33 33 33          and    $0x33333333,%eax
      400a67: 41 81 e0 33 33 33 33    and    $0x33333333,%r8d
      400a6e: 41 01 c0                add    %eax,%r8d
      400a71: 45 89 c1                mov    %r8d,%r9d
      400a74: 44 89 c0                mov    %r8d,%eax
      400a77: 41 c1 e9 04             shr    $0x4,%r9d
      400a7b: 25 0f 0f 0f 0f          and    $0xf0f0f0f,%eax
      400a80: 41 81 e1 0f 0f 0f 0f    and    $0xf0f0f0f,%r9d
      400a87: 41 01 c1                add    %eax,%r9d
      400a8a: 44 89 c8                mov    %r9d,%eax
      400a8d: 44 89 ca                mov    %r9d,%edx
      400a90: c1 e8 08                shr    $0x8,%eax
      400a93: 81 e2 ff 00 ff 00       and    $0xff00ff,%edx
      400a99: 25 ff 00 ff 00          and    $0xff00ff,%eax
      400a9e: 01 d0                   add    %edx,%eax
      400aa0: 0f b6 d8                movzbl %al,%ebx
      400aa3: c1 e8 10                shr    $0x10,%eax
      400aa6: 0f b6 d0                movzbl %al,%edx
      400aa9: b8 20 00 00 00          mov    $0x20,%eax
      400aae: 01 da                   add    %ebx,%edx
      400ab0: 39 f2                   cmp    %esi,%edx
      400ab2: 77 0c                   ja     400ac0 <nth_bit_set+0x80>
      400ab4: 5b                      pop    %rbx
      400ab5: 5d                      pop    %rbp
      400ab6: c3                      retq   
    
      400ac0: 83 c6 01                add    $0x1,%esi
      400ac3: 89 dd                   mov    %ebx,%ebp
      400ac5: 29 f5                   sub    %esi,%ebp
      400ac7: 41 89 ea                mov    %ebp,%r10d
      400aca: c1 ed 08                shr    $0x8,%ebp
      400acd: 41 81 e2 00 01 00 00    and    $0x100,%r10d
      400ad4: 21 eb                   and    %ebp,%ebx
      400ad6: 41 c1 ea 04             shr    $0x4,%r10d
      400ada: 29 de                   sub    %ebx,%esi
      400adc: c4 42 2b f7 c9          shrx   %r10d,%r9d,%r9d
      400ae1: 41 0f b6 d9             movzbl %r9b,%ebx
      400ae5: 89 dd                   mov    %ebx,%ebp
      400ae7: 29 f5                   sub    %esi,%ebp
      400ae9: 41 89 e9                mov    %ebp,%r9d
      400aec: 41 81 e1 00 01 00 00    and    $0x100,%r9d
      400af3: 41 c1 e9 05             shr    $0x5,%r9d
      400af7: 47 8d 14 11             lea    (%r9,%r10,1),%r10d
      400afb: 41 89 e9                mov    %ebp,%r9d
      400afe: 41 c1 e9 08             shr    $0x8,%r9d
      400b02: c4 42 2b f7 c0          shrx   %r10d,%r8d,%r8d
      400b07: 41 83 e0 0f             and    $0xf,%r8d
      400b0b: 44 21 cb                and    %r9d,%ebx
      400b0e: 45 89 c3                mov    %r8d,%r11d
      400b11: 29 de                   sub    %ebx,%esi
      400b13: 5b                      pop    %rbx
      400b14: 41 29 f3                sub    %esi,%r11d
      400b17: 5d                      pop    %rbp
      400b18: 44 89 da                mov    %r11d,%edx
      400b1b: 41 c1 eb 08             shr    $0x8,%r11d
      400b1f: 81 e2 00 01 00 00       and    $0x100,%edx
      400b25: 45 21 d8                and    %r11d,%r8d
      400b28: c1 ea 06                shr    $0x6,%edx
      400b2b: 44 29 c6                sub    %r8d,%esi
      400b2e: 46 8d 0c 12             lea    (%rdx,%r10,1),%r9d
      400b32: c4 e2 33 f7 c9          shrx   %r9d,%ecx,%ecx
      400b37: 83 e1 03                and    $0x3,%ecx
      400b3a: 41 89 c8                mov    %ecx,%r8d
      400b3d: 41 29 f0                sub    %esi,%r8d
      400b40: 44 89 c0                mov    %r8d,%eax
      400b43: 41 c1 e8 08             shr    $0x8,%r8d
      400b47: 25 00 01 00 00          and    $0x100,%eax
      400b4c: 44 21 c1                and    %r8d,%ecx
      400b4f: c1 e8 07                shr    $0x7,%eax
      400b52: 29 ce                   sub    %ecx,%esi
      400b54: 42 8d 14 08             lea    (%rax,%r9,1),%edx
      400b58: c4 e2 6b f7 c7          shrx   %edx,%edi,%eax
      400b5d: 83 e0 01                and    $0x1,%eax
      400b60: 29 f0                   sub    %esi,%eax
      400b62: 25 00 01 00 00          and    $0x100,%eax
      400b67: c1 e8 08                shr    $0x8,%eax
      400b6a: 01 d0                   add    %edx,%eax
      400b6c: c3                      retq
    

    When the function is compiled as a separate compilation unit, timing it on this machine is difficult, because the actual operation is as fast as calling a do-nothing function (also compiled in a separate compilation unit); essentially, the calculation is done during the latencies associated with the function call.
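
    For reference, here is a minimal sanity-check sketch of how the function above is meant to be called; the example value and assertions are mine, not part of the original answer (n is zero-based, and 32 is returned when fewer than n+1 bits are set):

    #include <assert.h>
    #include <stdint.h>

    unsigned int nth_bit_set(uint32_t value, unsigned int n);  /* defined above */

    int main(void)
    {
        /* 0xB4 = 10110100 in binary: set bits at positions 2, 4, 5 and 7. */
        assert(nth_bit_set(0xB4u, 0) ==  2);  /* lowest set bit */
        assert(nth_bit_set(0xB4u, 3) ==  7);  /* fourth-lowest set bit */
        assert(nth_bit_set(0xB4u, 4) == 32);  /* only four bits are set */
        return 0;
    }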

    The bit-hack version seems to be slightly faster than my suggestion of a binary search,

    #include <stdint.h>

    /* Binary search over popcounts; same interface and return
     * convention as the bit-hack version above. */
    unsigned int nth_bit_set(uint32_t value, unsigned int n)
    {
        uint32_t      mask = 0x0000FFFFu;
        unsigned int  size = 16u;
        unsigned int  base = 0u;
    
        if (n++ >= __builtin_popcount(value))
            return 32;
    
        while (size > 0) {
            const unsigned int  count = __builtin_popcount(value & mask);
            if (n > count) {
                base += size;
                size >>= 1;
                mask |= mask << size;
            } else {
                size >>= 1;
                mask >>= size;
            }
        }
    
        return base;
    }
    

    where the loop is executed exactly five times, compiling to

    00400ba0 <nth_bit_set>:
      400ba0: 83 c6 01                add    $0x1,%esi
      400ba3: 31 c0                   xor    %eax,%eax
      400ba5: b9 10 00 00 00          mov    $0x10,%ecx
      400baa: ba ff ff 00 00          mov    $0xffff,%edx
      400baf: 45 31 db                xor    %r11d,%r11d
      400bb2: 66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
      400bb8: 41 89 c9                mov    %ecx,%r9d
      400bbb: 41 89 f8                mov    %edi,%r8d
      400bbe: 41 d0 e9                shr    %r9b
      400bc1: 41 21 d0                and    %edx,%r8d
      400bc4: c4 62 31 f7 d2          shlx   %r9d,%edx,%r10d
      400bc9: f3 45 0f b8 c0          popcnt %r8d,%r8d
      400bce: 41 09 d2                or     %edx,%r10d
      400bd1: 44 38 c6                cmp    %r8b,%sil
      400bd4: 41 0f 46 cb             cmovbe %r11d,%ecx
      400bd8: c4 e2 33 f7 d2          shrx   %r9d,%edx,%edx
      400bdd: 41 0f 47 d2             cmova  %r10d,%edx
      400be1: 01 c8                   add    %ecx,%eax
      400be3: 44 89 c9                mov    %r9d,%ecx
      400be6: 45 84 c9                test   %r9b,%r9b
      400be9: 75 cd                   jne    400bb8 <nth_bit_set+0x18>
      400beb: c3                      retq   
    

    That is, 95% of calls to the binary-search version complete within 31 cycles, compared to 28 cycles for the bit-hack version; both run within 28 cycles in 50% of the cases. (The loop version takes up to 56 cycles in 95% of calls, with a median of 37 cycles; a reconstruction of such a loop version is sketched below.)
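
    The loop version is not shown in this answer; a plausible reconstruction (my assumption, not the author's code) clears the lowest set bit n times with value &= value - 1, then reads off the position of the lowest remaining bit:

    #include <stdint.h>

    unsigned int nth_bit_set_loop(uint32_t value, unsigned int n)
    {
        /* Clear the lowest set bit n times. */
        while (n-- > 0)
            value &= value - 1;

        /* Fewer than n+1 bits were set: keep the convention of returning 32. */
        if (!value)
            return 32;

        /* Position of the lowest remaining set bit (gcc/clang builtin;
           safe here because value is known to be nonzero). */
        return (unsigned int)__builtin_ctz(value);
    }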

    To determine which one is better in real-world code, one would have to do a proper benchmark within the real-world task; at least on current x86-64 processors, the work done is easily hidden in latencies incurred elsewhere (such as function calls).
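
    As a rough starting point for such a benchmark, here is a throughput-style harness sketch of my own (the iteration count and input mix are arbitrary; it measures average cost per call rather than the per-call cycle percentiles quoted above, and the volatile sink merely keeps the compiler from discarding the calls):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    unsigned int nth_bit_set(uint32_t value, unsigned int n);  /* version under test */

    int main(void)
    {
        volatile unsigned int  sink = 0;
        struct timespec        t0, t1;
        uint32_t               x = 0x12345678u;
        const unsigned int     iters = 100000000u;

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (unsigned int i = 0; i < iters; i++) {
            sink = nth_bit_set(x, i & 31u);
            x = x * 1664525u + 1013904223u;  /* cheap LCG to vary the input */
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);

        const double ns = (double)(t1.tv_sec - t0.tv_sec) * 1e9
                        + (double)(t1.tv_nsec - t0.tv_nsec);
        printf("%.2f ns/call (sink = %u)\n", ns / (double)iters, sink);
        return 0;
    }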
