Get an array of the bit positions within a 64-bit integer

前端 未结 8 1364
囚心锁ツ
囚心锁ツ 2020-12-30 06:07

OK, it may sound a bit complicated, but this is what I'm trying to do:

  • Take e.g. 10101010101
  • And return { 0, 2, 4, 6, 8, 10 }
8条回答
  •  小鲜肉
    小鲜肉 (楼主)
    2020-12-30 06:53

    I have been wondering whether it would work faster with the bsf assembly instruction. So I tried 3 implementations and got the following results for 10 million iterations:

    Your implementation (Dr.Kameleon) 1.77 seconds

    The log2() implementation (icepack) 2.17 seconds

    My assembly implementation (me) 0.16 seconds

    Output:

    bits version:
    Function started at 0
               ended at 177
                  spent 177 (1.770000 seconds)
    c version:
    Function started at 177
               ended at 394
                  spent 217 (2.170000 seconds)
    c version:
    Function started at 394
               ended at 410
                  spent 16 (0.160000 seconds)
    

    One point about C/C++: a static array inside a function is horrendous. It is actually compiled into a list of CPU instructions (NOT what I would expect either!!!). Instead, declare the array outside your function in an unnamed namespace. That will have the expected effect. In assembly, though, you can emit the data with .long (or some other size directive) and then use %rip to reference the data relative to the instruction pointer.

    Note: once compiled, I do not see the size (n) being used in my assembly version so I'm not too sure whether the returned array is valid. Outside of that, the code itself becomes a loop of 5 assembly instructions, hence the tiny increase in speed (about x10).

    The reason for the slowness of log2() is that it converts the number to an xmm register and then calls another function. It then converts the xmm register back to a regular register.

    // NOTE: the header names of the original eight #include lines were lost
    // when this listing was extracted; these are the headers the code needs.
    #include <sys/times.h>
    #include <unistd.h>

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>
    
    using namespace std;
    
    // File-local lookup data for the de Bruijn bit scan performed by firstBit().
    namespace
    {
    // Multiplier applied to the isolated bit; firstBit() keeps the top six bits
    // of the 64-bit product and uses them as an index into index64.
    constexpr uint64_t debruijn64 = 0x07EDD5E59A4E28C2ULL;

    // Bit position for each possible six-bit index produced by the
    // multiplication above.
    constexpr int index64[64] = {
        63,  0, 58,  1, 59, 47, 53,  2, 60, 39, 48, 27, 54, 33, 42,  3,
        61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22,  4,
        62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21,
        56, 45, 25, 31, 35, 16,  9, 12, 44, 24, 15,  8, 23,  7,  6,  5
    };
    }
    
    // Returns the index of the least significant set bit of `bitboard`, using
    // a de Bruijn multiplication followed by a lookup in index64.
    // For bitboard == 0 the isolated value is 0, so index64[0] is returned.
    int firstBit(uint64_t bitboard)
    {
        // Two's-complement negation spelled out: keeps only the lowest set bit.
        const uint64_t lowestBit = bitboard & (~bitboard + 1);
        // The top six bits of the product select the table entry.
        const uint64_t slot = (lowestBit * debruijn64) >> 58;
        return index64[slot];
    }
    
    vector bits(uint64_t bitboard)
    {
        vector res;
        res.reserve(64);
    
        while(bitboard)
        {
            int first = firstBit(bitboard);
            res.push_back(first);
    
            bitboard &= ~(1ULL << first);
        }
        return res;
    }
    
    
    
    // Returns the positions of all set bits of `bitboard`, lowest position
    // first, computing each position as log2() of the isolated lowest bit.
    // An input of 0 yields an empty vector.
    std::vector<int> bits_c(uint64_t bitboard)
    {
        std::vector<int> positions;
        positions.reserve(64);   // a 64-bit word has at most 64 set bits

        // Each iteration handles the lowest set bit, then clears it.
        for (; bitboard != 0; bitboard &= (bitboard - 1))
        {
            // Isolate the lowest set bit; it is a power of two and therefore
            // exactly representable as a double.
            const uint64_t lowestBit = bitboard & ~(bitboard - 1);
            positions.push_back(
                static_cast<int>(std::log2(static_cast<double>(lowestBit))));
        }
        return positions;
    }
    
    
    // Returns the positions of all set bits of `bitboard`, lowest position
    // first. An input of 0 yields an empty vector.
    //
    // On x86-64 with a GCC-compatible compiler the scan is done with the
    // bsf/btr instructions in inline assembly; every other target uses a plain
    // C++ loop that produces the same result.
    std::vector<int> bits_asm(uint64_t bitboard)
    {
        int res[64];     // only the first n entries are written and read
        int64_t n = 0;   // number of positions stored in res

    #if defined(__GNUC__) && defined(__x86_64__)
        uint64_t bit;    // scratch register holding the current bit index
        asm volatile(
            "bsf %[b], %[i]\n\t"                  // i = lowest set bit; ZF=1 if b == 0
            "jz 2f\n\t"                           // nothing set: skip the loop
            ".p2align 4\n"
        "1:\n\t"
            "movl %k[i], (%[r],%[n],4)\n\t"       // res[n] = i
            "btr %[i], %[b]\n\t"                  // clear bit i in b
            "inc %[n]\n\t"
            "bsf %[b], %[i]\n\t"
            "jnz 1b\n"                            // repeat while b still has bits set
        "2:\n\t"
            // n and bitboard are read and written; bit is written before the
            // inputs are consumed, hence the early-clobber.
            : [n] "+r" (n), [b] "+r" (bitboard), [i] "=&r" (bit)
            : [r] "r" (res)
            // "memory" because the asm stores into res behind the compiler's back.
            : "cc", "memory"
        );
    #else
        for (int bit = 0; bit < 64; ++bit)
        {
            if ((bitboard >> bit) & 1u)
            {
                res[n++] = bit;
            }
        }
    #endif

        return std::vector<int>(res, res + n);
    }
    
    
    
    
    // Records process times via times() at start() and stop() and prints the
    // user-time difference between the two samples.
    class run_timer
    {
    public:
        // Samples the process times that mark the beginning of the interval.
        void start()
        {
            times(&f_start);
        }

        // Samples the process times that mark the end of the interval.
        void stop()
        {
            times(&f_stop);
        }

        // Prints `msg` followed by the start/stop user-time tick counts, their
        // difference, and that difference converted to seconds using
        // sysconf(_SC_CLK_TCK).
        void report(const char *msg) const
        {
            // clock_t is not guaranteed to be `long`; convert explicitly so the
            // values match the %ld conversions below.
            const long began = static_cast<long>(f_start.tms_utime);
            const long ended = static_cast<long>(f_stop.tms_utime);
            const long spent = static_cast<long>(f_stop.tms_utime - f_start.tms_utime);
            const double ticksPerSecond = static_cast<double>(sysconf(_SC_CLK_TCK));

            std::printf("%s:\n"
                        "Function started at %ld\n"
                        "           ended at %ld\n"
                        "              spent %ld (%f seconds)\n",
                        msg,
                        began,
                        ended,
                        spent,
                        static_cast<double>(spent) / ticksPerSecond);
        }

        // Zero-initialized so report() prints defined values even when it is
        // called before start()/stop().
        struct tms f_start{};
        struct tms f_stop{};
    };
    
    
    // Times each of the three implementations over the same number of calls
    // and prints one report per implementation.
    int main(int, char *[])
    {
        const int iterations = 10000000;
        run_timer t;

        // rand() returns a non-negative int, so only the low bits of the
        // 64-bit argument are ever set in these runs.
        t.start();
        for (int i = 0; i < iterations; ++i)
        {
            bits(rand());
        }
        t.stop();
        t.report("bits version");

        t.start();
        for (int i = 0; i < iterations; ++i)
        {
            bits_c(rand());
        }
        t.stop();
        t.report("c version");

        t.start();
        for (int i = 0; i < iterations; ++i)
        {
            bits_asm(rand());
        }
        t.stop();
        t.report("asm version");   // this run times bits_asm, not bits_c

        return 0;
    }
    

    Compiled with g++ with this command line:

    c++ -msse4.2 -O2 -o bits bits.cpp
    

    Although you may think that the -msse4.2 could be the problem with the log2() version, I tried without it and log2() is slower then.

    Btw, I do not recommend this method since it is not portable. Only x86-family (Intel/AMD) processors understand those instructions.

提交回复
热议问题