Here are two ways to set an individual bit in C on x86-64:
inline void SetBitC(long *array, int bit) {
//Pure C version
*array |= 1<
For such code:
#include <stdio.h>
#include <time.h>

/*
 * Micro-benchmark: sets bit 10 of a 64-bit variable 2^32 times, first with a
 * plain C OR and then with an inline-asm `bts`, and prints how many whole
 * seconds each loop took.
 */
int main(void) {
    /* volatile forces a real read-modify-write of i on every iteration */
    volatile long long i = 0;

    /* Pure C version */
    time_t start = time (NULL);
    for (long long n = 0; n < (1LL << 32); n++) {
        i |= 1 << 10;
    }
    time_t end = time (NULL);
    printf("C took %ds\n", (int)(end - start));

    /* Inline-asm version: i lives in a register that is both the input
     * (matching constraint "[i]") and the output; the bit index is an
     * immediate operand. */
    start = time (NULL);
    for (long long n = 0; n < (1LL << 32); n++) {
        __asm__ ("bts %[bit], %[i]"
                 : [i] "=r"(i)
                 : "[i]"(i), [bit] "i" (10));
    }
    end = time (NULL);
    printf("ASM took %ds\n", (int)(end - start));
    return 0;
}
The result was:
C took 12s
ASM took 10s
My flags were -std=gnu99 -O2 -march=core2, using gcc 4.4.2. Without the volatile, the loop was optimized out.
There was no difference with:
// Variant of the asm statement: i is a read-write memory operand ("+m")
// and the bit index 10 is passed in a register ("r").
__asm__ ("bts %[bit], %[i]"
: [i] "+m"(i)
: [bit] "r" (10));
So probably the answer is: no one cares. In a microbenchmark the only difference is the one between those two methods, but in real life I believe such code does not take much CPU time.
Additionally for such code:
#include <stdio.h>
#include <time.h>

/*
 * Micro-benchmark with a varying bit index: sets bit (n % 32) of a 64-bit
 * variable for n = 0 .. 2^32 - 1, first with a plain C OR and then with an
 * inline-asm `bts`, and prints how many whole seconds each loop took.
 */
int main(void) {
    /* volatile forces a real read-modify-write of i on every iteration */
    volatile long long i = 0;

    /* Pure C version.
     * 1LL keeps the shift in 64-bit arithmetic: with a plain int 1, a shift
     * count of 31 overflows int (undefined behaviour) and the sign-extended
     * result would also set the upper 32 bits of i, unlike the bts loop. */
    time_t start = time (NULL);
    for (long long n = 0; n < (1LL << 32); n++) {
        i |= 1LL << (n % 32);
    }
    time_t end = time (NULL);
    printf("C took %ds\n", (int)(end - start));

    /* Inline-asm version: i is a read-write memory operand and the bit
     * index is passed in a register. */
    start = time (NULL);
    for (long long n = 0; n < (1LL << 32); n++) {
        __asm__ ("bts %[bit], %[i]"
                 : [i] "+m"(i)
                 : [bit] "r" (n % 32));
    }
    end = time (NULL);
    printf("ASM took %ds\n", (int)(end - start));
    return 0;
}
The result was:
C took 9s
ASM took 10s
Both results were 'stable'. Testing CPU 'Intel(R) Core(TM)2 Duo CPU T9600 @ 2.80GHz'.