I want to inflate an unsigned char
to an uint64_t
by repeating each bit 8 times. E.g.
char -> uint64_t
0x00 -> 0x00
0x01 -> 0xFF
Two minor optimizations:
One for testing the bits in the input (a will be destroyed but this doesn't matter)
The other for shifting the mask.
/*
 * Expand every bit of `a` into a whole byte of the result.
 *
 * Bit i of `a` controls byte i of the return value (bits 8*i .. 8*i+7):
 * the byte is 0xFF when the bit is set and 0x00 when it is clear.
 * `a` is a by-value copy, so consuming it with shifts is harmless.
 */
static uint64_t inflate(unsigned char a)
{
    uint64_t expanded = 0;

    /* Walk the input from its least significant bit upward; `shift` is the
     * bit offset of the output byte that corresponds to the current bit. */
    for (unsigned shift = 0; shift < 64; shift += 8, a >>= 1) {
        if (a & 1) {
            expanded |= (uint64_t)0xFF << shift;
        }
    }

    return expanded;
}
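You may also be able to replace the 'for (int i = 0; i < 8; i++)' loop with a 'while (a)' loop. This only works if the right shift a >>= 1 is a logical (unsigned) shift: for negative signed values the C standard leaves the result of a right shift implementation-defined, and an arithmetic shift could produce an infinite loop. Since a is an unsigned char here, its promoted value is never negative, so the shift always behaves as an unsigned shift and the loop terminates.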
EDIT:
To see the result I compiled both variants with gcc -std=c99 -S source.c.
A quick glance at the resulting assembler output shows that the optimization shown above yields about 1/3 fewer instructions, most of them inside the loop.
Variations on the same theme as @Aki's answer. Some of them are better here, but it may depend on your compiler and target machine (they should be more suitable for a superscalar processor than Aki's function, even if they do more work, as there are fewer data dependencies).
// Aki Suihkonen: 1.265
/*
 * Branch-free bit-to-byte expansion: bit i of `a` becomes byte i of the
 * result (0xFF if set, 0x00 if clear).
 */
static uint64_t inflate_parallel1(unsigned char a) {
    const uint64_t every_byte_lsb = 0x0101010101010101ULL;

    /* Copy the input byte into all eight byte lanes. */
    uint64_t lanes = a * every_byte_lsb;

    /* Lane i keeps only bit i of its copy. */
    lanes &= 0x8040201008040201ULL;

    /* Per-lane addend is 0x80 minus the lane's bit weight, so a surviving
     * bit sums to exactly 0x80 (the lane's top bit) without carrying into
     * the neighbouring lane. */
    lanes += 0x00406070787c7e7fULL;

    /* Bring each lane's top bit down to its bottom bit and drop the rest. */
    lanes >>= 7;
    lanes &= every_byte_lsb;

    /* 0x01 -> 0xFF in every flagged lane. */
    return lanes * 255;
}
// By seizet and then combine: 1.583
/*
 * Bit-to-byte expansion using two independent multiplies: bit i of `a`
 * becomes byte i of the result (0xFF if set, 0x00 if clear).
 *
 * Each multiplier places four shifted, non-overlapping copies of `a` so
 * that the wanted bit of each copy lands on the lowest bit of its target
 * byte; the mask then discards everything else.
 */
static uint64_t inflate_parallel2(unsigned char a) {
    /* Bits 1, 3, 5, 7 -> lowest bit of bytes 1, 3, 5, 7. */
    const uint64_t odd_lanes =
        (a * 0x0002000800200080ULL) & 0x0100010001000100ULL;

    /* Bits 0, 2, 4, 6 -> lowest bit of bytes 0, 2, 4, 6. */
    const uint64_t even_lanes =
        (a * 0x0000040010004001ULL) & 0x0001000100010001ULL;

    /* Merge both halves and widen each 0x01 flag to 0xFF. */
    return (odd_lanes | even_lanes) * 255;
}
// Stay in 32 bits as much as possible: 1.006
/*
 * Bit-to-byte expansion done as two 32-bit halves: bit i of `a` becomes
 * byte i of the result (0xFF if set, 0x00 if clear).
 *
 * Each nibble is multiplied by 0x00204081 (copies at bit offsets 0, 7, 14
 * and 21) so that nibble bit k lands on bit 8*k, masked down to one flag
 * per byte, and then widened to 0xFF by multiplying with 255.
 *
 * All constants are unsigned on purpose: with plain int operands the final
 * "* 255" reaches 0xFF000000 or more whenever bit 3 of a nibble is set,
 * which overflows a 32-bit signed int and is undefined behaviour.
 */
static uint64_t inflate_parallel3(unsigned char a) {
    /* Low nibble -> bytes 0..3 of the result. */
    uint32_t low_half = (((a & 0x0FU) * 0x00204081U) & 0x01010101U) * 255U;

    /* High nibble -> bytes 4..7 of the result. */
    uint32_t high_half =
        ((((a & 0xF0U) >> 4) * 0x00204081U) & 0x01010101U) * 255U;

    return (((uint64_t)high_half) << 32) | low_half;
}
// Do the common computation in 64 bits: 0.915
/*
 * Bit-to-byte expansion: bit i of `a` becomes byte i of the result
 * (0xFF if set, 0x00 if clear).
 *
 * The two nibbles are spread out with 32-bit multiplies, then joined so
 * that the masking and the widening to 0xFF happen once, in 64 bits.
 */
static uint64_t inflate_parallel4(unsigned char a) {
    /* Multiplying by 0x00204081 moves nibble bit k onto bit 8*k
     * (other bits are garbage until masked below). */
    const uint32_t low_spread  = (a & 0x0F) * 0x00204081;
    const uint32_t high_spread = ((a & 0xF0) >> 4) * 0x00204081;

    /* High nibble occupies the upper 32 bits, low nibble the lower 32. */
    uint64_t flags = ((uint64_t)high_spread) << 32;
    flags |= low_spread;

    /* Keep one flag bit per byte, then turn each 0x01 into 0xFF. */
    flags &= 0x0101010101010101ULL;
    return flags * 255;
}
// Some computation is done in 64 bits a little sooner: 0.806
/*
 * Bit-to-byte expansion: bit i of `a` becomes byte i of the result
 * (0xFF if set, 0x00 if clear).
 *
 * Same idea as spreading each nibble with 0x00204081, but the high nibble
 * is multiplied directly in 64 bits so no separate shift is needed.
 */
static uint64_t inflate_parallel5(unsigned char a) {
    /* The high nibble is left in place (already 4 bits up); the multiplier
     * is 0x00204081 pre-shifted by 28, so the product lands 32 bits up. */
    const uint64_t high_spread = (a & 0xF0) * 0x002040810000000ULL;

    /* Low nibble: nibble bit k moves onto bit 8*k of the lower word. */
    const uint32_t low_spread = (a & 0x0F) * 0x00204081;

    /* Keep one flag bit per byte, then turn each 0x01 into 0xFF. */
    const uint64_t flags =
        (high_spread | low_spread) & 0x0101010101010101ULL;
    return flags * 255;
}