I want to inflate an unsigned char
to an uint64_t
by repeating each bit 8 times. E.g.
char -> uint64_t
0x00 -> 0x00
0x01 ->
Here is one more method using only simple arithmetics:
uint64_t inflate_chqrlie(uint8_t value) {
uint64_t x = value;
x = (x | (x << 28));
x = (x | (x << 14));
x = (x | (x << 7)) & 0x0101010101010101ULL;
x = (x << 8) - x;
return x;
}
Another very efficient and concise one by phuclv using multiplication and mask:
static uint64_t inflate_phuclv(uint8_t b) {
uint64_t MAGIC = 0x8040201008040201ULL;
uint64_t MASK = 0x8080808080808080ULL;
return ((MAGIC * b) & MASK) >> 7;
}
And another with a small lookup table:
static uint32_t const lut_4_32[16] = {
0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF,
0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF,
0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF,
};
static uint64_t inflate_lut32(uint8_t b) {
return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
}
I wrote a benchmarking program to determine relative performance of the different approaches on my system (x86_64-apple-darwin16.7.0, Apple LLVM version 9.0.0 (clang-900.0.39.2, clang -O3).
The results show that my function inflate_chqrlie
is faster than naive approaches but slower than other elaborate versions, all of which are beaten hands down by inflate_lut64
using a 2KB the lookup table in cache optimal situations.
The function inflate_lut32
, using a much smaller lookup table (64 bytes instead of 2KB) is not as fast as inflate_lut64
, but seems a good compromise for 32-bit architectures as it is still much faster than all other alternatives.
64-bit benchmark:
inflate: 0, 848.316ms
inflate_Curd: 0, 845.424ms
inflate_chqrlie: 0, 371.502ms
fast_inflate_njuffa: 0, 288.669ms
inflate_parallel1: 0, 242.827ms
inflate_parallel2: 0, 315.105ms
inflate_parallel3: 0, 363.379ms
inflate_parallel4: 0, 304.051ms
inflate_parallel5: 0, 301.205ms
inflate_phuclv: 0, 109.130ms
inflate_lut32: 0, 197.178ms
inflate_lut64: 0, 25.160ms
32-bit benchmark:
inflate: 0, 1451.464ms
inflate_Curd: 0, 955.509ms
inflate_chqrlie: 0, 385.036ms
fast_inflate_njuffa: 0, 463.212ms
inflate_parallel1: 0, 468.070ms
inflate_parallel2: 0, 570.107ms
inflate_parallel3: 0, 511.741ms
inflate_parallel4: 0, 601.892ms
inflate_parallel5: 0, 506.695ms
inflate_phuclv: 0, 192.431ms
inflate_lut32: 0, 140.968ms
inflate_lut64: 0, 28.776ms
Here is the code:
#include
#include
#include
static uint64_t inflate(unsigned char a) {
#define BIT_SET(var, pos) ((var) & (1 << (pos)))
uint64_t MASK = 0xFF;
uint64_t result = 0;
for (int i = 0; i < 8; i++) {
if (BIT_SET(a, i))
result |= (MASK << (8 * i));
}
return result;
}
static uint64_t inflate_Curd(unsigned char a) {
uint64_t mask = 0xFF;
uint64_t result = 0;
for (int i = 0; i < 8; i++) {
if (a & 1)
result |= mask;
mask <<= 8;
a >>= 1;
}
return result;
}
uint64_t inflate_chqrlie(uint8_t value) {
uint64_t x = value;
x = (x | (x << 28));
x = (x | (x << 14));
x = (x | (x << 7)) & 0x0101010101010101ULL;
x = (x << 8) - x;
return x;
}
uint64_t fast_inflate_njuffa(uint8_t a) {
const uint64_t spread7 = (1ULL << 42) | (1ULL << 35) | (1ULL << 28) | (1ULL << 21) |
(1ULL << 14) | (1ULL << 7) | (1UL << 0);
const uint64_t byte_lsb = (1ULL << 56) | (1ULL << 48) | (1ULL << 40) | (1ULL << 32) |
(1ULL << 24) | (1ULL << 16) | (1ULL << 8) | (1ULL << 0);
uint64_t r;
/* spread bits to lsbs of each byte */
r = (((uint64_t)(a & 0x7f) * spread7) + ((uint64_t)a << 49));
/* extract the lsbs of all bytes */
r = r & byte_lsb;
/* fill each byte with its lsb */
r = r * 0xff;
return r;
}
// Aki Suuihkonen: 1.265
static uint64_t inflate_parallel1(unsigned char a) {
uint64_t vector = a * 0x0101010101010101ULL;
// replicate the word all over qword
// A5 becomes A5 A5 A5 A5 A5 A5 A5 A5
vector &= 0x8040201008040201; // becomes 80 00 20 00 00 04 00 01 <--
vector += 0x00406070787c7e7f; // becomes 80 40 80 70 78 80 7e 80
// MSB is correct
vector = (vector >> 7) & 0x0101010101010101ULL; // LSB is correct
return vector * 255; // all bits correct
}
// By seizet and then combine: 1.583
static uint64_t inflate_parallel2(unsigned char a) {
uint64_t vector1 = a * 0x0002000800200080ULL;
uint64_t vector2 = a * 0x0000040010004001ULL;
uint64_t vector = (vector1 & 0x0100010001000100ULL) | (vector2 & 0x0001000100010001ULL);
return vector * 255;
}
// Stay in 32 bits as much as possible: 1.006
static uint64_t inflate_parallel3(unsigned char a) {
uint32_t vector1 = (( (a & 0x0F) * 0x00204081) & 0x01010101) * 255;
uint32_t vector2 = ((((a & 0xF0) >> 4) * 0x00204081) & 0x01010101) * 255;
return (((uint64_t)vector2) << 32) | vector1;
}
// Do the common computation in 64 bits: 0.915
static uint64_t inflate_parallel4(unsigned char a) {
uint32_t vector1 = (a & 0x0F) * 0x00204081;
uint32_t vector2 = ((a & 0xF0) >> 4) * 0x00204081;
uint64_t vector = (vector1 | (((uint64_t)vector2) << 32)) & 0x0101010101010101ULL;
return vector * 255;
}
// Some computation is done in 64 bits a little sooner: 0.806
static uint64_t inflate_parallel5(unsigned char a) {
uint32_t vector1 = (a & 0x0F) * 0x00204081;
uint64_t vector2 = (a & 0xF0) * 0x002040810000000ULL;
uint64_t vector = (vector1 | vector2) & 0x0101010101010101ULL;
return vector * 255;
}
static uint64_t inflate_phuclv(uint8_t b) {
uint64_t MAGIC = 0x8040201008040201ULL;
uint64_t MASK = 0x8080808080808080ULL;
return ((MAGIC * b) & MASK) >> 7;
}
static uint32_t const lut_4_32[16] = {
0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF,
0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF,
0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF,
};
static uint64_t inflate_lut32(uint8_t b) {
return lut_4_32[b & 15] | ((uint64_t)lut_4_32[b >> 4] << 32);
}
static uint64_t lut_8_64[256];
static uint64_t inflate_lut64(uint8_t b) {
return lut_8_64[b];
}
#define ITER 1000000
int main() {
clock_t t;
uint64_t x;
for (int b = 0; b < 256; b++)
lut_8_64[b] = inflate((uint8_t)b);
#define TEST(func) do { \
t = clock(); \
x = 0; \
for (int i = 0; i < ITER; i++) { \
for (int b = 0; b < 256; b++) \
x ^= func((uint8_t)b); \
} \
t = clock() - t; \
printf("%20s: %llu, %.3fms\n", \
#func, x, t * 1000.0 / CLOCKS_PER_SEC); \
} while (0)
TEST(inflate);
TEST(inflate_Curd);
TEST(inflate_chqrlie);
TEST(fast_inflate_njuffa);
TEST(inflate_parallel1);
TEST(inflate_parallel2);
TEST(inflate_parallel3);
TEST(inflate_parallel4);
TEST(inflate_parallel5);
TEST(inflate_phuclv);
TEST(inflate_lut32);
TEST(inflate_lut64);
return 0;
}