问题
I have 3 buffers containing R, G, B bit data running on a 32-bit processor.
I need to combine the three bytes in the following way:
R[0] = 0b r1r2r3r4r5r6r7r8
G[0] = 0b g1g2g3g4g5g6g7g8
B[0] = 0b b1b2b3b4b5b6b7b8
int32_t Out = 0b r1g1b1r2g2b2r3g3 b3r4g4b4r5g5b5r6 g6b6r7g7b7r8g8b8 xxxxxxxx
where xxxxxxxx is continuing on to each of the next bytes in the buffers.
I am looking for an optimal way to combine them. My approach is definitely not efficient.
Here is my approach
/*
 * Interleave one line of the R, G and B bit planes into TempLinebuff.
 *
 * For every byte position i of the line, the 8 bits of rDispbuff[line][i],
 * gDispbuff[line][i] and bDispbuff[line][i] are woven into a 24-bit value
 * r1g1b1 r2g2b2 ... r8g8b8 (r1 = most significant bit of the R byte), which
 * is then stored most-significant byte first in three consecutive elements
 * of TempLinebuff.
 *
 * line: index of the display line to convert.
 */
static void rgbcombineline(uint8_t line)
{
    uint32_t i;

    for (i = 0; i < (LCDpixelsCol / 8); i++)
    {
        const uint8_t rByte = rDispbuff[line][i];
        const uint8_t gByte = gDispbuff[line][i];
        const uint8_t bByte = bDispbuff[line][i];
        uint32_t ByteExp = 0;
        uint32_t bit;

        for (bit = 0; bit < 8; bit++)
        {
            /* One 3-bit pixel: R in bit 2, G in bit 1, B in bit 0. */
            const uint32_t rgbByte = (((uint32_t)(rByte >> bit) & 1u) << 2)
                                   | (((uint32_t)(gByte >> bit) & 1u) << 1)
                                   |  ((uint32_t)(bByte >> bit) & 1u);

            ByteExp |= rgbByte << (3 * bit);
        }

        /*
         * Emit bits 23..16, 15..8 and 7..0 in that order. Shifts are used
         * instead of reading ByteExp through a byte pointer so the output
         * does not depend on the byte order of the host CPU.
         * NOTE(review): the constant +2 offset presumably skips a header at
         * the start of TempLinebuff; confirm against the code that sends it.
         */
        TempLinebuff[((i * 3) + 0) + 2] = (uint8_t)(ByteExp >> 16);
        TempLinebuff[((i * 3) + 1) + 2] = (uint8_t)(ByteExp >> 8);
        TempLinebuff[((i * 3) + 2) + 2] = (uint8_t)ByteExp;
    }
}
回答1:
If you can spare 1024 bytes, you can achieve your desired result with a single 256-element lookup table:
/* 256-entry table that spreads the 8 bits of a byte three positions apart. */
uint32_t lookup[256] = {
0, 1, 8, 9, 64, 65, ...
/* map abcdefgh to a00b00c00d00e00f00g00h */
};
/* Shifting the R entry by 2 and the G entry by 1 drops each colour's bits into
   the two-bit gaps left by the table, producing r,g,b triplets. */
uint32_t result = (lookup[rByte] << 2) | (lookup[gByte] << 1) | lookup[bByte];
This uses only 3 lookups, 2 shifts and 2 OR operations, which should provide an acceptable speedup.
If you have more space, you can use three lookup tables to eliminate the shifts too (although this may result in worse cache performance, so always profile to check!)
回答2:
You can use a multiplication by a "magical" constant to replicate the bits. Then use bit-shifts to extract the needed bits, and bit-wise masking to combine them. The "magical" constant is a 17-bit binary 10000000100000001. When multiplied by it, any 8-bit number is concatenated to itself 3 times.
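r1r2r3r4r5r6r7r8 * M       = r1r2r3r4r5r6r7r8r1r2r3r4r5r6r7r8r1r2r3r4r5r6r7r8
r1r2r3r4r5r6r7r8 * M shr 2 = 0 0 r1r2r3r4r5r6r7r8r1r2r3r4r5r6r7r8r1r2r3r4r5r6
r1r2r3r4r5r6r7r8 * M shr 4 = 0 0 0 0 r1r2r3r4r5r6r7r8r1r2r3r4r5r6r7r8r1r2r3r4
r1r2r3r4r5r6r7r8 * M shr 6 = 0 0 0 0 0 0 r1r2r3r4r5r6r7r8r1r2r3r4r5r6r7r8r1r2
The bits that are at the right places (marked in bold in the original formatting) are, in each row, one bit from the first copy and one from the second copy: r1 and r5 in the first row, r2 and r6 in the second, r3 and r7 in the third, r4 and r8 in the fourth.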
If you use this masking code
/* M is the 17-bit constant 0b10000000100000001 (0x10101). Each term keeps the
   two bits that its shift has moved to their final positions. `*` and `>>`
   bind tighter than `&`, and `&` binds tighter than `|`, so the expression
   groups as (R*M & mask) | ((R*M >> 2) & mask) | ... without extra
   parentheses. */
R * M & 0b100000000000100000000000 |
(R * M >> 2) & 0b000100000000000100000000 |
(R * M >> 4) & 0b000000100000000000100000 |
(R * M >> 6) & 0b000000000100000000000100
you will get the "red" bits combined in the right way:
r1 0 0 r2 0 0 r3 0 0 r4 0 0 r5 0 0 r6 0 0 r7 0 0 r8 0 0
Then combine the "blue" and "green" bits in a similar way.
A rough estimation of the number of operations:
- Multiplications: 3
- Bit shifts: 9
- Bit-wise AND: 12
- Bit-wise OR: 11
回答3:
You can use a table of size 64 that contains bit-striped values for 6 bits, then fetch 2 bits each from r, g and b and use the table for a faster lookup. Using a lookup table of size 512 or 4096 can be more efficient.
/* Converts bits abcdefghijkl to aeibfjcgkdhl, i.e. interleaves the three
   4-bit groups abcd, efgh and ijkl (the index layout built below is
   r r r r g g g g b b b b and the wanted output is r g b r g b ...). */
static const uint32_t bitStripLookUp[4096] = {
/* Hard-coded values; can be generated with a script */
...
};
...
rByte = rDispbuff[line][i]; // rByte, gByte, bByte should be uint32_t
gByte = gDispbuff[line][i];
bByte = bDispbuff[line][i];
/* Build two 12-bit table indices: one from the high nibbles, one from the low nibbles. */
uMSB = ((rByte << 4) & 0x0F00) | (gByte & 0x00F0) | ((bByte >> 4) & 0x000F); // r7r6r5r4g7g6g5g4b7b6b5b4
uLSB = ((rByte << 8) & 0x0F00) | ((gByte << 4) & 0x00F0) | (bByte & 0x000F); // r3r2r1r0g3g2g1g0b3b2b1b0
/* Each lookup yields 12 interleaved bits; the high-nibble half goes above the low-nibble half. */
stuffed_value = (bitStripLookUp[uMSB] << 12) | bitStripLookUp[uLSB];
回答4:
Interleaving with bitwise operators
/*
 * Spread the low 8 bits of n so that input bit k ends up at output bit
 * 3*k + 2, with zeros in between:
 *
 *     abcdefgh  ->  a00b00c00d00e00f00g00h00   (24 bits)
 *
 * Bits of n above bit 7 are ignored.
 *
 * Declared static inline: a plain `inline` definition provides no external
 * definition in C99/C11 and can fail to link when the call is not inlined.
 */
static inline unsigned interleave(unsigned n)
{
    n &= 0xFFu;
    /* Move each 3-bit group into its own 9-bit slot: fgh stays at bits 0-2,
       cde goes to bits 9-11, ab goes to bits 18-19. */
    n = ((n << 12) | (n << 6) | n) & 0007007007;         // 000000111 000000111 000000111
    /* Inside every slot move the bits from positions 0,1,2 to 2,5,8. */
    n = ((n << 6) | (n << 4) | (n << 2)) & 0444444444;   // 100100100 100100100 100100100
    return n;
}
/* Spread each colour byte, then offset G by one and B by two bit positions so
   the three results can be OR-ed together (this relies on interleave()
   returning its bits three positions apart with two free bits below each). */
unsigned r = interleave(rByte);
unsigned g = interleave(gByte);
unsigned b = interleave(bByte);
unsigned rgb = r | (g >> 1) | (b >> 2);
/* Store the low 24 bits of rgb, most significant byte first. The assignments
   rely on truncation to the element type of TempLinebuff (presumably 8 bits;
   its declaration is not shown here). */
TempLinebuff[((i*3)+0) +2] = rgb >> 16;
TempLinebuff[((i*3)+1) +2] = rgb >> 8;
TempLinebuff[((i*3)+2) +2] = rgb;
Lookup table solution
/* Move bit n of x to bit position 3*n, leaving two free bits above it. */
#define EXPANDBIT(x, n) ((((uint32_t)(x) >> (n)) & 1u) << (3 * (n)))
/* Spread all 8 bits of a: abcdefgh -> a00b00c00d00e00f00g00h. */
#define EXPAND8BIT(a) (EXPANDBIT(a, 0) | EXPANDBIT(a, 1) | EXPANDBIT(a, 2) | EXPANDBIT(a, 3) | \
EXPANDBIT(a, 4) | EXPANDBIT(a, 5) | EXPANDBIT(a, 6) | EXPANDBIT(a, 7))
/* Sixteen consecutive table entries, for byte values 16*A .. 16*A + 15. */
#define EXPAND16(A) EXPAND8BIT(16*(A)+ 0), EXPAND8BIT(16*(A)+ 1), EXPAND8BIT(16*(A)+ 2), EXPAND8BIT(16*(A)+ 3), \
EXPAND8BIT(16*(A)+ 4), EXPAND8BIT(16*(A)+ 5), EXPAND8BIT(16*(A)+ 6), EXPAND8BIT(16*(A)+ 7), \
EXPAND8BIT(16*(A)+ 8), EXPAND8BIT(16*(A)+ 9), EXPAND8BIT(16*(A)+10), EXPAND8BIT(16*(A)+11), \
EXPAND8BIT(16*(A)+12), EXPAND8BIT(16*(A)+13), EXPAND8BIT(16*(A)+14), EXPAND8BIT(16*(A)+15)
/* LUT[v] is v with its bits spread three positions apart (bit n -> bit 3n),
   computed entirely at compile time. */
const uint32_t LUT[256] = {
EXPAND16( 0), EXPAND16( 1), EXPAND16( 2), EXPAND16( 3),
EXPAND16( 4), EXPAND16( 5), EXPAND16( 6), EXPAND16( 7),
EXPAND16( 8), EXPAND16( 9), EXPAND16(10), EXPAND16(11),
EXPAND16(12), EXPAND16(13), EXPAND16(14), EXPAND16(15)
};
/* The requested layout is r1g1b1 r2g2b2 ..., so R must occupy the most
   significant bit of every triplet: R gets the largest shift, B none.
   (Assumes LUT[v] places bit n of v at bit 3n.) */
output = LUT[rByte] << 2 | LUT[gByte] << 1 | LUT[bByte];
The size of the lookup table may be increased if necessary.
On x86 with BMI2 there's hardware support via the PDEP instruction, which can be accessed through the intrinsic _pdep_u32. The solution is now much simpler:
/* The octal masks select every third bit of the upper 24 bits of the word:
     044444444U << 8  -> bits 31, 28, ..., 10  (R)
     022222222U << 8  -> bits 30, 27, ...,  9  (G)
     011111111U << 8  -> bits 29, 26, ...,  8  (B)
   The low 8 bits of output stay zero. */
output = _pdep_u32(rByte, 044444444U << 8)
| _pdep_u32(gByte, 022222222U << 8)
| _pdep_u32(bByte, 011111111U << 8);
Another way is interleaving using multiplication and masking with this packing technique. This is for architectures without a hardware bit-deposit instruction but with fast multipliers.
/* Reverse the byte order of a 64-bit value, independent of host endianness. */
static uint64_t swap_bytes64(uint64_t v)
{
    v = ((v & 0x00FF00FF00FF00FFULL) << 8)  | ((v >> 8)  & 0x00FF00FF00FF00FFULL);
    v = ((v & 0x0000FFFF0000FFFFULL) << 16) | ((v >> 16) & 0x0000FFFF0000FFFFULL);
    return (v << 32) | (v >> 32);
}

/*
 * Spread the 8 bits of b three positions apart in the upper 24 bits of the
 * result: abcdefgh -> a00b00c00d00e00f00g00h00 00000000 (a lands on bit 31,
 * h on bit 10).
 */
uint32_t expand8bits(uint8_t b)
{
    const uint64_t MAGIC = 0x8040201008040201ULL;
    const uint64_t MASK = 0x8080808080808080ULL;
    /* (MAGIC*b) & MASK puts bit 7-k of b in the top bit of byte k, i.e. the
       bits come out in reverse byte order. An unconditional byte swap fixes
       that; htobe64() would be a no-op on big-endian hosts and is not
       standard C. */
    const uint64_t expanded8bits = swap_bytes64((MAGIC * b) & MASK);
    /* Pull the bits together: a..f end up at bits 63,60,...,48; g,h at 15,12. */
    const uint64_t result = (expanded8bits * 0x2108421ULL) & 0x9249000000009000ULL;
    /* Bring g and h up to bits 45 and 42, keep only bits 63,60,...,42 (the
       mask must sit in the high half because it is applied before the final
       shift), then return the high 32 bits. */
    return (uint32_t)(((result | (result << 30)) & (044444444ULL << 40)) >> 32);
}
/*
 * Combine three colour bytes into one word by OR-ing their expand8bits()
 * results, with the G result shifted right by one and the B result shifted
 * right by two bit positions.
 */
uint32_t stripeBits(uint8_t rByte, uint8_t gByte, uint8_t bByte)
{
    const uint32_t red = expand8bits(rByte);
    const uint32_t green = expand8bits(gByte) >> 1;
    const uint32_t blue = expand8bits(bByte) >> 2;

    return red | green | blue;
}
The way it works is like this
- The first step expands the input bits from abcdefgh to a0000000 b0000000 c0000000 d0000000 e0000000 f0000000 g0000000 h0000000 and stores the result in expanded8bits
- Then we move those spaced-out bits close together by multiplying and masking in the next step. After that, result contains a00b00c00d00e00f00000000000000000000000000000000g00h000000000000 and is ready to be merged into a single value
The magic number for bringing the bits closer is calculated like this
  a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
×                                       10000100001000010000100001 (0x2108421)
  ────────────────────────────────────────────────────────────────
  a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
  000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
+ 000000c0000000d0000000e0000000f0000000g0000000h0000000
  0c0000000d0000000e0000000f0000000g0000000h0000000
  0000d0000000e0000000f0000000g0000000h0000000
  0000000e0000000f0000000g0000000h0000000
  ────────────────────────────────────────────────────────────────
  ac0bd0cebd0ce0dfce0df0egdf0eg0fheg0fh0g0fh0g00h0g00h0000h0000000
& 1001001001001001000000000000000000000000000000001001000000000000 (0x9249000000009000)
  ────────────────────────────────────────────────────────────────
  a00b00c00d00e00f00000000000000000000000000000000g00h000000000000
(Each partial product is the first row shifted left by a further 5 bits and truncated to 64 bits; the trailing zeros of each shifted row are omitted.)
Alternatively, expand8bits can be implemented using only 32-bit magic-number multiplications like this, which may be simpler:
/*
 * Spread the 8 bits of b three positions apart in the upper 24 bits of the
 * result (bit 7 of b -> bit 31, bit 0 of b -> bit 10), using two 32-bit
 * multiplications. Numbering the input bits 1..8 from the most significant
 * end, bits 1,4,5,8 and bits 2,3,6,7 are handled separately so that no two
 * shifted copies overlap inside a multiplication.
 */
uint32_t expand8bits(uint8_t b)
{
    const uint8_t  select_1458 = 0x99;         /* 10011001 */
    const uint32_t copies_1458 = 0x01050400U;  /* shifts by 24, 18, 16, 10 */
    const uint32_t copies_2367 = 0x00505000U;  /* shifts by 22, 20, 14, 12 */
    const uint32_t keep_1458   = 0x80480400U;  /* bits 31, 22, 19, 10 */
    const uint32_t keep_2367   = 0x12012000U;  /* bits 28, 25, 16, 13 */

    const uint32_t part_1458 = ((b & select_1458) * copies_1458) & keep_1458;
    const uint32_t part_2367 = ((b & ~select_1458) * copies_2367) & keep_2367;

    return part_1458 | part_2367;
}
Here we split the 8-bit number to two 4-bit parts, one with bits 1, 4, 5, 8 and the remaining with bits 2, 3, 6, 7. The magic numbers are like this
                          a00de00h                              0bc00fg0
× 00000001000001010000010000000000    × 00000000010100000101000000000000
  ────────────────────────────────      ────────────────────────────────
  a00de00h                                0bc00fg0
+       a00de00h                      +     0bc00fg0
          a00de00h                                0bc00fg0
                a00de00h                            0bc00fg0
  ────────────────────────────────      ────────────────────────────────
  a00de0ahadedehah0de00h0000000000      000bcbcfgfgbcbcfgfg0000000000000
& 10000000010010000000010000000000    & 00010010000000010010000000000000
  ────────────────────────────────      ────────────────────────────────
  a00000000d00e00000000h0000000000      000b00c00000000f00g0000000000000
See
- What's a fast way to space-out bits within a word?
- How to create a byte out of 8 bool values (and vice versa)?
- Portable efficient alternative to PDEP without using BMI2?
来源:https://stackoverflow.com/questions/36256537/how-to-do-bit-striping-on-pixel-data