I have a pointer to an array of bytes mixed
that contains the interleaved bytes of two distinct arrays array1
and array2
. Say mi
Okay, here is your original method:
static void simpleDeint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
array1[i] = mixed[j];
array2[i] = mixed[j+1];
}
}
With 10 million entries and -O3
(compiler shall optimize for maximum speed), I can run this 154 times per second on my Mac.
Here is my first suggestion:
static void structDeint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
int i;
int len;
uint8_t * array1Ptr = (uint8_t *)array1;
uint8_t * array2Ptr = (uint8_t *)array2;
struct {
uint8_t byte1;
uint8_t byte2;
} * tb = (void *)mixed;
len = mixedLength / 2;
for (i = 0; i < len; i++) {
*(array1Ptr++) = tb->byte1;
*(array2Ptr++) = tb->byte2;
tb++;
}
}
Same count and optimization as before, I get 193 runs per second.
Now the suggestion from Graham Lee:
static void unionDeint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
union my_union {
uint16_t wide;
struct { uint8_t top; uint8_t bottom; } narrow;
};
uint16_t * source = (uint16_t *)mixed;
for (int i = 0; i < mixedLength/2; i++) {
union my_union cursor;
cursor.wide = source[i];
array1[i] = cursor.narrow.top;
array2[i] = cursor.narrow.bottom;
}
}
Same setup as before, 198 runs per second (NOTE: This method is not endian safe, result depends on CPU endianess. In your case array1 and array2 are probably swapped since ARM is little endian, so you would have to swap them in the code).
Here's my best one so far:
static void uint32Deint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
int i;
int count;
uint32_t * fourBytes = (void *)mixed;
uint8_t * array1Ptr = (uint8_t *)array1;
uint8_t * array2Ptr = (uint8_t *)array2;
count = mixedLength / 4;
for (i = 0; i < count; i++) {
uint32_t temp = *(fourBytes++);
#if __LITTLE_ENDIAN__
*(array1Ptr++) = (uint8_t)(temp & 0xFF);
temp >>= 8;
*(array2Ptr++) = (uint8_t)(temp & 0xFF);
temp >>= 8;
*(array1Ptr++) = (uint8_t)(temp & 0xFF);
temp >>= 8;
*(array2Ptr++) = tb->byte2;
#else
*(array1Ptr++) = (uint8_t)(temp >> 24);
*(array2Ptr++) = (uint8_t)((temp >> 16) & 0xFF);
*(array1Ptr++) = (uint8_t)((temp >> 8) & 0xFF);
*(array2Ptr++) = (uint8_t)(temp & 0xFF);
#endif
}
// Either it is a multiple of 4 or a multiple of 2.
// If it is a multiple of 2, 2 bytes are left over.
if (count * 4 != mixedLength) {
*(array1Ptr) = mixed[mixedLength - 2];
*(array2Ptr) = mixed[mixedLength - 1];
}
}
Same setup as above, 219 times a second and unless I made a mistake, should work with either endianess.