I need a fast in-memory transpose algorithm for my Gaussian convolution function in C/C++. What I currently do is:
convolute_1D
transpose
convolute_1D
transpose
Consider this 4x4 transpose.
// 4x4 matrix of single-precision floats. The anonymous union exposes the
// same 64 bytes of storage through three overlapping views, so the data can
// be addressed per element or per SIMD register without copying.
struct MATRIX {
union {
float f[4][4]; // scalar view: 16 floats addressed as f[i][j]
__m128 m[4]; // 128-bit view: m[i] overlays the four floats f[i][0..3]
__m256 n[2]; // 256-bit view: n[0] overlays f[0] and f[1], n[1] overlays f[2] and f[3]
};
};
// Transposes a 4x4 float matrix using the two 256-bit halves of MATRIX.
//
// Input  (in.n[0] / in.n[1]):       1  2  3  4  5  6  7  8
//                                   9 10 11 12 13 14 15 16
//
// Output (result n[0] / n[1]):      1  5  9 13  2  6 10 14
//                                   3  7 11 15  4  8 12 16
//
// The work is two identical rounds: interleave the two registers within each
// 128-bit lane (unpacklo/unpackhi), then regroup the 128-bit lanes
// (permute2f128 with 0x20 = both low lanes, 0x31 = both high lanes).
//
// Original author's note: compiles to about 15 instructions when not inlined
// and was measured faster than XMTranspose.
MATRIX myTranspose(MATRIX in) {
    const __m256 upperHalf = in.n[0];  // 1,  2,  3,  4,  5,  6,  7,  8
    const __m256 lowerHalf = in.n[1];  // 9, 10, 11, 12, 13, 14, 15, 16

    // Round 1: interleave per lane ...
    const __m256 mixLo = _mm256_unpacklo_ps(upperHalf, lowerHalf);  // 1,  9, 2, 10, 5, 13, 6, 14
    const __m256 mixHi = _mm256_unpackhi_ps(upperHalf, lowerHalf);  // 3, 11, 4, 12, 7, 15, 8, 16
    // ... then gather the low lanes together and the high lanes together.
    const __m256 pair02 = _mm256_permute2f128_ps(mixLo, mixHi, 0x20);  // 1,  9, 2, 10, 3, 11, 4, 12
    const __m256 pair13 = _mm256_permute2f128_ps(mixLo, mixHi, 0x31);  // 5, 13, 6, 14, 7, 15, 8, 16

    // Round 2: the same interleave/regroup applied to the intermediate pairs.
    const __m256 colsLo = _mm256_unpacklo_ps(pair02, pair13);  // 1, 5,  9, 13, 3, 7, 11, 15
    const __m256 colsHi = _mm256_unpackhi_ps(pair02, pair13);  // 2, 6, 10, 14, 4, 8, 12, 16

    MATRIX transposed;
    transposed.n[0] = _mm256_permute2f128_ps(colsLo, colsHi, 0x20);  // 1, 5,  9, 13, 2, 6, 10, 14
    transposed.n[1] = _mm256_permute2f128_ps(colsLo, colsHi, 0x31);  // 3, 7, 11, 15, 4, 8, 12, 16
    return transposed;
}