问题
After some thought, I came up with the following code for multiplying two quaternions using SSE:
#include <pmmintrin.h> /* SSE3 intrinsics */
/* multiplication of two quaternions (x, y, z, w) x (a, b, c, d) */
__m128 _mm_cross4_ps(__m128 xyzw, __m128 abcd)
{
/* The product of two quaternions is: */
/* (X,Y,Z,W) = (xd+yc-zb+wa, -xc+yd+za+wb, xb-ya+zd+wc, -xa-yb-zc+wd) */
__m128 wzyx = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(0,1,2,3));
__m128 baba = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(0,1,0,1));
__m128 dcdc = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(2,3,2,3));
/* variable names below are for parts of componens of result (X,Y,Z,W) */
/* nX stands for -X and similarly for the other components */
/* znxwy = (xb - ya, zb - wa, wd - zc, yd - xc) */
__m128 ZnXWY = _mm_hsub_ps(_mm_mul_ps(xyzw, baba), _mm_mul_ps(wzyx, dcdc));
/* xzynw = (xd + yc, zd + wc, wb + za, yb + xa) */
__m128 XZYnW = _mm_hadd_ps(_mm_mul_ps(xyzw, dcdc), _mm_mul_ps(wzyx, baba));
/* _mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)) */
/* = (xd + yc, zd + wc, wd - zc, yd - xc) */
/* _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)) */
/* = (zb - wa, xb - ya, yb + xa, wb + za) */
/* _mm_addsub_ps adds elements 1 and 3 and subtracts elements 0 and 2, so we get: */
/* _mm_addsub_ps(*, *) = (xd+yc-zb+wa, xb-ya+zd+wc, wd-zc+yb+xa, yd-xc+wb+za) */
__m128 XZWY = _mm_addsub_ps(_mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)));
/* now we only need to shuffle the components in place and return the result */
return _mm_shuffle_ps(XZWY, XZWY, _MM_SHUFFLE(2,1,3,0));
/* operations: 6 shuffles, 4 multiplications, 3 compound additions/subtractions */
}
I expected the assembly to have a minimal amount of instructions. However, when I compile it to assembly with gcc -msse3 -S, the resulting function has 67 instructions:
.text
.globl __Z13_mm_cross4_psU8__vectorfS_
__Z13_mm_cross4_psU8__vectorfS_:
LFB594:
pushq %rbp
LCFI0:
movq %rsp, %rbp
LCFI1:
subq $232, %rsp
movaps %xmm0, -336(%rbp)
movaps %xmm1, -352(%rbp)
movaps -336(%rbp), %xmm0
movaps -336(%rbp), %xmm1
shufps $27, %xmm1, %xmm0
movaps %xmm0, -16(%rbp)
movaps -352(%rbp), %xmm0
movaps -352(%rbp), %xmm1
shufps $17, %xmm1, %xmm0
movaps %xmm0, -32(%rbp)
movaps -352(%rbp), %xmm0
movaps -352(%rbp), %xmm1
shufps $187, %xmm1, %xmm0
movaps %xmm0, -48(%rbp)
movaps -16(%rbp), %xmm0
movaps %xmm0, -112(%rbp)
movaps -48(%rbp), %xmm0
movaps %xmm0, -128(%rbp)
movaps -128(%rbp), %xmm0
movaps -112(%rbp), %xmm1
mulps %xmm1, %xmm0
movaps -336(%rbp), %xmm1
movaps %xmm1, -144(%rbp)
movaps -32(%rbp), %xmm1
movaps %xmm1, -160(%rbp)
movaps -160(%rbp), %xmm1
movaps -144(%rbp), %xmm2
mulps %xmm2, %xmm1
movaps %xmm1, -176(%rbp)
movaps %xmm0, -192(%rbp)
movaps -176(%rbp), %xmm0
hsubps -192(%rbp), %xmm0
movaps %xmm0, -64(%rbp)
movaps -16(%rbp), %xmm0
movaps %xmm0, -208(%rbp)
movaps -32(%rbp), %xmm0
movaps %xmm0, -224(%rbp)
movaps -224(%rbp), %xmm0
movaps -208(%rbp), %xmm1
mulps %xmm1, %xmm0
movaps -336(%rbp), %xmm1
movaps %xmm1, -240(%rbp)
movaps -48(%rbp), %xmm1
movaps %xmm1, -256(%rbp)
movaps -256(%rbp), %xmm1
movaps -240(%rbp), %xmm2
mulps %xmm2, %xmm1
movaps %xmm1, -272(%rbp)
movaps %xmm0, -288(%rbp)
movaps -272(%rbp), %xmm0
haddps -288(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -64(%rbp), %xmm0
movaps -80(%rbp), %xmm1
shufps $177, %xmm1, %xmm0
movaps -80(%rbp), %xmm1
movaps -64(%rbp), %xmm2
shufps $228, %xmm2, %xmm1
movaps %xmm1, -304(%rbp)
movaps %xmm0, -320(%rbp)
movaps -304(%rbp), %xmm0
addsubps -320(%rbp), %xmm0
movaps %xmm0, -96(%rbp)
movaps -96(%rbp), %xmm0
movaps -96(%rbp), %xmm1
shufps $156, %xmm1, %xmm0
leave
LCFI2:
ret
What am I doing wrong? There must a better way to shuffle the elements around without using so many movaps
instructions.
回答1:
Never mind. If I compile the code with gcc -msse3 -O1 -S instead, I get the following:
.text
.align 4,0x90
.globl __Z13_mm_cross4_psU8__vectorfS_
__Z13_mm_cross4_psU8__vectorfS_:
LFB644:
movaps %xmm0, %xmm5
movaps %xmm1, %xmm3
movaps %xmm0, %xmm2
shufps $27, %xmm0, %xmm5
movaps %xmm5, %xmm4
shufps $17, %xmm1, %xmm3
shufps $187, %xmm1, %xmm1
mulps %xmm3, %xmm2
mulps %xmm1, %xmm4
mulps %xmm5, %xmm3
mulps %xmm1, %xmm0
hsubps %xmm4, %xmm2
haddps %xmm3, %xmm0
movaps %xmm2, %xmm1
shufps $177, %xmm0, %xmm1
shufps $228, %xmm2, %xmm0
addsubps %xmm1, %xmm0
shufps $156, %xmm0, %xmm0
ret
That's only 18 instructions now. That's what I expected in the beginning. Oops.
回答2:
You may be interested in the Agner Fog's C++ vector class library. It provides a Quaternion4f
and Quaternion4d
classes (including *
and *=
operators, of course), implemented by using SSE2 and AVX instruction sets respectively. The library is an Open Source project, so you may dig into the code and find a good implementation example to build your function on.
Later on, you may consult the "optimizing subroutines in assembly language" manual and provide an optimized, pure assembly implementation of the function or, while being aware of some low-level tricks, try to redesign the intrinsics approach in C.
来源:https://stackoverflow.com/questions/18542894/how-to-multiply-two-quaternions-with-minimal-instructions