Embedded broadcasts with intrinsics and assembly

前端 未结 1 1551
醉梦人生
醉梦人生 2020-12-20 22:01

In section 2.5.3 "Broadcasts" of the Intel Architecture Instruction Set Extensions Programming Reference, we learn that AVX512 (and Knights Corner) has

相关标签:
1条回答
  • 2020-12-20 22:40

    As Peter Cordes notes, GCC doesn't let you specify a different template for different constraint alternatives. So instead my solution has the assembler choose the correct instruction according to the operands chosen.

    I don't have a version of GCC that supports the ZMM registers, so the following example uses XMM registers and a couple of nonexistent instructions to demonstrate how you can achieve what you're looking for.

    /* Four packed floats held in a 16-byte GCC vector type. */
    typedef __attribute__((vector_size(16))) float v4sf;
    
    /*
     * Demonstration of letting the assembler, rather than GCC, pick the
     * instruction form based on where operand %2 (b) was allocated.
     *
     * GCC offers two constraint alternatives for b: an SSE register ("x") or
     * memory ("m").  The asm template then tests at assembly time whether the
     * text substituted for %2 is one of %xmm0..%xmm7 and emits "alt-1" if so,
     * "alt-2" otherwise.  Both mnemonics are deliberately nonexistent, so the
     * assembler error messages reveal which branch was selected; this function
     * is not expected to assemble successfully.
     */
    v4sf
    foo(v4sf a, float b) {
        v4sf ret;
        /* Define the helper macro only once per assembly file: isxmm is
         * undefined until the first ".set isxmm" below has been seen. */
        asm(".ifndef isxmm\n\t"
            /* altmacro mode provides <...> string quoting and %expr evaluation
             * for macro arguments. */
            ".altmacro\n\t"
            /* ifxmm operand, rnum: set isxmm to 1 when `operand` is textually
             * equal to "%xmm<rnum>" ("%%" is GCC's escape for a literal '%'). */
            ".macro ifxmm operand, rnum\n\t"
            ".ifc \"\\operand\",\"%%xmm\\rnum\"\n\t"
            ".set isxmm, 1\n\t"
            ".endif\n\t"
            ".endm\n\t"
            ".endif\n\t"
            /* Reset the flag and the register counter for this expansion. */
            ".set isxmm, 0\n\t"
            ".set regnum, 0\n\t"
            /* Compare operand %2 against each register number 0..7 in turn;
             * <%2> keeps the operand text together as a single macro argument,
             * %regnum passes the counter's current value. */
            ".rept 8\n\t"
            "ifxmm <%2>, %%regnum\n\t"
            ".set regnum, regnum + 1\n\t"
            ".endr\n\t"
            /* Register form vs. non-register (presumably memory) form. */
            ".if isxmm\n\t"
            "alt-1 %1, %2, %0\n\t"
            ".else\n\t"
            "alt-2 %1, %2, %0\n\t"
            ".endif\n\t"
            /* Alternative 1: b in an SSE register; alternative 2: b in memory. */
            : "=x,x" (ret)
            : "x,x" (a), "x,m" (b));
        return ret;
    }
    
    
    /*
     * Second call site for foo(): passes `a` through unchanged and supplies the
     * first lane of vector `b` as the scalar argument.
     */
    v4sf
    bar(v4sf a, v4sf b) {
        float first_lane = b[0];

        return foo(a, first_lane);
    }
    

    This example should be compiled with gcc -m32 -msse -O3 and should generate two assembler error messages similar to the following:

    t103.c: Assembler messages:
    t103.c:24: Error: no such instruction: `alt-2 %xmm0,4(%esp),%xmm0'
    t103.c:22: Error: no such instruction: `alt-1 %xmm0,%xmm1,%xmm0'
    

    The basic idea here is the assembler checks to see whether the second operand (%2) is an XMM register or something else, presumably a memory location. Since the GNU assembler doesn't support much in the way of operations on strings, the second operand is compared to every possible XMM register one at a time in a .rept loop. The ifxmm macro is used to paste %xmm and a register number together, setting the isxmm flag when the result matches the operand.

    For your specific problem you'd probably need to rewrite it something like this:

    /*
     * Multiply every element of `a` by the scalar `b`, letting the assembler
     * choose between two instruction sequences depending on where GCC placed b:
     *   - b in a register: broadcast it across the register, then vmulps;
     *   - b in memory:     a single vmulps using the {1to16} embedded broadcast.
     *
     * The selection uses the same assemble-time trick as the XMM demonstration:
     * the text of operand [b] is compared against %zmm0..%zmm31.
     *
     * NOTE(review): the accompanying text says this was written without access
     * to a GCC that supports ZMM registers, so treat it as untested.  In
     * particular, confirm that %[b] (a float operand) is printed with a %zmm
     * name; if it is printed as %xmmN the .ifc comparison can never match.
     * NOTE(review): the "x" constraint may not cover zmm16..zmm31 -- confirm
     * which register class is intended.
     */
    __m512
    mul_broad(__m512 a, float b) {
        __m512 ret;
        /* Extra output that b is tied to in the register alternative (see the
         * constraints below); its value is never read by the C code. */
        __m512 dummy;
        /* Define the helper macro only once per assembly file: isxmm is
         * undefined until the first ".set isxmm" below has been seen. */
        asm(".ifndef isxmm\n\t"
            ".altmacro\n\t"
            /* ifxmm operand, rnum: set isxmm to 1 when `operand` is textually
             * equal to "%zmm<rnum>" ("%%" is GCC's escape for a literal '%'). */
            ".macro ifxmm operand, rnum\n\t"
            ".ifc \"\\operand\",\"%%zmm\\rnum\"\n\t"
            ".set isxmm, 1\n\t"
            ".endif\n\t"
            ".endm\n\t"
            ".endif\n\t"
            /* Reset the flag and the register counter for this expansion. */
            ".set isxmm, 0\n\t"
            ".set regnum, 0\n\t"
            /* Compare operand [b] against register numbers 0..31 in turn. */
            ".rept 32\n\t"
            "ifxmm <%[b]>, %%regnum\n\t"
            ".set regnum, regnum + 1\n\t"
            ".endr\n\t"
            ".if isxmm\n\t"
            /* Register case: broadcast the scalar across b's own register
             * (%x[b] presumably requests the XMM name of that register), then
             * do a plain vector multiply. */
            "vbroadcastss %x[b], %[b]\n\t"
            "vmulps %[a], %[b], %[ret]\n\t"
            ".else\n\t"
            /* Memory case: "%{" and "%}" emit literal braces, giving the
             * {1to16} embedded-broadcast memory operand. */
            "vmulps %[b] %{1to16%}, %[a], %[ret]\n\t"
            /* Emits an assembler comment that merely mentions [dummy]. */
            "# dummy = %[dummy]\n\t"
            ".endif\n\t"
            /* Alternative 1: b in memory, a in a register.
             * Alternative 2: b shares its location with the [dummy] output via
             * the matching constraint -- presumably so the asm is allowed to
             * overwrite b's register with the broadcast. */
            : [ret] "=x,x" (ret), [dummy] "=xm,x" (dummy)
            : [a] "x,xm" (a), [b] "m,[dummy]" (b));
        return ret;
    }
    
    0 讨论(0)
提交回复
热议问题