How to simulate pcmpgtq on sse2?

问题

PCMPGTQ was introduced in sse4.2, and it provides a greater than signed comparison for 64 bit numbers that yields a mask.

How does one support this functionality on instructions sets predating sse4.2?

Update: This same question applies to ARMv7 with Neon which also lacks a 64-bit comparator. The sister question to this is found here: What is the most efficient way to support CMGT with 64bit signed comparisons on ARMv7a with Neon?

回答1:

__m128i pcmpgtq_sse2 (__m128i a, __m128i b) {
    __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a));
    r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b));
    return _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1));
}

We have 32-bit signed comparison intrinsics so split the packed qwords into dwords pairs.

If the high dword in a is greater than the high dword in b then there is no need to compare the low dwords.

if (a.hi > b.hi) { r.hi = 0xFFFFFFFF; }
if (a.hi <= b.hi) { r.hi = 0x00000000; }

If the high dword in a is equal to the high dword in b then a 64-bit subtract will either clear or set all 32 high bits of the result (if the high dwords are equal then they "cancel" each other out, effectively a unsigned compare of the low dwords, placing the result in the high dwords).

if (a.hi == b.hi) { r = (b - a) & 0xFFFFFFFF00000000; }

Copy the comparison mask in the high 32-bits to the low 32-bits.

r.lo = r.hi

Updated: Here's the Godbolt for SSE2 and ARMv7+Neon.

回答2:

I'm not sure if this is the most optimal output, but this is the output for x64 from Clang. I've also taken the same implementation and converted it to support armv7 with neon.

See the Godbolt and the assembly below:

.LCPI0_0:
        .quad   2147483648                      # 0x80000000
        .quad   2147483648                      # 0x80000000
cmpgtq_sim(long __vector(2), long __vector(2)):                  # @cmpgtq_sim(long __vector(2), long __vector(2))
        movdqa  xmm2, xmmword ptr [rip + .LCPI0_0] # xmm2 = [2147483648,2147483648]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm2
        movdqa  xmm2, xmm0
        pcmpgtd xmm2, xmm1
        pshufd  xmm3, xmm2, 160                 # xmm3 = xmm2[0,0,2,2]
        pcmpeqd xmm0, xmm1
        pshufd  xmm1, xmm0, 245                 # xmm1 = xmm0[1,1,3,3]
        pand    xmm1, xmm3
        pshufd  xmm0, xmm2, 245                 # xmm0 = xmm2[1,1,3,3]
        por     xmm0, xmm1

ARMv7+Neon

cmpgtq_sim(__simd128_int64_t, __simd128_int64_t):
        vldr    d16, .L3
        vldr    d17, .L3+8
        veor    q0, q0, q8
        veor    q1, q1, q8
        vcgt.s32        q8, q0, q1
        vceq.i32        q1, q0, q1
        vmov    q9, q8  @ v4si
        vmov    q10, q1  @ v4si
        vtrn.32 q9, q8
        vmov    q0, q8  @ v4si
        vmov    q8, q1  @ v4si
        vtrn.32 q10, q8
        vand    q8, q8, q9
        vorr    q0, q8, q0
        bx      lr
.L3:
        .word   -2147483648
        .word   0
        .word   -2147483648
        .word   0

回答3:

The code below demonstrates that the desired functionality can be emulated using pre-existing SSE instructions by working from first principles: signed "less than" predicate being true is equivalent to the overflow flag differing from the sign flag after subtraction. Obviously we do not have flags, so the flags need to be emulated as well, leaving the resulting predicate in the most significant bit (MSB) of each quadword. In a second step we can then expand the MSB into a quadword-sized mask.

The resulting fairly lengthy code is a pretty good indication that this is probably not the best way of emulating the desired functionality from a performance perspective, even though it is functionally correct. Compiler Explorer (godbolt.org) shows twelve instructions when compiling with Clang 11 (please refer to the assembly listing below).

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "nmmintrin.h"

#define  USE_SSE42_REF (0)
#define  NBR_OF_TESTS  (1000000000)

#if USE_SSE42_REF
__m128i pcmpgtq_ref (__m128i a, __m128i b)
{
    return _mm_cmpgt_epi64 (a, b);
}
#else // USE_SSE42_REF
__m128i pcmpgtq_ref (__m128i a, __m128i b)
{
    __m128i r;
    struct {
        int64_t lo;
        int64_t hi;
    } hilo_a, hilo_b, hilo_r;
    memcpy (&hilo_a, &a, sizeof hilo_a);
    memcpy (&hilo_b, &b, sizeof hilo_b);
    hilo_r.lo = hilo_a.lo > hilo_b.lo ? (-1LL) : 0LL;
    hilo_r.hi = hilo_a.hi > hilo_b.hi ? (-1LL) : 0LL;
    memcpy (&r, &hilo_r, sizeof r);
    return r;
}
#endif // USE_SSE42_REF

/* "signed less than" == (OF != SF); compute predicate in MSB of each byte */
__m128i ltq_core (__m128i a, __m128i b)
{
    __m128i m = _mm_set1_epi64x (0x7fffffffffffffffULL);
    __m128i c = _mm_and_si128 (b, m);
    __m128i d = _mm_andnot_si128 (a, m);
    __m128i t = _mm_add_epi64 (c, d);
    __m128i s = _mm_xor_si128 (a, b);
    __m128i x = _mm_xor_si128 (a, t);
    __m128i y = _mm_and_si128 (x, s);
    __m128i r = _mm_xor_si128 (y, t);
    return r;
}

/* extend sign bits into mask, quadword-wise */
__m128i q_sign_to_mask (__m128i a)
{
    __m128i q = _mm_set1_epi64x (0);
    __m128i s = _mm_srli_epi64 (a, 63);
    __m128i r = _mm_sub_epi64 (q, s);
    return r;
}

__m128i pcmpltq (__m128i a, __m128i b)
{
    return q_sign_to_mask (ltq_core (a, b));
}

__m128i pcmpgtq (__m128i a, __m128i b)
{
    return pcmpltq (b, a);
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/

static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;

#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void)
{
    struct {
        uint64_t lo;
        uint64_t hi;
    } hilo_a, hilo_b, hilo_res, hilo_ref;

    for (int i = 0; i < NBR_OF_TESTS; i++) {
        uint64_t al = KISS64;
        uint64_t ah = KISS64;
        uint64_t bl = KISS64;
        uint64_t bh = KISS64;

        if ((i & 0xff) == 0x00) bl = al; // increase chance of equality
        if ((i & 0xff) == 0xff) bh = ah; // increase chance of equality

        __m128i a = _mm_set_epi64x (ah, al);
        __m128i b = _mm_set_epi64x (bh, bl);

        __m128i res = pcmpgtq (a, b);
        __m128i ref = pcmpgtq_ref (a, b);
        
        memcpy (&hilo_res, &res, sizeof hilo_res);
        memcpy (&hilo_ref, &ref, sizeof hilo_ref);

        if ((hilo_res.hi != hilo_ref.hi) || (hilo_res.lo != hilo_ref.lo)) {
            memcpy (&hilo_a, &a, sizeof hilo_a);
            memcpy (&hilo_b, &b, sizeof hilo_b); 
            printf ("error: a=%016llx_%016llx  b=%016llx_%016llx  res=%016llx_%016llx  ref=%016llx_%016llx\n",
                    hilo_a.hi, hilo_a.lo, hilo_b.hi, hilo_b.lo,
                    hilo_res.hi, hilo_res.lo, hilo_ref.hi, hilo_ref.lo);
            return EXIT_FAILURE;
        }
    }
    return EXIT_SUCCESS;
}

When compiling with -msse2, the output from Clang 11 looks as follows:

.LCPI4_0:
        .quad   9223372036854775807             # 0x7fffffffffffffff
        .quad   9223372036854775807             # 0x7fffffffffffffff
pcmpgtq:                                # @pcmpgtq
        movdqa  xmm2, xmmword ptr [rip + .LCPI4_0] # xmm2 = [9223372036854775807,9223372036854775807]
        movdqa  xmm3, xmm1
        pxor    xmm3, xmm0
        pand    xmm0, xmm2
        movdqa  xmm4, xmm1
        pandn   xmm4, xmm2
        paddq   xmm4, xmm0
        pand    xmm1, xmm3
        pandn   xmm3, xmm4
        por     xmm3, xmm1
        psrad   xmm3, 31
        pshufd  xmm0, xmm3, 245                 # xmm0 = xmm3[1,1,3,3]
        ret

来源：https://stackoverflow.com/questions/65166174/how-to-simulate-pcmpgtq-on-sse2

标签

assembly

sse

simd

sse2

sse4