I would like to make some vector computations faster, and I believe that SIMD instructions for float comparison and manipulation could help. Here is the operation:
You can use GCC's and Clang's vector extensions to implement a ternary select function (see https://stackoverflow.com/a/48538557/2542702).
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Four-lane vector types built on the GCC/Clang vector extensions.
 * Clang gets the OpenCL-style ext_vector_type; every other compiler that
 * understands __attribute__((vector_size)) gets the GCC-style type.
 * Both are 32 bytes wide and support lane subscripting, arithmetic,
 * comparisons and same-size bit casts.
 */
#if defined(__clang__)
typedef double double4 __attribute__ ((ext_vector_type(4)));
typedef int64_t long4 __attribute__ ((ext_vector_type(4)));
#else
typedef double double4 __attribute__ ((vector_size (sizeof(double)*4)));
typedef int64_t long4 __attribute__ ((vector_size (sizeof(int64_t)*4)));
#endif

/*
 * Lane-wise ternary select: for each lane i, returns a[i] when s[i] is
 * non-zero and b[i] otherwise.
 *
 * Implemented as a bitwise blend rather than `s ? a : b`, because the
 * vector form of ?: is only accepted by GCC when compiling C++, not C.
 *
 * NOTE(review): the name collides with POSIX select() if <sys/select.h>
 * is visible in the same translation unit.
 */
double4 select(long4 s, double4 a, double4 b) {
    const long4 zero = {0, 0, 0, 0};
    /* Normalise the mask: a vector comparison yields all-ones (-1) for
     * true and 0 for false, so any non-zero lane of s selects a. */
    long4 mask = (long4)(s != zero);
    /* Same-size vector casts reinterpret the bits; no value conversion. */
    long4 bits = ((long4)a & mask) | ((long4)b & ~mask);
    return (double4)bits;
}

/*
 * For every i in [0, size):
 *     res[i] = (right[i] >= th) ? left[i] : left[i] - drop
 *
 * left, right : input arrays of at least `size` doubles
 * res         : output array of at least `size` doubles; may be the same
 *               array as left or right (exact in-place use)
 * size        : number of elements to process
 * th          : threshold compared against right[i]
 * drop        : amount subtracted from left[i] when right[i] < th
 *               (also applied when the comparison is false due to NaN)
 *
 * Elements are processed four at a time; the remaining 0-3 elements are
 * handled by a scalar tail loop. No alignment is required of the arrays.
 */
void func(double* left, double* right, double* res, size_t size, double th, double drop) {
    const double4 thv = {th, th, th, th};
    const double4 dropv = {drop, drop, drop, drop};
    const size_t vec_end = size & ~(size_t)3;   /* largest multiple of 4 <= size */
    size_t i;

    for (i = 0; i < vec_end; i += 4) {
        double4 leftv, rightv, outv;

        /* memcpy instead of *(double4*)ptr: a double* is only guaranteed
         * 8-byte alignment, while dereferencing a double4* assumes 32. */
        memcpy(&leftv, &left[i], sizeof leftv);
        memcpy(&rightv, &right[i], sizeof rightv);

        outv = select((long4)(rightv >= thv), leftv, leftv - dropv);

        memcpy(&res[i], &outv, sizeof outv);
    }

    /* Scalar tail for the last size % 4 elements. */
    for (; i < size; i++) {
        res[i] = right[i] >= th ? left[i] : (left[i] - drop);
    }
}