I would like to know if there's a way to implement atomic operations (particularly atomic_add) on the double type.
For floats this code works, but atomic_xchg doesn't support double.
I was looking for the same thing in the past and found this: https://github.com/ddemidov/vexcl-experiments/blob/master/sort-by-key-atomic.cpp. In the end I figured out a different approach to my problem, so I did not use it. Here is the code:
// OpenCL C source written as adjacent string literals, one literal per source line.
// The two pragmas enable double-precision support and the 64-bit integer atomics
// extension that provides the atom_cmpxchg call used below.
"#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n"
// AtomicAdd: adds `delta` to the double stored at `val` (global memory) by
// retrying a 64-bit compare-and-swap until it succeeds.
"void AtomicAdd(__global double *val, double delta) {\n"
// `old` and `new` each overlay a double with a ulong so the double's bit
// pattern can be handed to the integer atom_cmpxchg.
" union {\n"
" double f;\n"
" ulong i;\n"
" } old;\n"
" union {\n"
" double f;\n"
" ulong i;\n"
" } new;\n"
" do {\n"
// Snapshot the current value and compute the sum we want to publish.
" old.f = *val;\n"
" new.f = old.f + delta;\n"
// Publish only if *val still holds the snapshot; the loop repeats whenever the
// value returned by atom_cmpxchg differs from old.i.
" } while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);\n"
"}\n"
// Kernel atomic_reduce: for every index idx < n handled by this work-item,
// accumulates val[idx] into sum[key[idx]] through AtomicAdd.
"kernel void atomic_reduce(\n"
" ulong n,\n"
" global const int * key,\n"
" global const double * val,\n"
" global double * sum\n"
")\n"
"{\n"
// Each work-item starts at its global id and advances by the global size
// until idx reaches n.
" for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0))\n"
" AtomicAdd(sum + key[idx], val[idx]);\n"
"}\n",
// NOTE(review): presumably the kernel entry-point name passed to the host-side
// call that consumes the source string above — the enclosing call is not visible here.
"atomic_reduce"
Both the approach in the original post and the one in doqtor's answer work well. Basically, there are two ways to implement atomic operations on doubles: using unions or using the OpenCL as_type functions. OpenCL 1.0 code snippets are presented at the end of this answer (for OpenCL 2.x they can be shortened, but NVIDIA does not support it yet). As for performance, in my own experience with AMD's OpenCL implementation on Tahiti chips, all these variants produce more or less the same execution time (the as_ and union variants even produce the same optimized ISA code on most tested compilers). So choosing one variant or another is a matter of personal taste.
// Precision switch for the atomic-add helpers.
//
// Build with REALDOUBLES defined to a non-zero value (e.g. -DREALDOUBLES=1) for
// double precision; leave it undefined (or 0) for single precision. A bare
// `#define REALDOUBLES` with no value makes the `#if` below ill-formed.
//
// For the selected precision this block defines a consistent set of names:
//   REAL         - the floating-point type (only if the build has not already defined it)
//   UINTVAR      - the unsigned integer type paired with REAL for the atomics
//   AS_INT       - reinterpret a REAL's bits as UINTVAR
//   AS_REAL      - reinterpret a UINTVAR's bits as REAL
//   ATOM_CMPXCHG - compare-and-exchange on UINTVAR
//   ATOM_XCHG    - exchange on UINTVAR
#if REALDOUBLES
// extensions needed
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#ifdef cl_khr_int64_base_atomics
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
// definitions
#ifndef REAL
#define REAL double
#endif
#define UINTVAR ulong
#define AS_INT as_ulong
#define AS_REAL as_double
#define ATOM_CMPXCHG atom_cmpxchg
#define ATOM_XCHG atom_xchg
#else
// extensions needed
#ifdef cl_khr_local_int32_base_atomics
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#endif
#ifdef cl_khr_global_int32_base_atomics
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
// definitions
#ifndef REAL
#define REAL float
#endif
#define UINTVAR uint
#define AS_INT as_uint
#define AS_REAL as_float
#define ATOM_CMPXCHG atomic_cmpxchg
#define ATOM_XCHG atomic_xchg
#endif
// as_ variants
// variant from GROMACS - https://streamhpc.com/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/
// Adds `operand` to the REAL at `source` (local memory) with a
// compare-and-exchange retry loop. The floating-point value is handled as its
// UINTVAR bit pattern (AS_INT / AS_REAL) so the integer ATOM_CMPXCHG can be used.
inline void atomic_add_local(volatile local REAL * const source, const REAL operand) {
    volatile local UINTVAR * const slot = (volatile local UINTVAR *)source;
    // Bits most recently observed at the target location.
    UINTVAR seen = AS_INT(*source);
    for (;;) {
        const UINTVAR assumed = seen;
        const UINTVAR proposed = AS_INT(AS_REAL(assumed) + operand);
        // ATOM_CMPXCHG hands back what it found; that becomes the next guess.
        seen = ATOM_CMPXCHG(slot, assumed, proposed);
        if (seen == assumed) {
            // The location still held our guess, so the sum was stored.
            break;
        }
    }
}
// NVIDIA variant
// Adds `operand` to the REAL at `source` (local memory) using only exchange
// operations: swap the stored value out (leaving zero behind), add the pending
// contribution, swap the sum back in, and repeat with whatever was displaced
// until the displaced bits equal those of (REAL)0.
inline void atomic_add_local(local REAL * const source, const REAL operand) {
    local UINTVAR * const slot = (local UINTVAR *)source;
    const UINTVAR zero_bits = AS_INT((REAL)0);
    // Contribution that still has to be folded into the target.
    UINTVAR pending = AS_INT(operand);
    do {
        // Take the current content, leaving zero in its place.
        const UINTVAR taken = ATOM_XCHG(slot, zero_bits);
        const REAL merged = AS_REAL(taken) + AS_REAL(pending);
        // Put the sum back; whatever was stored in the meantime becomes pending.
        pending = ATOM_XCHG(slot, AS_INT(merged));
    } while (pending != zero_bits);
}
// union variants
// Overlay of one REAL value and one UINTVAR value, used by the union-based
// atomic_add_local variants to pass a floating-point bit pattern to the
// integer atomics.
// NOTE(review): assumes REAL and UINTVAR have the same size (float/uint or
// double/ulong) — confirm against the definitions selected for the build.
typedef union {
UINTVAR intVal; // integer view, passed to ATOM_XCHG / ATOM_CMPXCHG
REAL floatVal; // floating-point view, used for the arithmetic
} uni;
// NVIDIA variant
// Adds `operand` to the REAL at `source` (local memory) using only exchange
// operations, with `uni` providing the integer view of each value: swap the
// stored value out (leaving zero), add the pending contribution, swap the sum
// back in, and repeat with the displaced value until it is the zero pattern.
inline void atomic_add_local(local REAL * const source, const REAL operand) {
    local UINTVAR * const slot = (local UINTVAR *)source;
    uni pending, taken, nil;
    nil.floatVal = 0;
    pending.floatVal = operand;
    for (;;) {
        // Take the current content, leaving zero in its place.
        taken.intVal = ATOM_XCHG(slot, nil.intVal);
        taken.floatVal = taken.floatVal + pending.floatVal;
        // Put the sum back; whatever was stored in the meantime becomes pending.
        pending.intVal = ATOM_XCHG(slot, taken.intVal);
        if (pending.intVal == nil.intVal) {
            break;
        }
    }
}
// shortened variant from GROMACS - https://streamhpc.com/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/
// Adds `operand` to the REAL at `source` (local memory) with a
// compare-and-exchange retry loop, using `uni` to move between the
// floating-point value and the integer bits given to ATOM_CMPXCHG.
inline void atomic_add_local(volatile local REAL * const source, const REAL operand) {
    volatile local UINTVAR * const slot = (volatile local UINTVAR *)source;
    uni assumed, seen;
    // Value most recently observed at the target location.
    seen.floatVal = *source;
    for (;;) {
        assumed.floatVal = seen.floatVal;
        seen.floatVal = assumed.floatVal + operand;
        // ATOM_CMPXCHG hands back what it found; that becomes the next guess.
        seen.intVal = ATOM_CMPXCHG(slot, assumed.intVal, seen.intVal);
        if (seen.intVal == assumed.intVal) {
            // The location still held our guess, so the sum was stored.
            return;
        }
    }
}
For global memory, make the obvious replacement of local with global in the functions above.