I was just reading the question
"Efficiently dividing unsigned value by a power of two, rounding up",
and I was wondering what the fastest way to do this in CUDA would be. Of course,
riffing off of the cool answer by @tera:
// Divide p by q, rounding up, for q a power of two.
//
// lg() is not defined in this snippet; it is assumed to return the base-2
// logarithm (bit index) of q -- confirm against the answer this is taken from.
//
// (p - 1) >> lg(q) is floor((p - 1) / q), and adding 1 turns that into the
// rounded-up quotient for p > 0. For p == 0 the subtraction wraps around (for
// unsigned T), so the bool(p) factor forces the result to 0 without a branch.
template <typename T> __device__ T pdqru(T p, T q)
{
    return bool(p) * (((p - 1) >> lg(q)) + 1);
}
11 instructions (no branches, no predication) to get the result in R0:
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_61
Function : _Z4testjj
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc800fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ IADD R0, RZ, -c[0x0][0x144]; /* 0x4c1100000517ff00 */
/*0018*/ LOP.AND R0, R0, c[0x0][0x144]; /* 0x4c47000005170000 */
/* 0x003fc400ffa00711 */
/*0028*/ FLO.U32 R0, R0; /* 0x5c30000000070000 */
/*0030*/ MOV R5, c[0x0][0x140]; /* 0x4c98078005070005 */
/*0038*/ IADD32I R2, -R0, 0x1f; /* 0x1d00000001f70002 */
/* 0x001fd800fcc007f5 */
/*0048*/ IADD32I R0, R5, -0x1; /* 0x1c0ffffffff70500 */
/*0050*/ IADD32I R2, -R2, 0x1f; /* 0x1d00000001f70202 */
/*0058*/ SHR.U32 R0, R0, R2; /* 0x5c28000000270000 */
/* 0x001fd000fe2007f1 */
/*0068*/ IADD32I R0, R0, 0x1; /* 0x1c00000000170000 */
/*0070*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/*0078*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/* 0x001ffc001e2007f2 */
/*0088*/ ICMP.NE R0, R0, RZ, R5; /* 0x5b4b02800ff70000 */
/*0090*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*0098*/ EXIT; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*00a8*/ BRA 0xa0; /* 0xe2400fffff07000f */
/*00b0*/ NOP; /* 0x50b0000000070f00 */
/*00b8*/ NOP; /* 0x50b0000000070f00 */
..........................
After studying the above SASS code, it seemed evident that these two instructions:
/*0038*/ IADD32I R2, -R0, 0x1f; /* 0x1d00000001f70002 */
/* 0x001fd800fcc007f5 */
...
/*0050*/ IADD32I R2, -R2, 0x1f; /* 0x1d00000001f70202 */
shouldn't really be necessary. I don't have a precise explanation, but my assumption is that, because the FLO.U32
SASS instruction does not have precisely the same semantics as the __ffs()
intrinsic, the compiler has an idiom that it uses for that intrinsic, which wraps the basic FLO
instruction that is doing the work. It wasn't obvious how to work around this at the C++ source code level, but I was able to use the bfind
PTX instruction to reduce the instruction count further, to 7 by my count (to get the answer into a register):
$ cat t107.cu
#include <cstdio>
#include <cstdint>
// Device-side result slot: the test kernel stores its pdqru() result here when
// built without USE_DISPLAY, and main() reads it back with cudaMemcpyFromSymbol.
__device__ unsigned r = 0;
// Thin wrapper that issues the PTX bfind.u32 instruction on val and returns
// its result. Per the PTX ISA description of bfind, that is the bit position
// of the most significant set bit of val (0xffffffff when val has no set bit).
// Used instead of a C++ intrinsic so that the instruction is emitted directly.
static __device__ __inline__ uint32_t __my_bfind(uint32_t val)
{
    uint32_t msb_index;
    asm volatile("bfind.u32 %0, %1;"
                 : "=r"(msb_index)
                 : "r"(val));
    return msb_index;
}
// Divide p by q, rounding up, for q a power of two, taking the shift count
// from __my_bfind(q).
//
// (p - 1) >> shift is floor((p - 1) / q), and adding 1 gives the rounded-up
// quotient for p > 0. For p == 0 the subtraction wraps around (for unsigned T),
// so the bool(p) factor forces the result to 0 without a branch.
//
// __my_bfind() takes and returns uint32_t, so q is converted to 32 bits before
// the bit search; only the 32-bit case is demonstrated here.
// NOTE(review): q == 0 is not a meaningful divisor; the shift count produced
// for it is outside the range of T, so do not rely on the result.
template <typename T> __device__ T pdqru(T p, T q)
{
    return bool(p) * (((p - 1) >> (__my_bfind(q))) + 1);
}
// Driver kernel for pdqru(). main() launches it as a single thread (<<<1,1>>>).
//
// Default build: stores pdqru(p, q) into the device global r, which keeps the
//   generated code small enough to inspect with cuobjdump.
// USE_DISPLAY build: prints pdqru() for a set of operand combinations
//   (including 0 and 1U<<31 on either side) and leaves r untouched.
__global__ void test(unsigned p, unsigned q){
#ifndef USE_DISPLAY
  r = pdqru(p, q);
#else
  unsigned sixteen = 16;
  unsigned zero    = 0;
  unsigned top_bit = 1U<<31;
  printf("result %u/%u = %u\n", p, q, pdqru(p, q));
  printf("result %u/%u = %u\n", p, sixteen, pdqru(p, sixteen));
  printf("result %u/%u = %u\n", p, zero, pdqru(p, zero));
  printf("result %u/%u = %u\n", zero, q, pdqru(zero, q));
  printf("result %u/%u = %u\n", top_bit, q, pdqru(top_bit, q));
  printf("result %u/%u = %u\n", q, top_bit, pdqru(q, top_bit));
  printf("result %u/%u = %u\n", top_bit, top_bit, pdqru(top_bit, top_bit));
  printf("result %u/%u = %u\n", q, q, pdqru(q, q));
#endif
}
// Launches the test kernel with p = 32767, q = 32 on a single thread, copies
// the device global r back to the host and prints it.
// Returns 0 on success, 1 if the launch or the copy reports a CUDA error.
int main(){
  unsigned h_r = 0;  // defined value even if the copy below fails

  test<<<1,1>>>(32767, 32);
  // A kernel launch returns no status itself; launch errors are picked up here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Blocking copy: also reports any error raised while the kernel was running.
  err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  printf("result = %u\n", h_r);
  return 0;
}
$ nvcc -arch=sm_61 -o t107 t107.cu -std=c++11
$ cuobjdump -sass t107
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer =
host = linux
compile_size = 64bit
code for sm_61
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_61
Function : _Z4testjj
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ { MOV32I R3, 0x0; /* 0x010000000007f003 */
/*0018*/ FLO.U32 R2, c[0x0][0x144]; } /* 0x4c30000005170002 */
/* 0x003fd800fec007f6 */
/*0028*/ MOV R5, c[0x0][0x140]; /* 0x4c98078005070005 */
/*0030*/ IADD32I R0, R5, -0x1; /* 0x1c0ffffffff70500 */
/*0038*/ SHR.U32 R0, R0, R2; /* 0x5c28000000270000 */
/* 0x001fc800fca007f1 */
/*0048*/ IADD32I R0, R0, 0x1; /* 0x1c00000000170000 */
/*0050*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/*0058*/ ICMP.NE R0, R0, RZ, R5; /* 0x5b4b02800ff70000 */
/* 0x001ffc00ffe000f1 */
/*0068*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*0070*/ EXIT; /* 0xe30000000007000f */
/*0078*/ BRA 0x78; /* 0xe2400fffff87000f */
..........................
Fatbin ptx code:
================
arch = sm_61
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$ nvcc -arch=sm_61 -o t107 t107.cu -std=c++11 -DUSE_DISPLAY
$ cuda-memcheck ./t107
========= CUDA-MEMCHECK
result 32767/32 = 1024
result 32767/16 = 2048
result 32767/0 = 1
result 0/32 = 0
result 2147483648/32 = 67108864
result 32/2147483648 = 1
result 2147483648/2147483648 = 1
result 32/32 = 1
result = 0
========= ERROR SUMMARY: 0 errors
$
I've only demonstrated the 32-bit example, above.
I think I could make the case that there are really only 6 instructions doing the "work" in the above kernel SASS, and that the remainder of the instructions are kernel "overhead" and/or the instructions needed to store the register result into global memory. It seems evident that the compiler is generating just these instructions as a result of the function:
/*0018*/ FLO.U32 R2, c[0x0][0x144]; // find index of the highest set bit in q
/* */
/*0028*/ MOV R5, c[0x0][0x140]; // load p
/*0030*/ IADD32I R0, R5, -0x1; // subtract 1 from p
/*0038*/ SHR.U32 R0, R0, R2; // shift (p - 1) right by the bit index found in q
/* */
/*0048*/ IADD32I R0, R0, 0x1; // add 1 to result
/*0050*/ ... /* */
/*0058*/ ICMP.NE R0, R0, RZ, R5; // account for p=0 case
However this would be inconsistent with the way I've counted other cases (they should all probably be reduced by 1).