I was just reading:
Efficiently dividing unsigned value by a power of two, rounding up
and I was wondering what the fastest way to do this in CUDA would be. Of course:
template <typename T>
__device__ T div_by_power_of_2_rounding_up(T p, T q)
{
return p==0 ? 0 : ((p - 1) >> lg(q)) + 1;
}
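The lg() used here isn't shown; my assumption is that it returns log2 of q, with q already known to be a power of two. A minimal device-side sketch of such a helper, built on the __ffs() intrinsic:

// Hypothetical lg() helper: log2 of a power-of-two q.
// __ffs() returns the 1-based position of the least significant set bit,
// which for a power of two is log2(q) + 1.
__device__ unsigned int lg(unsigned int q)
{
    return __ffs(q) - 1;
}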
One instruction shorter than Robert's previous answer (but see his comeback) if my count is correct, or the same instruction count as the funnel shift. It does have a branch, though; I'm not sure whether that makes a difference (other than being a benefit if the entire warp gets zero p inputs):
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_61
Function : _Z4testjj
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc000fda007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ ISETP.EQ.AND P0, PT, RZ, c[0x0][0x140], PT; /* 0x4b6503800507ff07 */
/*0018*/ { MOV R0, RZ; /* 0x5c9807800ff70000 */
/*0028*/ @P0 BRA 0x90; } /* 0x001fc800fec007fd */
/* 0xe24000000600000f */
/*0030*/ IADD R0, RZ, -c[0x0][0x144]; /* 0x4c1100000517ff00 */
/*0038*/ LOP.AND R0, R0, c[0x0][0x144]; /* 0x4c47000005170000 */
/* 0x003fc400ffa00711 */
/*0048*/ FLO.U32 R0, R0; /* 0x5c30000000070000 */
/*0050*/ MOV R3, c[0x0][0x140]; /* 0x4c98078005070003 */
/*0058*/ IADD32I R2, -R0, 0x1f; /* 0x1d00000001f70002 */
/* 0x001fd800fcc007f5 */
/*0068*/ IADD32I R0, R3, -0x1; /* 0x1c0ffffffff70300 */
/*0070*/ IADD32I R2, -R2, 0x1f; /* 0x1d00000001f70202 */
/*0078*/ SHR.U32 R0, R0, R2; /* 0x5c28000000270000 */
/* 0x001fc800fe2007f6 */
/*0088*/ IADD32I R0, R0, 0x1; /* 0x1c00000000170000 */
/*0090*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/*0098*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/* 0x001ffc00ffe000f1 */
/*00a8*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*00b0*/ EXIT; /* 0xe30000000007000f */
/*00b8*/ BRA 0xb8; /* 0xe2400fffff87000f */
..........................
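(For reference, the _Z4testjj in these dumps demangles to test(unsigned int, unsigned int). A trivial wrapper kernel along the following lines, storing the result to a __device__ global so it isn't optimized away, would produce such a symbol; the names here are my guess at the harness, not necessarily the one actually used.)

#include <cstdint>

// Hypothetical test harness around the function under test.
__device__ uint32_t result;

__global__ void test(uint32_t p, uint32_t q)
{
    result = div_by_power_of_2_rounding_up(p, q);
}

Compiling for sm_61 and disassembling with cuobjdump -sass gives listings of the kind shown here.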
I believe it should still be possible to shave an instruction or two from the funnel shift by writing it in PTX (Morning update: as Robert has proven in the meantime), but I really need to go to bed.
Update 2: Doing that (using Harold's funnel shift and writing the function in PTX)
__device__ uint32_t div_by_power_of_2_rounding_up(uint32_t p, uint32_t q)
{
    uint32_t ret;
    asm volatile("{\n\t"
                 ".reg.u32 shift, mask, lo, hi;\n\t"
                 "bfind.u32 shift, %2;\n\t"              // shift = log2(q) for power-of-two q
                 "sub.u32 mask, %2, 1;\n\t"              // mask = q - 1
                 "add.cc.u32 lo, %1, mask;\n\t"          // lo = p + q - 1, setting the carry flag
                 "addc.u32 hi, 0, 0;\n\t"                // hi = carry out of the addition
                 "shf.r.wrap.b32 %0, lo, hi, shift;\n\t" // (hi:lo) >> shift, low 32 bits
                 "}"
                 : "=r"(ret) : "r"(p), "r"(q));
    return ret;
}
just gets us to the same instruction count as Robert has already achieved with his simpler C code:
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_61
Function : _Z4testjj
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc000fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ MOV R0, c[0x0][0x144]; /* 0x4c98078005170000 */
/*0018*/ { IADD32I R2, R0, -0x1; /* 0x1c0ffffffff70002 */
/*0028*/ FLO.U32 R0, c[0x0][0x144]; } /* 0x001fc400fec00716 */
/* 0x4c30000005170000 */
/*0030*/ IADD R5.CC, R2, c[0x0][0x140]; /* 0x4c10800005070205 */
/*0038*/ IADD.X R6, RZ, RZ; /* 0x5c1008000ff7ff06 */
/* 0x003fc800fc8007f1 */
/*0048*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/*0050*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/*0058*/ SHF.R.W R0, R5, R0, R6; /* 0x5cfc030000070500 */
/* 0x001ffc00ffe000f1 */
/*0068*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*0070*/ EXIT; /* 0xe30000000007000f */
/*0078*/ BRA 0x78; /* 0xe2400fffff87000f */
..........................
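For comparison, the funnel-shift variant can also stay in plain CUDA C via the __funnelshift_r() intrinsic. The following is only a sketch of that approach (not necessarily the exact code from Robert's or Harold's answers), again assuming q is a power of two:

#include <cstdint>

__device__ uint32_t div_by_power_of_2_rounding_up_c(uint32_t p, uint32_t q)
{
    uint32_t shift = __ffs(q) - 1;           // log2(q) for a power-of-two q
    uint64_t sum   = (uint64_t)p + (q - 1);  // at most 33 bits, cannot overflow in 64 bits
    uint32_t lo    = (uint32_t)sum;
    uint32_t hi    = (uint32_t)(sum >> 32);  // carry out of the 32-bit addition
    // Shift the 64-bit value hi:lo right by 'shift' and keep the low 32 bits.
    return __funnelshift_r(lo, hi, shift);
}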