I was just reading:
Efficiently dividing unsigned value by a power of two, rounding up
and I was wondering what the fastest way to do this in CUDA would be. Of course:
template <typename T>
__device__ T div_by_power_of_2_rounding_up(T p, T q)
{
return p==0 ? 0 : ((p - 1) >> lg(q)) + 1;
}
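The lg() used here isn't shown; my assumption is that it returns log2 of q, with q already known to be a power of two. A minimal device-side sketch of such a helper, built on the __ffs() intrinsic:

// Hypothetical lg() helper: log2 of a power-of-two q.
// __ffs() returns the 1-based position of the least significant set bit,
// which for a power of two is log2(q) + 1.
__device__ unsigned int lg(unsigned int q)
{
    return __ffs(q) - 1;
}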
One instruction shorter than Robert's previous answer (but see his comeback) if my count is correct, or the same instruction count as the funnel shift. It does have a branch, though; I'm not sure whether that makes a difference (other than being a benefit if the entire warp gets zero p inputs):
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_61
Function : _Z4testjj
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc000fda007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ ISETP.EQ.AND P0, PT, RZ, c[0x0][0x140], PT; /* 0x4b6503800507ff07 */
/*0018*/ { MOV R0, RZ; /* 0x5c9807800ff70000 */
/*0028*/ @P0 BRA 0x90; } /* 0x001fc800fec007fd */
/* 0xe24000000600000f */
/*0030*/ IADD R0, RZ, -c[0x0][0x144]; /* 0x4c1100000517ff00 */
/*0038*/ LOP.AND R0, R0, c[0x0][0x144]; /* 0x4c47000005170000 */
/* 0x003fc400ffa00711 */
/*0048*/ FLO.U32 R0, R0; /* 0x5c30000000070000 */
/*0050*/ MOV R3, c[0x0][0x140]; /* 0x4c98078005070003 */
/*0058*/ IADD32I R2, -R0, 0x1f; /* 0x1d00000001f70002 */
/* 0x001fd800fcc007f5 */
/*0068*/ IADD32I R0, R3, -0x1; /* 0x1c0ffffffff70300 */
/*0070*/ IADD32I R2, -R2, 0x1f; /* 0x1d00000001f70202 */
/*0078*/ SHR.U32 R0, R0, R2; /* 0x5c28000000270000 */
/* 0x001fc800fe2007f6 */
/*0088*/ IADD32I R0, R0, 0x1; /* 0x1c00000000170000 */
/*0090*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/*0098*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/* 0x001ffc00ffe000f1 */
/*00a8*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*00b0*/ EXIT; /* 0xe30000000007000f */
/*00b8*/ BRA 0xb8; /* 0xe2400fffff87000f */
..........................
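(For reference, the _Z4testjj in these dumps demangles to test(unsigned int, unsigned int). A trivial wrapper kernel along the following lines, storing the result to a __device__ global so it isn't optimized away, would produce such a symbol; the names here are my guess at the harness, not necessarily the one actually used.)

#include <cstdint>

// Hypothetical test harness around the function under test.
__device__ uint32_t result;

__global__ void test(uint32_t p, uint32_t q)
{
    result = div_by_power_of_2_rounding_up(p, q);
}

Compiling for sm_61 and disassembling with cuobjdump -sass gives listings of the kind shown here.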
I believe it should still be possible to shave an instruction or two from the funnel shift by writing it in PTX (Morning update: as Robert has proven in the meantime), but I really need to go to bed.
Update 2: Doing that (using Harold's funnel shift and writing the function in PTX)
__device__ uint32_t div_by_power_of_2_rounding_up(uint32_t p, uint32_t q)
{
    uint32_t ret;
    asm volatile("{\n\t"
                 ".reg.u32 shift, mask, lo, hi;\n\t"
                 "bfind.u32 shift, %2;\n\t"              // shift = log2(q) for power-of-two q
                 "sub.u32 mask, %2, 1;\n\t"              // mask = q - 1
                 "add.cc.u32 lo, %1, mask;\n\t"          // lo = p + q - 1, setting the carry flag
                 "addc.u32 hi, 0, 0;\n\t"                // hi = carry out of the addition
                 "shf.r.wrap.b32 %0, lo, hi, shift;\n\t" // (hi:lo) >> shift, low 32 bits
                 "}"
                 : "=r"(ret) : "r"(p), "r"(q));
    return ret;
}
just gets us to the same instruction count as Robert has already achieved with his simpler C code:
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_61
Function : _Z4testjj
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc000fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ MOV R0, c[0x0][0x144]; /* 0x4c98078005170000 */
/*0018*/ { IADD32I R2, R0, -0x1; /* 0x1c0ffffffff70002 */
/*0028*/ FLO.U32 R0, c[0x0][0x144]; } /* 0x001fc400fec00716 */
/* 0x4c30000005170000 */
/*0030*/ IADD R5.CC, R2, c[0x0][0x140]; /* 0x4c10800005070205 */
/*0038*/ IADD.X R6, RZ, RZ; /* 0x5c1008000ff7ff06 */
/* 0x003fc800fc8007f1 */
/*0048*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/*0050*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/*0058*/ SHF.R.W R0, R5, R0, R6; /* 0x5cfc030000070500 */
/* 0x001ffc00ffe000f1 */
/*0068*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*0070*/ EXIT; /* 0xe30000000007000f */
/*0078*/ BRA 0x78; /* 0xe2400fffff87000f */
..........................
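For comparison, the funnel-shift variant can also stay in plain CUDA C via the __funnelshift_r() intrinsic. The following is only a sketch of that approach (not necessarily the exact code from Robert's or Harold's answers), again assuming q is a power of two:

#include <cstdint>

__device__ uint32_t div_by_power_of_2_rounding_up_c(uint32_t p, uint32_t q)
{
    uint32_t shift = __ffs(q) - 1;           // log2(q) for a power-of-two q
    uint64_t sum   = (uint64_t)p + (q - 1);  // at most 33 bits, cannot overflow in 64 bits
    uint32_t lo    = (uint32_t)sum;
    uint32_t hi    = (uint32_t)(sum >> 32);  // carry out of the 32-bit addition
    // Shift the 64-bit value hi:lo right by 'shift' and keep the low 32 bits.
    return __funnelshift_r(lo, hi, shift);
}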