Efficiently dividing unsigned value by a power of two, rounding up - in CUDA

前端 未结 6 1125
被撕碎了的回忆
被撕碎了的回忆 2021-01-21 09:28

I was just reading:

Efficiently dividing unsigned value by a power of two, rounding up

and I was wondering what was the fastest way to do this in CUDA. Of course

6条回答
  •  后悔当初
    2021-01-21 10:16

    /**
     * Divides p by q, rounding the quotient up, where q is a power of two.
     *
     * The result is formed as ((p - 1) >> lg(q)) + 1. p == 0 is special-cased
     * because p - 1 would wrap around for an unsigned T and yield a wrong
     * (huge) quotient.
     *
     * @tparam T  integer type of the operands; presumably unsigned - TODO confirm
     * @param p   dividend
     * @param q   divisor; assumed to be a power of two (not checked here)
     * @return    0 when p == 0, otherwise ((p - 1) >> lg(q)) + 1
     *
     * NOTE(review): lg() is not defined in this snippet; presumably it returns
     * the base-2 logarithm of q - verify against its definition.
     */
    template <typename T> __device__ T div_by_power_of_2_rounding_up(T p, T q)
    {
        return p == 0 ? 0 : ((p - 1) >> lg(q)) + 1;
    }
    

    If my count is correct, this is one instruction shorter than Robert's previous answer (but see his comeback), or the same instruction count as the funnel shift. It does have a branch, though; I am not sure whether that makes a difference (other than being a benefit when the entire warp gets zero p inputs):

    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
        code for sm_61
            Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                                             /* 0x001fc000fda007f6 */
            /*0008*/                   MOV R1, c[0x0][0x20];                                 /* 0x4c98078000870001 */
            /*0010*/                   ISETP.EQ.AND P0, PT, RZ, c[0x0][0x140], PT;           /* 0x4b6503800507ff07 */
            /*0018*/         {         MOV R0, RZ;                                           /* 0x5c9807800ff70000 */
            /*0028*/               @P0 BRA 0x90;        }                                    /* 0x001fc800fec007fd */
                                                                                             /* 0xe24000000600000f */
            /*0030*/                   IADD R0, RZ, -c[0x0][0x144];                          /* 0x4c1100000517ff00 */
            /*0038*/                   LOP.AND R0, R0, c[0x0][0x144];                        /* 0x4c47000005170000 */
                                                                                             /* 0x003fc400ffa00711 */
            /*0048*/                   FLO.U32 R0, R0;                                       /* 0x5c30000000070000 */
            /*0050*/                   MOV R3, c[0x0][0x140];                                /* 0x4c98078005070003 */
            /*0058*/                   IADD32I R2, -R0, 0x1f;                                /* 0x1d00000001f70002 */
                                                                                             /* 0x001fd800fcc007f5 */
            /*0068*/                   IADD32I R0, R3, -0x1;                                 /* 0x1c0ffffffff70300 */
            /*0070*/                   IADD32I R2, -R2, 0x1f;                                /* 0x1d00000001f70202 */
            /*0078*/                   SHR.U32 R0, R0, R2;                                   /* 0x5c28000000270000 */
                                                                                             /* 0x001fc800fe2007f6 */
            /*0088*/                   IADD32I R0, R0, 0x1;                                  /* 0x1c00000000170000 */
            /*0090*/                   MOV32I R2, 0x0;                                       /* 0x010000000007f002 */
            /*0098*/                   MOV32I R3, 0x0;                                       /* 0x010000000007f003 */
                                                                                             /* 0x001ffc00ffe000f1 */
            /*00a8*/                   STG.E [R2], R0;                                       /* 0xeedc200000070200 */
            /*00b0*/                   EXIT;                                                 /* 0xe30000000007000f */
            /*00b8*/                   BRA 0xb8;                                             /* 0xe2400fffff87000f */
            ..........................
    

    I believe it should still be possible to shave an instruction or two from the funnel shift by writing it in PTX (Morning update: as Robert has proven in the meantime), but I really need to go to bed.

    Update2: Doing that (using Harold's funnel shift and writing the function in PTX)

    /**
     * Divides p by q, rounding the quotient up, where q is a power of two,
     * implemented with inline PTX.
     *
     * The sum p + (q - 1) is formed as a 64-bit value split across two 32-bit
     * registers (lo plus the carry in hi), then shifted right as a pair, so
     * the carry out of the 32-bit addition is not lost.
     *
     * @param p  dividend
     * @param q  divisor; assumed to be a non-zero power of two (not checked here)
     * @return   low 32 bits of the hi:lo pair after the right shift
     *
     * NOTE(review): shf is documented in the PTX ISA as requiring sm_32 or
     * higher - confirm against the targeted architectures.
     * NOTE(review): the line separators inside the asm string mix "\r\t" and
     * "\n\t"; they are reproduced verbatim here.
     */
    __device__ uint32_t div_by_power_of_2_rounding_up(uint32_t p, uint32_t q)
    {
      uint32_t ret;
      asm volatile("{\r\t"
                   ".reg.u32        shift, mask, lo, hi;\n\t"
                   // shift = bit position of the most significant set bit of q
                   "bfind.u32       shift, %2;\r\t"
                   // mask = q - 1
                   "sub.u32         mask, %2, 1;\r\t"
                   // lo = p + mask, recording the carry-out in the condition code
                   "add.cc.u32      lo, %1, mask;\r\t"
                   // hi = 0 + 0 + carry-in, i.e. the carry from the add above
                   "addc.u32        hi, 0, 0;\r\t"
                   // funnel-shift the hi:lo pair right by shift; result is the low word
                   "shf.r.wrap.b32  %0, lo, hi, shift;\n\t"
                   "}"
                    : "=r"(ret) : "r"(p), "r"(q));
      return ret;
    }
    

    just gets us to the same instruction count as Robert has already achieved with his simpler C code:

    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
        code for sm_61
            Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                                /* 0x001fc000fec007f6 */
            /*0008*/                   MOV R1, c[0x0][0x20];                    /* 0x4c98078000870001 */
            /*0010*/                   MOV R0, c[0x0][0x144];                   /* 0x4c98078005170000 */
            /*0018*/         {         IADD32I R2, R0, -0x1;                    /* 0x1c0ffffffff70002 */
            /*0028*/                   FLO.U32 R0, c[0x0][0x144];        }      /* 0x001fc400fec00716 */
                                                                                /* 0x4c30000005170000 */
            /*0030*/                   IADD R5.CC, R2, c[0x0][0x140];           /* 0x4c10800005070205 */
            /*0038*/                   IADD.X R6, RZ, RZ;                       /* 0x5c1008000ff7ff06 */
                                                                                /* 0x003fc800fc8007f1 */
            /*0048*/                   MOV32I R2, 0x0;                          /* 0x010000000007f002 */
            /*0050*/                   MOV32I R3, 0x0;                          /* 0x010000000007f003 */
            /*0058*/                   SHF.R.W R0, R5, R0, R6;                  /* 0x5cfc030000070500 */
                                                                                /* 0x001ffc00ffe000f1 */
            /*0068*/                   STG.E [R2], R0;                          /* 0xeedc200000070200 */
            /*0070*/                   EXIT;                                    /* 0xe30000000007000f */
            /*0078*/                   BRA 0x78;                                /* 0xe2400fffff87000f */
            ..........................
    

提交回复
热议问题