Efficiently dividing unsigned value by a power of two, rounding up - in CUDA

前端 未结 6 1130
被撕碎了的回忆
被撕碎了的回忆 2021-01-21 09:28

I was just reading:

Efficiently dividing unsigned value by a power of two, rounding up

and I was wondering what was the fastest way to do this in CUDA. Of course

6条回答
  •  面向向阳花
    2021-01-21 10:10

    riffing off of the kewl answer by @tera:

     // Divides p by q, rounding up, for q a power of two, without branching.
     // lg(q) is not defined in this snippet; it is assumed to return log2(q),
     // i.e. the shift amount equivalent to dividing by q.
     // The bool(p) factor forces the result to 0 when p == 0, where p - 1
     // would otherwise wrap around for an unsigned T.
     template <typename T> __device__ T pdqru(T p, T q)
    {
        return bool(p) * (((p - 1) >> lg(q)) + 1);
    }
    

    11 instructions (no branches, no predication) to get the result in R0:

    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
            code for sm_61
                    Function : _Z4testjj
            .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                       /* 0x001fc800fec007f6 */
            /*0008*/                   MOV R1, c[0x0][0x20];           /* 0x4c98078000870001 */
            /*0010*/                   IADD R0, RZ, -c[0x0][0x144];    /* 0x4c1100000517ff00 */
            /*0018*/                   LOP.AND R0, R0, c[0x0][0x144];  /* 0x4c47000005170000 */
                                                                       /* 0x003fc400ffa00711 */
            /*0028*/                   FLO.U32 R0, R0;                 /* 0x5c30000000070000 */
            /*0030*/                   MOV R5, c[0x0][0x140];          /* 0x4c98078005070005 */
            /*0038*/                   IADD32I R2, -R0, 0x1f;          /* 0x1d00000001f70002 */
                                                                       /* 0x001fd800fcc007f5 */
            /*0048*/                   IADD32I R0, R5, -0x1;           /* 0x1c0ffffffff70500 */
            /*0050*/                   IADD32I R2, -R2, 0x1f;          /* 0x1d00000001f70202 */
            /*0058*/                   SHR.U32 R0, R0, R2;             /* 0x5c28000000270000 */
                                                                       /* 0x001fd000fe2007f1 */
            /*0068*/                   IADD32I R0, R0, 0x1;            /* 0x1c00000000170000 */
            /*0070*/                   MOV32I R2, 0x0;                 /* 0x010000000007f002 */
            /*0078*/                   MOV32I R3, 0x0;                 /* 0x010000000007f003 */
                                                                       /* 0x001ffc001e2007f2 */
            /*0088*/                   ICMP.NE R0, R0, RZ, R5;         /* 0x5b4b02800ff70000 */
            /*0090*/                   STG.E [R2], R0;                 /* 0xeedc200000070200 */
            /*0098*/                   EXIT;                           /* 0xe30000000007000f */
                                                                       /* 0x001f8000fc0007ff */
            /*00a8*/                   BRA 0xa0;                       /* 0xe2400fffff07000f */
            /*00b0*/                   NOP;                            /* 0x50b0000000070f00 */
            /*00b8*/                   NOP;                            /* 0x50b0000000070f00 */
                    ..........................
    

    After studying the above SASS code, it seemed evident that these two instructions:

            /*0038*/                   IADD32I R2, -R0, 0x1f;          /* 0x1d00000001f70002 */
                                                                       /* 0x001fd800fcc007f5 */
            ...
            /*0050*/                   IADD32I R2, -R2, 0x1f;          /* 0x1d00000001f70202 */
    

    shouldn't really be necessary. I don't have a precise explanation, but my assumption is that the FLO.U32 SASS instruction does not have exactly the same semantics as the __ffs() intrinsic, so whenever that intrinsic is used the compiler emits a fixed idiom that wraps the basic FLO instruction doing the actual work. It wasn't obvious how to work around this at the C++ source code level, but by using the bfind PTX instruction directly I was able to reduce the instruction count further, to 7 according to my count (to get the answer into a register):

    $ cat t107.cu
    #include <stdio.h>
    #include <stdint.h>
    __device__ unsigned r = 0;
    
    
    // Thin device-side wrapper that applies the PTX bfind.u32 instruction to
    // val and returns whatever the instruction produces.
    static __device__ __inline__ uint32_t __my_bfind(uint32_t val)
    {
      uint32_t bit_pos;
      asm volatile("bfind.u32 %0, %1;" : "=r"(bit_pos) : "r"(val));
      return bit_pos;
    }
    
    // Divides p by q, rounding up, for q a power of two, without branching.
    // __my_bfind(q) supplies the shift amount (the index of the bit set in q).
    // The helper takes a 32-bit argument, so only 32-bit T is covered here.
    // The bool(p) factor forces the result to 0 when p == 0, where p - 1
    // would otherwise wrap around for an unsigned T.
    // q == 0 is outside the intended domain (it is not a power of two).
    template <typename T> __device__ T pdqru(T p, T q)
    {
        return bool(p) * (((p - 1) >> (__my_bfind(q))) + 1);
    }
    
    // Kernel driver for pdqru. In the default build it stores pdqru(p, q) in
    // the device variable r. When compiled with -DUSE_DISPLAY it instead
    // prints pdqru for a series of operand pairs and leaves r untouched.
    __global__ void test(unsigned p, unsigned q)
    {
    #ifndef USE_DISPLAY
      r = pdqru(p, q);
    #else
      // Extra operands for the printed cases.
      const unsigned sixteen = 16;
      const unsigned zero    = 0;
      const unsigned top_bit = 1U<<31;

      printf("result %u/%u = %u\n", p, q, pdqru(p, q));
      printf("result %u/%u = %u\n", p, sixteen, pdqru(p, sixteen));
      printf("result %u/%u = %u\n", p, zero, pdqru(p, zero));
      printf("result %u/%u = %u\n", zero, q, pdqru(zero, q));
      printf("result %u/%u = %u\n", top_bit, q, pdqru(top_bit, q));
      printf("result %u/%u = %u\n", q, top_bit, pdqru(q, top_bit));
      printf("result %u/%u = %u\n", top_bit, top_bit, pdqru(top_bit, top_bit));
      printf("result %u/%u = %u\n", q, q, pdqru(q, q));
    #endif
    }
    
    
    // Launches the test kernel with p = 32767, q = 32 on a single thread, copies
    // the device variable r back to the host and prints it. Returns 1 and
    // reports on stderr if the launch or the copy fails, instead of printing
    // an uninitialized value.
    int main(){
      unsigned h_r = 0;
      test<<<1,1>>>(32767, 32);
      // A kernel launch returns no status; launch errors must be queried.
      cudaError_t err = cudaGetLastError();
      if (err == cudaSuccess)
        // The copy's return status is the only other error signal checked here.
        err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
      if (err != cudaSuccess){
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
      }
      printf("result = %u\n", h_r);
      return 0;
    }
    
    
    $ nvcc -arch=sm_61 -o t107 t107.cu -std=c++11
    $ cuobjdump -sass t107
    
    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = 
    host = linux
    compile_size = 64bit
    
            code for sm_61
    
    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
            code for sm_61
                    Function : _Z4testjj
            .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                            /* 0x001c4400fe0007f6 */
            /*0008*/                   MOV R1, c[0x0][0x20];                /* 0x4c98078000870001 */
            /*0010*/         {         MOV32I R3, 0x0;                      /* 0x010000000007f003 */
            /*0018*/                   FLO.U32 R2, c[0x0][0x144];        }  /* 0x4c30000005170002 */
                                                                            /* 0x003fd800fec007f6 */
            /*0028*/                   MOV R5, c[0x0][0x140];               /* 0x4c98078005070005 */
            /*0030*/                   IADD32I R0, R5, -0x1;                /* 0x1c0ffffffff70500 */
            /*0038*/                   SHR.U32 R0, R0, R2;                  /* 0x5c28000000270000 */
                                                                            /* 0x001fc800fca007f1 */
            /*0048*/                   IADD32I R0, R0, 0x1;                 /* 0x1c00000000170000 */
            /*0050*/                   MOV32I R2, 0x0;                      /* 0x010000000007f002 */
            /*0058*/                   ICMP.NE R0, R0, RZ, R5;              /* 0x5b4b02800ff70000 */
                                                                            /* 0x001ffc00ffe000f1 */
            /*0068*/                   STG.E [R2], R0;                      /* 0xeedc200000070200 */
            /*0070*/                   EXIT;                                /* 0xe30000000007000f */
            /*0078*/                   BRA 0x78;                            /* 0xe2400fffff87000f */
                    ..........................
    
    
    
    Fatbin ptx code:
    ================
    arch = sm_61
    code version = [5,0]
    producer = cuda
    host = linux
    compile_size = 64bit
    compressed
    $ nvcc -arch=sm_61 -o t107 t107.cu -std=c++11 -DUSE_DISPLAY
    $ cuda-memcheck ./t107
    ========= CUDA-MEMCHECK
    result 32767/32 = 1024
    result 32767/16 = 2048
    result 32767/0 = 1
    result 0/32 = 0
    result 2147483648/32 = 67108864
    result 32/2147483648 = 1
    result 2147483648/2147483648 = 1
    result 32/32 = 1
    result = 0
    ========= ERROR SUMMARY: 0 errors
    $
    

    I've only demonstrated the 32-bit example, above.

    I think I could make the case that there are really only 6 instructions doing the "work" in the above kernel SASS, and that the remainder of the instructions are kernel "overhead" and/or the instructions needed to store the register result into global memory. It seems evident that the compiler is generating just these instructions as a result of the function:

            /*0018*/                   FLO.U32 R2, c[0x0][0x144];  // find index of the bit set in q
                                                                            /*  */
            /*0028*/                   MOV R5, c[0x0][0x140];      // load p
            /*0030*/                   IADD32I R0, R5, -0x1;       // subtract 1 from p
            /*0038*/                   SHR.U32 R0, R0, R2;         // shift (p - 1) right by q's bit index
                                                                            /*  */
            /*0048*/                   IADD32I R0, R0, 0x1;        // add 1 to result
            /*0050*/                   ...                                  /*  */
            /*0058*/                   ICMP.NE R0, R0, RZ, R5;     // account for p=0 case
    

    However this would be inconsistent with the way I've counted other cases (they should all probably be reduced by 1).

提交回复
热议问题