Efficiently dividing unsigned value by a power of two, rounding up - in CUDA

前端 未结 6 1126
被撕碎了的回忆
被撕碎了的回忆 2021-01-21 09:28

I was just reading:

Efficiently dividing unsigned value by a power of two, rounding up

and I was wondering what was the fastest way to do this in CUDA. Of course

6条回答
  •  走了就别回头了
    2021-01-21 10:18

    One possible straightforward approach:

    $ cat t105.cu
    #include <stdio.h>
    
    // Device-global result slot: the test kernel stores into it and main()
    // reads it back on the host with cudaMemcpyFromSymbol.
    __device__ unsigned r = 0;
    
    // Divides p by q, rounding the quotient up, for an unsigned integer type T
    // and a divisor q that is a power of two.
    //
    //   p  dividend
    //   q  divisor; must be a non-zero power of two. q == 0 is not handled and
    //      yields a meaningless result.
    //
    // Returns ceil(p / q).
    template <typename T>
    __device__ T pdqru(T p, T q){

      // Adding q-1 before shifting rounds up, but the sum may wrap around.
      T p1 = p +  (q-1);
      // __ffs/__ffsll return the 1-based position of the lowest set bit, so for
      // a power-of-two divisor the shift amount log2(q) is (position - 1).
      // From here on q holds that 1-based bit position, not the divisor.
      if (sizeof(T) == 8)
        q = __ffsll(q);
      else
        q = __ffs(q);
      // p1 < p means p + (divisor-1) wrapped. That can only happen when p is not
      // a multiple of the divisor, so the answer is the truncated quotient + 1.
      // Otherwise the pre-biased sum can simply be shifted.
      return (p1<p)?((p>>(q-1))+1) :(p1 >> (q-1));
    }
    
    // Test kernel for pdqru. Built with USE_DISPLAY defined it prints a fixed
    // list of sample quotients; otherwise it stores pdqru(p, q) in the device
    // global r.
    __global__ void test(unsigned p, unsigned q){
    #ifndef USE_DISPLAY
      r = pdqru(p, q);
    #else
      const unsigned sixteen = 16;
      const unsigned zero    = 0;
      const unsigned top_bit = 1U<<31;
      // Dividend/divisor pairs, listed in the order they are printed.
      const unsigned dividends[8] = {p, p,       p,    zero, top_bit, q,       top_bit, q};
      const unsigned divisors[8]  = {q, sixteen, zero, q,    q,       top_bit, top_bit, q};
      for (int i = 0; i < 8; ++i)
        printf("result %u/%u = %u\n", dividends[i], divisors[i], pdqru(dividends[i], divisors[i]));
    #endif
    }
    
    
    // Host driver: launches the test kernel with one thread, copies the
    // device-global result r back to the host and prints it.
    // Returns 0 on success, 1 if the launch or the copy reports a CUDA error.
    int main(){
      unsigned h_r = 0;
      test<<<1,1>>>(32767, 32);
      // A kernel launch has no return value; launch errors are reported
      // through cudaGetLastError().
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
      }
      // Check the copy as well, so a failure is reported instead of printing
      // a value that was never written.
      err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
      if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
        return 1;
      }
      printf("result = %u\n", h_r);
      return 0;
    }
    
    
    $ nvcc -arch=sm_61 -o t105 t105.cu
    $ cuobjdump -sass ./t105
    
    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = 
    host = linux
    compile_size = 64bit
    
            code for sm_61
    
    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
            code for sm_61
                    Function : _Z4testjj
            .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                             /* 0x001fc800fec007f6 */
            /*0008*/                   MOV R1, c[0x0][0x20];                 /* 0x4c98078000870001 */
            /*0010*/                   IADD R0, RZ, -c[0x0][0x144];          /* 0x4c1100000517ff00 */
            /*0018*/                   LOP.AND R0, R0, c[0x0][0x144];        /* 0x4c47000005170000 */
                                                                             /* 0x005fd401fe20003d */
            /*0028*/                   FLO.U32 R2, R0;                       /* 0x5c30000000070002 */
            /*0030*/                   MOV R0, c[0x0][0x144];                /* 0x4c98078005170000 */
            /*0038*/                   IADD32I R3, -R2, 0x1f;                /* 0x1d00000001f70203 */
                                                                             /* 0x001fd000fc2007f1 */
            /*0048*/                   IADD32I R0, R0, -0x1;                 /* 0x1c0ffffffff70000 */
            /*0050*/                   MOV R2, c[0x0][0x140];                /* 0x4c98078005070002 */
            /*0058*/                   IADD32I R4, -R3, 0x1f;                /* 0x1d00000001f70304 */
                                                                             /* 0x001fd800fe2007f6 */
            /*0068*/                   IADD R5, R0, c[0x0][0x140];           /* 0x4c10000005070005 */
            /*0070*/                   ISETP.LT.U32.AND P0, PT, R5, R0, PT;  /* 0x5b62038000070507 */
            /*0078*/                   SHR.U32 R0, R2, R4;                   /* 0x5c28000000470200 */
                                                                             /* 0x001fd000fc2007f1 */
            /*0088*/                   IADD32I R0, R0, 0x1;                  /* 0x1c00000000170000 */
            /*0090*/                   MOV32I R2, 0x0;                       /* 0x010000000007f002 */
            /*0098*/                   MOV32I R3, 0x0;                       /* 0x010000000007f003 */
                                                                             /* 0x001ffc001e2007f2 */
            /*00a8*/              @!P0 SHR.U32 R0, R5, R4;                   /* 0x5c28000000480500 */
            /*00b0*/                   STG.E [R2], R0;                       /* 0xeedc200000070200 */
            /*00b8*/                   EXIT;                                 /* 0xe30000000007000f */
                                                                             /* 0x001f8000fc0007ff */
            /*00c8*/                   BRA 0xc0;                             /* 0xe2400fffff07000f */
            /*00d0*/                   NOP;                                  /* 0x50b0000000070f00 */
            /*00d8*/                   NOP;                                  /* 0x50b0000000070f00 */
                                                                             /* 0x001f8000fc0007e0 */
            /*00e8*/                   NOP;                                  /* 0x50b0000000070f00 */
            /*00f0*/                   NOP;                                  /* 0x50b0000000070f00 */
            /*00f8*/                   NOP;                                  /* 0x50b0000000070f00 */
                    ..........................
    
    
    
    Fatbin ptx code:
    ================
    arch = sm_61
    code version = [5,0]
    producer = cuda
    host = linux
    compile_size = 64bit
    compressed
    $ nvcc -arch=sm_61 -o t105 t105.cu -DUSE_DISPLAY
    $ cuda-memcheck ./t105
    ========= CUDA-MEMCHECK
    result 32767/32 = 1024
    result 32767/16 = 2048
    result 32767/0 = 2048
    result 0/32 = 0
    result 2147483648/32 = 67108864
    result 32/2147483648 = 1
    result 2147483648/2147483648 = 1
    result 32/32 = 1
    result = 0
    ========= ERROR SUMMARY: 0 errors
    $
    

    Approximately 14 SASS instructions for the 32-bit case, to get the answer into R0. It produces spurious results for the divide-by-zero case.

    The equivalent assembly for this answer case looks like this:

    $ cat t106.cu
    #include <stdio.h>
    #include <stdint.h>
    // Device-global result slot: the test kernel stores into it and main()
    // reads it back on the host with cudaMemcpyFromSymbol.
    __device__ unsigned r = 0;
    
    
    // Width-dispatched wrapper around the CUDA find-first-set intrinsics.
    // Returns the 1-based position of the least significant set bit of x
    // (the intrinsics return 0 when x is 0). Only the 32- and 64-bit unsigned
    // specialisations below are defined.
    template <typename T> __device__ int find_first_set(T x);
    template <> __device__ int find_first_set<uint32_t>(uint32_t x) { return __ffs(x);   }
    template <> __device__ int find_first_set<uint64_t>(uint64_t x) { return __ffsll(x); }
    
    // Zero-based index of the lowest set bit of x, which equals log2(x) when x
    // is a power of two. Not a general logarithm: for other non-zero x it still
    // reports the lowest set bit.
    template <typename T> __device__ T lg(T x) { return find_first_set(x) - 1; }
    
    // Divides dividend by divisor, rounding the quotient up, for a divisor that
    // is a power of two.
    //
    //   dividend  value to divide
    //   divisor   must be a non-zero power of two
    //
    // Returns ceil(dividend / divisor). Uses `auto`, so it needs C++11.
    template <typename T>
    __device__ T pdqru(T dividend, T divisor)
    {
        auto log_2_of_divisor = lg(divisor);
        // For a power-of-two divisor, divisor-1 masks out the remainder bits.
        auto mask = divisor - 1;
        // remainder + mask reaches the divisor exactly when the remainder is
        // non-zero, so this is 1 if there is a remainder and 0 otherwise. Only
        // the remainder is biased, never the full dividend, so a large dividend
        // cannot make the addition wrap.
        auto correction_for_rounding_up = ((dividend & mask) + mask) >> log_2_of_divisor;

        return (dividend >> log_2_of_divisor) + correction_for_rounding_up;
    }
    
    // Test kernel for pdqru. Built with USE_DISPLAY defined it prints a fixed
    // list of sample quotients; otherwise it stores pdqru(p, q) in the device
    // global r.
    __global__ void test(unsigned p, unsigned q){
    #ifndef USE_DISPLAY
      r = pdqru(p, q);
    #else
      const unsigned sixteen = 16;
      const unsigned zero    = 0;
      const unsigned top_bit = 1U<<31;
      // One entry per printed line, in output order.
      struct Case { unsigned dividend; unsigned divisor; };
      const Case cases[] = {
        {p,       q},
        {p,       sixteen},
        {p,       zero},
        {zero,    q},
        {top_bit, q},
        {q,       top_bit},
        {top_bit, top_bit},
        {q,       q},
      };
      for (const Case& c : cases)
        printf("result %u/%u = %u\n", c.dividend, c.divisor, pdqru(c.dividend, c.divisor));
    #endif
    }
    
    
    // Host driver: launches the test kernel with one thread, copies the
    // device-global result r back to the host and prints it.
    // Returns 0 on success, 1 if the launch or the copy reports a CUDA error.
    int main(){
      unsigned h_r = 0;
      test<<<1,1>>>(32767, 32);
      // A kernel launch has no return value; launch errors are reported
      // through cudaGetLastError().
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
      }
      // Check the copy as well, so a failure is reported instead of printing
      // a value that was never written.
      err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
      if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
        return 1;
      }
      printf("result = %u\n", h_r);
      return 0;
    }
    
    
    $ nvcc -std=c++11  -arch=sm_61 -o t106 t106.cu
    $ cuobjdump -sass t106
    
    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = 
    host = linux
    compile_size = 64bit
    
            code for sm_61
    
    Fatbin elf code:
    ================
    arch = sm_61
    code version = [1,7]
    producer = cuda
    host = linux
    compile_size = 64bit
    
            code for sm_61
                    Function : _Z4testjj
            .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                       /* 0x001fd400fe2007f6 */
            /*0008*/                   MOV R1, c[0x0][0x20];           /* 0x4c98078000870001 */
            /*0010*/                   IADD R0, RZ, -c[0x0][0x144];    /* 0x4c1100000517ff00 */
            /*0018*/                   MOV R2, c[0x0][0x144];          /* 0x4c98078005170002 */
                                                                       /* 0x003fc40007a007f2 */
            /*0028*/                   LOP.AND R0, R0, c[0x0][0x144];  /* 0x4c47000005170000 */
            /*0030*/                   FLO.U32 R3, R0;                 /* 0x5c30000000070003 */
            /*0038*/                   IADD32I R0, R2, -0x1;           /* 0x1c0ffffffff70200 */
                                                                       /* 0x001fc400fcc017f5 */
            /*0048*/                   IADD32I R3, -R3, 0x1f;          /* 0x1d00000001f70303 */
            /*0050*/                   LOP.AND R2, R0, c[0x0][0x140];  /* 0x4c47000005070002 */
            /*0058*/                   IADD R2, R0, R2;                /* 0x5c10000000270002 */
                                                                       /* 0x001fd000fe2007f1 */
            /*0068*/                   IADD32I R0, -R3, 0x1f;          /* 0x1d00000001f70300 */
            /*0070*/                   MOV R3, c[0x0][0x140];          /* 0x4c98078005070003 */
            /*0078*/                   MOV32I R6, 0x0;                 /* 0x010000000007f006 */
                                                                       /* 0x001fc400fc2407f1 */
            /*0088*/                   SHR.U32 R4, R2, R0.reuse;       /* 0x5c28000000070204 */
            /*0090*/                   SHR.U32 R5, R3, R0;             /* 0x5c28000000070305 */
            /*0098*/                   MOV R2, R6;                     /* 0x5c98078000670002 */
                                                                       /* 0x0003c400fe4007f4 */
            /*00a8*/                   MOV32I R3, 0x0;                 /* 0x010000000007f003 */
            /*00b0*/                   IADD R0, R4, R5;                /* 0x5c10000000570400 */
            /*00b8*/                   STG.E [R2], R0;                 /* 0xeedc200000070200 */
                                                                       /* 0x001f8000ffe007ff */
            /*00c8*/                   EXIT;                           /* 0xe30000000007000f */
            /*00d0*/                   BRA 0xd0;                       /* 0xe2400fffff87000f */
            /*00d8*/                   NOP;                            /* 0x50b0000000070f00 */
                                                                       /* 0x001f8000fc0007e0 */
            /*00e8*/                   NOP;                            /* 0x50b0000000070f00 */
            /*00f0*/                   NOP;                            /* 0x50b0000000070f00 */
            /*00f8*/                   NOP;                            /* 0x50b0000000070f00 */
                    ..........................
    
    
    
    Fatbin ptx code:
    ================
    arch = sm_61
    code version = [5,0]
    producer = cuda
    host = linux
    compile_size = 64bit
    compressed
    $
    

    which appears to be 1 instruction longer, by my count.

提交回复
热议问题