Parallelize a function that counts all vectors whose elements sum to a given value and whose elements are each no bigger than k

前端 未结 3 1527
旧巷少年郎
旧巷少年郎 2021-01-28 18:16

I want to parallelize a function in CUDA C which will count all vectors with sum equal of vector elements and elements not bigger than k. For example if the number of vector ele

3条回答
  •  旧巷少年郎
    2021-01-28 18:32

    Here's an example brute-force program to enumerate all the possible vectors, and then test the sum of each vector to see if it matches the desired sum.

    • Assume n= length of vector in "digits"
    • Assume each vector "digit" is represented by an unsigned quantity
    • Assume k=maximum "digit" value + 1
    • The size of the vector space is given by k^n
    • Divide this space into contiguous groups of vectors to be processed by each thread: (k^n)/grid_size
    • generate the starting vector for each thread (i.e. the starting vector in each group)
    • Each thread then loops through testing the vector sum and incrementing the count if necessary, and then "incrementing" the vector, until each thread has processed its assigned contiguous group of vectors

    The program:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>
    #include <thrust/fill.h>
    #include <thrust/host_vector.h>
    #include <thrust/reduce.h>
    
    #define MAX_N 12
    #define nTPB 256
    #define GRIDSIZE (32*nTPB)
    
    
    // Reads (and clears) the CUDA runtime's last error via cudaGetLastError().
    // If it is not cudaSuccess, prints `msg`, the CUDA error string and the
    // file/line of the macro use to stderr, then terminates with exit(1).
    // Intended to be placed immediately after a CUDA API call or kernel launch.
    // The local uses a trailing underscore because identifiers containing a
    // double underscore are reserved for the implementation.
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t cuda_check_err_ = cudaGetLastError(); \
            if (cuda_check_err_ != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    
    // thrust code is to quickly prototype a CPU based
    // method for verification
    // Host-side reference: treats `data` as a little-endian counter in base
    // `max` (data[0] is the least significant digit) and advances it by one.
    //
    // Returns 1 when the counter wrapped around (every digit overflowed and
    // was reset to 0), i.e. the enumeration is finished; returns 0 otherwise.
    // An empty vector is reported as finished instead of being indexed.
    //
    // NOTE(review): the element type <unsigned> was lost from the scraped
    // source; it is restored here to match the unsigned `max` parameter and
    // the unsigned device-side vectors -- confirm against the original.
    int increment(thrust::host_vector<unsigned> &data, unsigned max){
      for (size_t pos = 0; pos < data.size(); ++pos){
        data[pos]++;
        if (data[pos] < max) return 0;  // no carry out of this digit: done
        data[pos] = 0;                  // digit overflowed: reset and carry
      }
      return 1;                         // carried out of the last digit
    }
    
    __constant__ unsigned long powers[MAX_N];
    
    // Returns the sum of the `size` digits of one thread's vector.
    // Digits of a vector are stored nTPB elements apart (digit i lives at
    // vector[i*nTPB]), matching the layout used by create_vector and
    // increment_vector.
    //
    // NOTE(review): the loop condition/body and return of vec_sum, and the
    // header plus first two statements of create_vector, were lost when the
    // page was scraped (everything between '<' and '>' was dropped). They are
    // reconstructed from the call sites vec_sum(vec, n) and
    // create_vector(vec_num, vec, n) and from the surviving body -- confirm
    // against the original answer.
    __device__ unsigned vec_sum(unsigned *vector, int size){
      unsigned sum = 0;
      for (int i=0; i<size; i++) sum += vector[i*nTPB];
      return sum;
    }

    // Writes the base-k digits of `index` into the thread's vector, most
    // significant digit first, using the precomputed table powers[i] = k^i.
    // Any lower digits left once the residual reaches zero are set to 0.
    __device__ void create_vector(unsigned long index, unsigned *vector, int size){
      unsigned long residual = index;
      int pos = size;
      while ((residual > 0) && (pos > 0)){
        unsigned long temp = residual/powers[pos-1];  // digit at position pos-1
        vector[(pos-1)*nTPB] = temp;
        residual -= temp*powers[pos-1];
        pos--;
        }
      while (pos>0) {
       vector[(pos-1)*nTPB] = 0;
       pos--;
       }
    }
    // Advances one thread's vector by one, treating it as a base-k counter
    // whose digit i is stored at vector[i*nTPB] (digit 0 least significant).
    // A digit that reaches k is reset to 0 and the carry moves to the next
    // digit; a carry out of the last digit is dropped, leaving all zeros.
    __device__ void increment_vector(unsigned *vector, int size, int k){
      int digit = 0;
      do {
        unsigned &slot = vector[digit*nTPB];
        ++slot;
        if (slot < k) break;   // no overflow in this digit: nothing to carry
        slot = 0;              // overflow: wrap this digit and carry onward
        ++digit;
      } while (digit < size);
    }
    
    // Counts the vectors of n base-k digits whose digit sum equals `sum` and
    // adds the number found to *count.
    //
    // The k^n vectors (k * powers[n-1]) are split into contiguous runs of
    // vecs_per_thread vectors; each thread builds the first vector of its run
    // with create_vector and then steps through the run with increment_vector.
    //
    // Preconditions visible from the indexing below:
    //   - each thread's digits live in shared memory at vecs[threadIdx.x + i*nTPB],
    //     so blockDim.x must not exceed nTPB and n must not exceed MAX_N;
    //   - powers[] must hold powers[i] = k^i for i < n;
    //   - *count must be initialised by the caller (the kernel only adds to it).
    //
    // Matches are accumulated in a per-thread counter and published with a
    // single atomicAdd per thread, rather than one atomic per match, to avoid
    // serialising on the one global address.
    __global__ void find_vector_match(unsigned long long int *count, int k, int n, unsigned sum){
      __shared__ unsigned vecs[MAX_N *nTPB];
      unsigned *vec = &(vecs[threadIdx.x]);
      const unsigned long total = k*powers[n-1];   // size of the vector space, k^n
      const unsigned long idx = threadIdx.x+blockDim.x*blockIdx.x;
      if (idx >= total) return;                    // more threads than vectors

      // Ceiling-style share: the +1 guarantees the runs cover the whole space;
      // the vec_num < total test below trims the overshoot.
      const unsigned long vecs_per_thread = total/(gridDim.x*blockDim.x) + 1;
      unsigned long vec_num = idx*vecs_per_thread;
      create_vector(vec_num, vec, n);

      unsigned long long int matches = 0;
      for (unsigned long vec_count = 0;
           (vec_count < vecs_per_thread) && (vec_num < total);
           ++vec_count, ++vec_num){
        if (vec_sum(vec, n) == sum) ++matches;
        increment_vector(vec, n, k);
        }
      if (matches != 0) atomicAdd(count, matches);
    }
    
    // Reads n (vector length), k (largest allowed element) and the target sum
    // from stdin, counts the matching vectors by brute force on the CPU, then
    // repeats the count on the GPU with find_vector_match and prints both
    // results with coarse (whole-second) timings.
    //
    // NOTE(review): the section between computing h_powers and the D2H copy
    // (powers loop body, cudaMemcpyToSymbol, count buffer setup and the kernel
    // launch configuration) was lost when the page was scraped and is
    // reconstructed here from the surviving fragments -- confirm against the
    // original answer. In particular the launch shape GRIDSIZE/nTPB blocks of
    // nTPB threads is inferred from the GRIDSIZE and nTPB definitions.
    int main(){

    // calculate on CPU first for verification
      struct timeval t1, t2, t3;
      int n, k, sum;
      printf("Enter the length of vector (maximum: %d) n=", MAX_N);
      if (scanf("%d",&n) != 1) { fprintf(stderr, "invalid input for n\n"); return 1; }
      printf("Enter the max value of vector elements k=");
      if (scanf("%d",&k) != 1) { fprintf(stderr, "invalid input for k\n"); return 1; }
      printf("Enter the sum of vector elements sum=");
      if (scanf("%d",&sum) != 1) { fprintf(stderr, "invalid input for sum\n"); return 1; }
      // powers[] and the kernel's shared array both have MAX_N slots, and the
      // kernel takes sum as unsigned, so reject anything outside those limits.
      if ((n < 1) || (n > MAX_N) || (k < 0) || (sum < 0)) {
        fprintf(stderr, "n must be in [1, %d]; k and sum must be non-negative\n", MAX_N);
        return 1;
      }
      int count = 0;
      gettimeofday(&t1, NULL);
      k++;  // from here on k is the number of distinct digit values (max + 1)

      thrust::host_vector<unsigned> test(n);
      thrust::fill(test.begin(), test.end(), 0);
      int finished = 0;
      do{
        if (thrust::reduce(test.begin(), test.end()) == (unsigned)sum) count++;
        finished = increment(test, k);
        }
        while (!finished);
      gettimeofday(&t2, NULL);
      printf("CPU count = %d, in %ld seconds\n", count, (long)(t2.tv_sec - t1.tv_sec));

      // h_powers[i] = k^i; unused tail entries stay 0 so the full-table copy
      // to the device never reads uninitialised memory.
      unsigned long h_powers[MAX_N] = {0};
      h_powers[0] = 1;
      for (int i = 1; i < n; i++) h_powers[i] = h_powers[i-1]*k;
      cudaMemcpyToSymbol(powers, h_powers, MAX_N*sizeof(unsigned long));
      cudaCheckErrors("cudaMemcpyToSymbol fail");

      unsigned long long int *h_count, *d_count;
      h_count = (unsigned long long int *)malloc(sizeof(unsigned long long int));
      if (h_count == NULL) { fprintf(stderr, "host malloc fail\n"); return 1; }
      *h_count = 0;
      cudaMalloc((void **)&d_count, sizeof(unsigned long long int));
      cudaCheckErrors("cudaMalloc fail");
      // The kernel only adds to the counter, so it must start at zero.
      cudaMemcpy(d_count, h_count, sizeof(unsigned long long int), cudaMemcpyHostToDevice);
      cudaCheckErrors("cudaMemcpy H2D fail");

      find_vector_match<<<(GRIDSIZE/nTPB), nTPB>>>(d_count, k, n, sum);
      cudaCheckErrors("kernel launch fail");
      // Blocking copy: also waits for the kernel to finish.
      cudaMemcpy(h_count, d_count, sizeof(unsigned long long int), cudaMemcpyDeviceToHost);
      cudaCheckErrors("cudaMemcpy D2H fail");
      gettimeofday(&t3, NULL);
      printf("GPU count = %llu, in %ld seconds\n", *h_count, (long)(t3.tv_sec - t2.tv_sec));

      cudaFree(d_count);
      free(h_count);
      return 0;
    }
    

    compile with:

    $ nvcc -O3 -arch=sm_20 -o t260 t260.cu
    

    sample output:

    $ ./t260
    Enter the length of vector (maximum: 12) n=2
    Enter the max value of vector elements k=3
    Enter the sum of vector elements sum=4
    CPU count = 3, in 0 seconds
    GPU count = 3, in 0 seconds
    $ ./t260
    Enter the length of vector (maximum: 12) n=5
    Enter the max value of vector elements k=3
    Enter the sum of vector elements sum=10
    CPU count = 101, in 0 seconds
    GPU count = 101, in 0 seconds
    $ ./t260
    Enter the length of vector (maximum: 12) n=9
    Enter the max value of vector elements k=9
    Enter the sum of vector elements sum=20
    CPU count = 2714319, in 12 seconds
    GPU count = 2714319, in 1 seconds
    $ ./t260
    Enter the length of vector (maximum: 12) n=10
    Enter the max value of vector elements k=9
    Enter the sum of vector elements sum=20
    CPU count = 9091270, in 123 seconds
    GPU count = 9091270, in 4 seconds
    

    So for large problem sizes, the naive brute-force GPU code appears to be about 30x faster than the naive brute-force single-threaded CPU code. (... on my particular machine setup: CPU = Xeon X5560, GPU = Quadro5000, CentOS 5.5, CUDA 5.0)

提交回复
热议问题