Parallelize a function that counts all vectors whose elements sum to a given value and are each no larger than k

前端未结

关注

 3  1526

I want to parallelize a function in CUDA C that counts all vectors whose elements sum to a given value, with no element larger than k. For example if the number of vector ele

相关标签:

3条回答

北海茫月

2021-01-28 18:29

The problem is __syncthreads(). For __syncthreads() to work properly, every thread in the block must reach it; otherwise the threads that did reach it wait forever and the kernel never finishes. In your program, some of the __syncthreads() calls are executed conditionally. That is why your program doesn't work with more than one thread per block.

0 讨论(0)
发布评论:

提交评论
- 加载中...
离开以前

2021-01-28 18:30
As Robert said in comments, if you want to generate all (k+1)^n permutations on GPU and test them, you can think of some GPU kernel like this:
```
__device__ int count;  // global match counter; must be initialized to zero before each kernel call

// Brute-force counter: every thread decodes its global thread index as one
// n-digit number in base (k+1), i.e. one candidate vector with elements in
// [0, k], and atomically bumps `count` when the digits add up to `sum`.
//
// Launch layout: 1D grid of 1D blocks with at least (k+1)^n threads in total.
// Surplus threads (index >= (k+1)^n) are detected and ignored, so the grid
// may be rounded up to a multiple of the block size.
// Preconditions: n >= 1 and k >= 0.
__global__ void perm_generator(int k, int n, int sum) {
   // Widen before multiplying: blockIdx.x*blockDim.x is otherwise evaluated
   // in 32-bit arithmetic and wraps for large grids.
   unsigned long long id = (unsigned long long)blockIdx.x*blockDim.x+threadIdx.x;
   const unsigned long long base = (unsigned long long)k+1;
   unsigned long long mysum = 0;
   for ( int i = n; i > 1; i-- ) { // the n-1 least significant vector elements
     mysum += id % base;
     id /= base;
   }
   // What is left in id is the most significant element. It can only exceed k
   // when the thread index is >= (k+1)^n, i.e. this is a surplus thread.
   if ( id > (unsigned long long)k ) return;
   mysum += id; // last element
   // Element sums are never negative, so a negative target can never match.
   if ( sum >= 0 && mysum == (unsigned long long)sum ) atomicAdd( &count, 1 );
}
```
The kernel should be called with exactly (k+1)^n threads. If you happen to launch more threads than that (for example, because of the rule of thumb that the block dimension should be a multiple of 32), you need to check the value of tid inside your kernel before doing any work. Also, cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize() instead.
0 讨论(0)
发布评论:

提交评论
- 加载中...

旧巷少年郎

2021-01-28 18:32

Here's an example brute-force program to enumerate all the possible vectors, and then test the sum of each vector to see if it matches the desired sum.

Assume n= length of vector in "digits"
Assume each vector "digit" is represented by an unsigned quantity
Assume k=maximum "digit" value + 1
The size of the vector space is given by k^n
Divide this space into contiguous groups of vectors to be processed by each thread: (k^n)/grid_size
generate the starting vector for each thread (i.e. the starting vector in each group)
Each thread then loops through testing the vector sum and incrementing the count if necessary, and then "incrementing" the vector, until each thread has processed its assigned contiguous group of vectors

The program:

#include <stdio.h>
#include <thrust/host_vector.h>
#include <sys/time.h>
#include <time.h>

// Largest supported vector length n; sizes the `powers` table and the
// per-block shared-memory scratch (MAX_N * nTPB unsigned values).
#define MAX_N 12
// Threads per block used for the launch; also the stride between consecutive
// digits of one thread's vector in the shared-memory scratch.
#define nTPB 256
// Total number of threads launched: main() starts
// (GRIDSIZE + nTPB - 1)/nTPB blocks of nTPB threads each.
#define GRIDSIZE (32*nTPB)


// Reads (and clears) the CUDA runtime's last-error state via
// cudaGetLastError(). If an error is pending, prints `msg`, the CUDA error
// string and the source location to stderr and terminates with exit(1).
// Place it directly after the API call or kernel launch to be checked.
// The macro only inspects the last-error state; it does not synchronize
// with the device itself.
// The local uses a non-reserved name: identifiers containing a double
// underscore are reserved for the implementation in C++.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


// thrust code is to quickly prototype a CPU based
// method for verification
// Advances `data` to the next vector, treating it as a little-endian counter
// whose digits (data[0] is the least significant) each run over [0, max).
//
// Returns 1 when the counter wrapped around to all zeros, i.e. every vector
// has been visited, and 0 otherwise. An empty vector has nothing to advance
// and is reported as finished immediately instead of touching data[0].
int increment(thrust::host_vector<unsigned> &data, unsigned max){
  // Ripple-carry: bump a digit; if it stays below max we are done, otherwise
  // reset it to zero and carry into the next digit.
  for (size_t pos = 0; pos < data.size(); pos++){
    if (++data[pos] < max) return 0;
    data[pos] = 0;
  }
  // The carry ran off the most significant digit (or there were no digits).
  return 1;
}

// Table of digit weights in constant memory, uploaded by main() with
// cudaMemcpyToSymbol: powers[i] is the i-th power of the k value passed to the
// kernel. create_vector() uses it to turn a linear vector index into digits.
// NOTE(review): unsigned long is assumed to be 64 bits wide here; on
// platforms where it is 32 bits the larger powers would wrap - confirm.
__constant__ unsigned long powers[MAX_N];

// Adds up the `size` digits of one vector and returns the total.
// Consecutive digits of a vector are nTPB elements apart, so digit d is read
// from vector[d*nTPB].
__device__ unsigned vec_sum(unsigned *vector, int size){
  unsigned total = 0;
  int digit = 0;
  while (digit < size){
    total += vector[digit*nTPB];
    ++digit;
  }
  return total;
}

// Writes the digit representation of the linear vector number `index` into
// `vector`: digit d (weight powers[d]) is stored at vector[d*nTPB]. Digits
// are produced from the most significant one downwards; once nothing is left
// of the index, the remaining lower digits are simply set to zero.
__device__ void create_vector(unsigned long index, unsigned *vector, int size){
  unsigned long remaining = index;
  for (unsigned digit = size; digit > 0; digit--){
    unsigned long value = 0;
    if (remaining > 0){
      const unsigned long weight = powers[digit-1];
      value = remaining/weight;
      remaining -= value*weight;
    }
    vector[(digit-1)*nTPB] = value;
  }
}
// Steps one vector to its successor: adds one to digit 0 and ripples the
// carry upwards, every digit wrapping back to 0 once it reaches k. Digit d of
// the vector lives at vector[d*nTPB]. A carry out of the top digit (position
// `size`) is dropped, leaving the vector at all zeros.
__device__ void increment_vector(unsigned *vector, int size, int k){
  int digit = 0;
  for (;;){
    unsigned &slot = vector[digit*nTPB];
    slot++;
    if (slot < k) return;        // no carry: finished
    slot = 0;                    // wrapped: carry into the next digit
    if (++digit >= size) return; // carry left the vector
  }
}

// Counts the n-digit vectors with digits in [0, k) whose digits add up to
// `sum` and adds the number found to *count.
//
// Launch layout: 1D grid of 1D blocks with blockDim.x <= nTPB. The vector
// space (k^n vectors, computed as k*powers[n-1]) is split into contiguous
// chunks, one per thread; each thread builds its first vector and then steps
// through its chunk.
// Shared memory: MAX_N*nTPB unsigned values (static). Each thread owns the
// elements threadIdx.x, threadIdx.x+nTPB, ... as its private digit storage.
// Preconditions: powers[0..n-1] has been uploaded for this k; *count has been
// initialized by the host. Launches with n outside [1, MAX_N] or with more
// than nTPB threads per block do nothing, because they would index past the
// `powers` table or the shared scratch.
__global__ void find_vector_match(unsigned long long int *count, int k, int n, unsigned sum){
  __shared__ unsigned vecs[MAX_N *nTPB];
  if ((n < 1) || (n > MAX_N) || (blockDim.x > nTPB)) return;

  unsigned *vec = &(vecs[threadIdx.x]);
  // Size of the vector space; hoisted because it is loop-invariant.
  const unsigned long total = k*powers[n-1];
  // Widen before multiplying so the index cannot wrap in 32-bit arithmetic.
  const unsigned long idx = (unsigned long)blockDim.x*blockIdx.x+threadIdx.x;
  if (idx >= total) return;

  // Rounded up by one so that the chunks always cover the whole space.
  const unsigned long vecs_per_thread = total/((unsigned long)gridDim.x*blockDim.x) + 1;
  unsigned long vec_num = idx*vecs_per_thread;
  create_vector(vec_num, vec, n);

  // Tally matches privately and publish them with a single atomic per thread
  // rather than one contended global atomic per matching vector.
  unsigned long long matches = 0;
  for (unsigned long done = 0; (done < vecs_per_thread) && (vec_num < total); done++, vec_num++){
    if (vec_sum(vec, n) == sum) matches++;
    increment_vector(vec, n, k);
  }
  if (matches != 0) atomicAdd(count, matches);
}

// Reads n (vector length), k (largest element value) and the target sum,
// counts the matching vectors by brute force on the CPU for verification,
// then repeats the count on the GPU and prints both results with wall-clock
// timings in whole seconds. Returns 1 on invalid input, 0 on success.
int main(){

  struct timeval t1, t2, t3;
  int n, k, sum;
  printf("Enter the length of vector (maximum: %d) n=", MAX_N);
  if (scanf("%d",&n) != 1) { fprintf(stderr, "Invalid input for n\n"); return 1; }
  printf("Enter the max value of vector elements k=");
  if (scanf("%d",&k) != 1) { fprintf(stderr, "Invalid input for k\n"); return 1; }
  printf("Enter the sum of vector elements sum=");
  if (scanf("%d",&sum) != 1) { fprintf(stderr, "Invalid input for sum\n"); return 1; }
  // n == MAX_N is a valid request; anything larger would overrun the power
  // table and the kernel's shared-memory scratch.
  if ((n < 1) || (n > MAX_N)) { fprintf(stderr, "n must be between 1 and %d\n", MAX_N); return 1; }
  if (k < 0) { fprintf(stderr, "k must not be negative\n"); return 1; }
  k++;  // from here on k is the exclusive upper bound of an element

  // h_powers[i] = k^i for i < n. Bail out when k^n no longer fits, because
  // the vector numbering used on the GPU would silently wrap around.
  unsigned long h_powers[MAX_N] = {0};
  unsigned long space = 1;
  for (int i = 0; i < n; i++){
    h_powers[i] = space;
    if (space > (~0UL)/(unsigned long)k) { fprintf(stderr, "k^n is too large\n"); return 1; }
    space *= (unsigned long)k;
  }

  // calculate on CPU first for verification
  unsigned long long count = 0;
  gettimeofday(&t1, NULL);
  thrust::host_vector<unsigned> test(n);
  thrust::fill(test.begin(), test.end(), 0);
  int finished = 0;
  do{
    if (thrust::reduce(test.begin(), test.end()) == (unsigned)sum) count++;
    finished = increment(test, k);
    }
    while (!finished);
  gettimeofday(&t2, NULL);
  printf("CPU count = %llu, in %ld seconds\n", count, (long)(t2.tv_sec - t1.tv_sec));

  // GPU run
  cudaMemcpyToSymbol(powers, h_powers, MAX_N*sizeof(unsigned long));
  cudaCheckErrors("cudaMemcpyToSymbolfail");
  unsigned long long int h_count = 0;
  unsigned long long int *d_count = NULL;
  cudaMalloc((void **)&d_count, sizeof(unsigned long long int));
  cudaCheckErrors("cudaMalloc fail");
  cudaMemcpy(d_count, &h_count, sizeof(unsigned long long int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy H2D fail");
  find_vector_match<<<(GRIDSIZE + nTPB -1)/nTPB, nTPB>>>(d_count, k, n, sum);
  cudaCheckErrors("kernel launch fail");
  // The blocking copy also waits for the kernel, so t3 covers the whole run.
  cudaMemcpy(&h_count, d_count, sizeof(unsigned long long int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy D2H fail");
  gettimeofday(&t3, NULL);
  printf("GPU count = %llu, in %ld seconds\n", h_count, (long)(t3.tv_sec - t2.tv_sec));

  cudaFree(d_count);
  cudaCheckErrors("cudaFree fail");
  return 0;
}

compile with:

$ nvcc -O3 -arch=sm_20 -o t260 t260.cu

sample output:

$ ./t260
Enter the length of vector (maximum: 12) n=2
Enter the max value of vector elements k=3
Enter the sum of vector elements sum=4
CPU count = 3, in 0 seconds
GPU count = 3, in 0 seconds
$ ./t260
Enter the length of vector (maximum: 12) n=5
Enter the max value of vector elements k=3
Enter the sum of vector elements sum=10
CPU count = 101, in 0 seconds
GPU count = 101, in 0 seconds
$ ./t260
Enter the length of vector (maximum: 12) n=9
Enter the max value of vector elements k=9
Enter the sum of vector elements sum=20
CPU count = 2714319, in 12 seconds
GPU count = 2714319, in 1 seconds
$ ./t260
Enter the length of vector (maximum: 12) n=10
Enter the max value of vector elements k=9
Enter the sum of vector elements sum=20
CPU count = 9091270, in 123 seconds
GPU count = 9091270, in 4 seconds

So for large problem sizes, the naive brute-force GPU code appears to be about 30x faster than the naive brute-force single-threaded CPU code. (... on my particular machine setup: CPU = Xeon X5560, GPU = Quadro5000, CentOS 5.5, CUDA 5.0)

0 讨论(0)