Question
I have several lists of numbers in a file. For example:
.333, .324, .123, .543, .00054
.2243, .333, .53343, .4434
Now I want to get the number of times each number occurs, using the GPU. I believe this will be faster on the GPU than on the CPU because each thread can process one list. What data structure should I use on the GPU to easily get these counts? For example, for the input above, the answer would look as follows:
.333 = 2 times in entire file
.324 = 1 time
etc..
I am looking for a general solution, not one that works only on devices with a specific compute capability.
Here is the kernel suggested by Pavan, written out so you can check whether I have implemented it efficiently:
int uniqueEle = new_end.first - d_Values.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = (uniqueEle + TPB - 1) / TPB;
// Cast d_Locations to a raw pointer called d_rawI
launch<<<blocks, TPB>>>(d_rawI, count, uniqueEle);
__global__ void launch(int *i, int *count, int n){
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ int indexes[256];
    if(id < n){
        indexes[threadIdx.x] = i[id];
        // as occurs between two blocks
        if(id % 255 == 0){
            count[id] = i[id+1] - i[id];
        }
    }
    __syncthreads();
    if(id < n - 1){
        if(threadIdx.x < 255)
            count[id] = indexes[threadIdx.x+1] - indexes[threadIdx.x];
    }
}
Question: how do I modify this kernel so that it handles arrays of arbitrary size, i.e. the case where the total number of threads is less than the number of elements?
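For reference, a minimal sketch of the standard grid-stride pattern for the case where the element count exceeds the total number of launched threads; this is illustrative only and is not the kernel above (the second answer below applies the same idea to the differencing kernel):
// Illustrative grid-stride pattern: each thread processes elements
// id, id + stride, id + 2*stride, ... so any n is covered regardless of
// how many blocks were launched.
__global__ void process(const int *in, int *out, int n)
{
    int stride = blockDim.x * gridDim.x;
    for (int id = blockDim.x * blockIdx.x + threadIdx.x; id < n; id += stride) {
        out[id] = in[id];   // placeholder per-element work
    }
}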
Answer 1:
Here is how I would do it in MATLAB:
A = [.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434];
[values, locations] = unique(A); % Find unique values and their locations
counts = diff([0, locations]); % Find the count based on their locations
There is no easy way to do this in plain CUDA, but you can use existing libraries.
1) Thrust
It ships with the CUDA toolkit as of CUDA 4.0.
The MATLAB code can be roughly translated into Thrust using the following functions. I am not too proficient with Thrust; I am just trying to give you an idea of which routines to look at.
float _A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
int num = 9;
// Copy the values to the device (device_vector copies host data on construction)
thrust::device_vector<float> d_A(_A, _A + num);
// Need to sort to get equal values next to each other
thrust::stable_sort(d_A.begin(), d_A.end());
// Vector containing 0 to num-1
thrust::device_vector<int> d_I(_I, _I + num);
// Output vectors for the unique values, their first locations, and the counts
thrust::device_vector<float> d_Values(num);
thrust::device_vector<int> d_Locations(num), d_counts(num);
// Find the unique elements and the index of the first occurrence of each
thrust::pair<thrust::device_vector<float>::iterator,
             thrust::device_vector<int>::iterator> new_end;
new_end = thrust::unique_by_key_copy(d_A.begin(), d_A.end(), d_I.begin(),
                                     d_Values.begin(), d_Locations.begin());
You now have the locations of the first instance of each unique value. You can now launch a kernel that finds the differences between adjacent elements of d_Locations, from 0 to new_end. Subtract the final location from num to get the count for the last unique value.
EDIT (Adding code that was provided over chat)
Here is how the difference code needs to be done:
#define MAX_BLOCKS 65535
#define roundup(A, B) (((A) + (B) - 1) / (B))
int uniqueEle = new_end.first - d_Values.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));
int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);
kernel<<<blocks, TPB>>>(d_rawI, count, uniqueEle, num); // d_rawI = raw pointer to d_Locations
__global__ void kernel(int *i, int *count, int n, int num)
{
    int tx = threadIdx.x;
    int bid = blockIdx.y * gridDim.x + blockIdx.x;
    int id = blockDim.x * bid + tx;
    __shared__ int indexes[256];
    if (id < n) indexes[tx] = i[id];
    __syncthreads();
    if (id < n - 1) {
        // Difference of adjacent first-occurrence locations gives the count
        if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
        else          count[id] = i[id + 1] - indexes[tx]; // block boundary: neighbour is in the next block
    }
    // Last unique value: its count runs to the end of the data
    if (id == n - 1) count[id] = num - indexes[tx];
    return;
}
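The same differencing step can also be expressed with Thrust primitives instead of a hand-written kernel. A rough sketch, not part of the original answer, reusing d_Locations, d_counts, uniqueEle, and num from the snippets above (requires <thrust/transform.h> and <thrust/functional.h>):
// Sketch only: counts[k] = location of the next unique value - location of value k.
// d_Locations holds the first-occurrence indices in the sorted data, as above.
thrust::transform(d_Locations.begin() + 1, d_Locations.begin() + uniqueEle,
                  d_Locations.begin(), d_counts.begin(),
                  thrust::minus<int>());
// The last unique value runs to the end of the data.
d_counts[uniqueEle - 1] = num - d_Locations[uniqueEle - 1];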
2) ArrayFire
This is an easy-to-use, free, array-based library.
You can do the following in ArrayFire.
using namespace af;
float h_A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
Disclosure: I work for AccelerEyes, which develops this software.
Answer 2:
To answer the latest addendum to this question: the diff kernel which would complete the Thrust method proposed by Pavan could look something like this:
template<int blcksz>
__global__ void diffkernel(const int *i, int *count, const int n) {
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    int strd = blockDim.x * gridDim.x;
    int nmax = blcksz * ((n / blcksz) + ((n % blcksz > 0) ? 1 : 0));
    __shared__ int indices[blcksz + 1];
    for (; id < nmax; id += strd) {
        // Data load
        indices[threadIdx.x] = (id < n) ? i[id] : n;
        if (threadIdx.x == (blcksz - 1))
            indices[blcksz] = ((id + 1) < n) ? i[id + 1] : n;
        __syncthreads();
        // Differencing calculation
        int diff = indices[threadIdx.x + 1] - indices[threadIdx.x];
        // Store
        if (id < n) count[id] = diff;
        __syncthreads();
    }
}
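A possible way to launch this kernel, sketched under the assumption that d_locations and d_count are device pointers holding the first-occurrence indices and the output counts (the names are illustrative, not from the original answer):
// Illustrative launch only; blcksz is the template parameter and must match blockDim.x.
const int blcksz = 256;
int nblocks = (n + blcksz - 1) / blcksz;
if (nblocks > 65535) nblocks = 65535;   // the grid-stride loop inside the kernel covers the rest
diffkernel<blcksz><<<nblocks, blcksz>>>(d_locations, d_count, n);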
Answer 3:
Here is a solution:
#include <cstdio>
#include <cstdlib>

__global__ void counter(float *a, int *b, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
    {
        // Brute force: each thread scans the whole array and counts the
        // elements exactly equal to its own value.
        float my = a[idx];
        int count = 0;
        for (int i = 0; i < N; i++)
        {
            if (my == a[i])
                count++;
        }
        b[idx] = count;
    }
}
int main()
{
    int threads = 9;
    int blocks = 1;
    int N = blocks * threads;
    float *h_a;
    int *h_b;
    float *d_a;
    int *d_b;
    h_a = (float*)malloc(N * sizeof(float));
    h_b = (int*)malloc(N * sizeof(int));
    cudaMalloc((void**)&d_a, N * sizeof(float));
    cudaMalloc((void**)&d_b, N * sizeof(int));
    h_a[0] = .333f;
    h_a[1] = .324f;
    h_a[2] = .123f;
    h_a[3] = .543f;
    h_a[4] = .00054f;
    h_a[5] = .2243f;
    h_a[6] = .333f;
    h_a[7] = .53343f;
    h_a[8] = .4434f;
    cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
    counter<<<blocks, threads>>>(d_a, d_b, N);
    cudaMemcpy(h_b, d_b, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++)
    {
        printf("%f = %d times\n", h_a[i], h_b[i]);
    }
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}
Source: https://stackoverflow.com/questions/9147401/get-statistics-for-a-list-of-numbers-using-gpu