Replicate a vector multiple times using CUDA Thrust

广开言路 2020-12-17 05:36

I am trying to solve a problem using CUDA Thrust.

I have a host array with 3 elements. Is it possible, using Thrust, to create a device array of 384 elements in which the 3 elements of my host array are repeated 128 times (128 × 3 = 384)?

3 Answers
  • 2020-12-17 05:50

    One possible approach:

    1. create a device vector of appropriate size
    2. create 3 strided ranges, one for each of the element positions {1, 2, 3} in the final output (device) vector
    3. use thrust::fill to fill each of the 3 strided ranges with the appropriate (host vector) element {1, 2, 3}

    This code is a trivial modification of the Thrust strided range example to demonstrate this approach. You can change the REPS define to 128 to see the full expansion to 384 output elements:

    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/functional.h>
    
    #include <thrust/fill.h>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    
    // for printing
    #include <thrust/copy.h>
    #include <iostream>
    #include <iterator>
    
    
    #define STRIDE 3
    #define REPS  15  // change to 128 if you like
    #define DSIZE (STRIDE*REPS)
    
    // this example illustrates how to make strided access to a range of values
    // examples:
    //   strided_range([0, 1, 2, 3, 4, 5, 6], 1) -> [0, 1, 2, 3, 4, 5, 6]
    //   strided_range([0, 1, 2, 3, 4, 5, 6], 2) -> [0, 2, 4, 6]
    //   strided_range([0, 1, 2, 3, 4, 5, 6], 3) -> [0, 3, 6]
    //   ...
    
    template <typename Iterator>
    class strided_range
    {
        public:
    
        typedef typename thrust::iterator_difference<Iterator>::type difference_type;
    
        struct stride_functor : public thrust::unary_function<difference_type,difference_type>
        {
            difference_type stride;
    
            stride_functor(difference_type stride)
                : stride(stride) {}
    
            __host__ __device__
            difference_type operator()(const difference_type& i) const
            {
                return stride * i;
            }
        };
    
        typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
        typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
        typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
    
        // type of the strided_range iterator
        typedef PermutationIterator iterator;
    
        // construct strided_range for the range [first,last)
        strided_range(Iterator first, Iterator last, difference_type stride)
            : first(first), last(last), stride(stride) {}
    
        iterator begin(void) const
        {
            return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
        }
    
        iterator end(void) const
        {
            return begin() + ((last - first) + (stride - 1)) / stride;
        }
    
        protected:
        Iterator first;
        Iterator last;
        difference_type stride;
    };
    
    int main(void)
    {
        thrust::host_vector<int> h_data(STRIDE);
        h_data[0] = 1;
        h_data[1] = 2;
        h_data[2] = 3;
    
        thrust::device_vector<int> data(DSIZE);
    
        typedef thrust::device_vector<int>::iterator Iterator;
        strided_range<Iterator> pos1(data.begin(), data.end(), STRIDE);
        strided_range<Iterator> pos2(data.begin()+1, data.end(), STRIDE);
        strided_range<Iterator> pos3(data.begin()+2, data.end(), STRIDE);
    
        thrust::fill(pos1.begin(), pos1.end(), h_data[0]);
        thrust::fill(pos2.begin(), pos2.end(), h_data[1]);
        thrust::fill(pos3.begin(), pos3.end(), h_data[2]);
    
    
        // print the generated data
        std::cout << "data: ";
        thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
    
        return 0;
    }
    
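    For completeness, the same replication can also be written without any fill calls by reading the source vector through a permutation iterator whose index map is i % 3, so every output position i pulls src[i % 3]. The following is only a minimal sketch of that idea; the functor name mod_stride and the small sizes are illustrative choices, not part of the original answer:

    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/functional.h>
    #include <thrust/copy.h>
    #include <iostream>
    #include <iterator>

    // maps an output index onto a source index: 0, 1, 2, 0, 1, 2, ...
    struct mod_stride : public thrust::unary_function<int,int>
    {
        int stride;

        __host__ __device__ mod_stride(int stride) : stride(stride) {}

        __host__ __device__ int operator()(int i) const { return i % stride; }
    };

    int main(void)
    {
        thrust::device_vector<int> src(3);
        src[0] = 1; src[1] = 2; src[2] = 3;

        const int reps = 5;                       // number of replications
        thrust::device_vector<int> out(3 * reps);

        // gather src[i % 3] for every output position i
        thrust::copy_n(thrust::make_permutation_iterator(
                           src.begin(),
                           thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                                                           mod_stride(3))),
                       out.size(), out.begin());

        thrust::copy(out.begin(), out.end(), std::ostream_iterator<int>(std::cout, " "));
        std::cout << std::endl;                   // prints: 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3
        return 0;
    }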
  • 2020-12-17 05:51

    Robert Crovella has already answered this question using strided ranges. He has also pointed out the possibility of using the expand operator.

    Below is a worked example using the expand operator. Unlike the strided-range approach, it avoids the need for a separate fill (or an explicit loop) per element position.

    #include <thrust/device_vector.h>
    #include <thrust/gather.h>
    #include <thrust/scatter.h>
    #include <thrust/scan.h>
    #include <thrust/reduce.h>
    #include <thrust/sequence.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <cstdio>
    #include <iostream>
    
    using namespace thrust::placeholders;
    
    /*************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX */
    /*************************************/
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T,T> {
    
        T Ncols; // --- Number of columns
    
        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
    
        __host__ __device__ T operator()(T i) { return i / Ncols; }
    };
    
    /*******************/
    /* EXPAND OPERATOR */
    /*******************/
    template <typename InputIterator1, typename InputIterator2, typename OutputIterator>
    OutputIterator expand(InputIterator1 first1,
                          InputIterator1 last1,
                          InputIterator2 first2,
                          OutputIterator output)
    {
        typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
    
        difference_type input_size  = thrust::distance(first1, last1);
        difference_type output_size = thrust::reduce(first1, last1);
    
        // scan the counts to obtain output offsets for each input element
        thrust::device_vector<difference_type> output_offsets(input_size, 0);
        thrust::exclusive_scan(first1, last1, output_offsets.begin()); 
    
        // scatter the nonzero counts into their corresponding output positions
        thrust::device_vector<difference_type> output_indices(output_size, 0);
        thrust::scatter_if(thrust::counting_iterator<difference_type>(0), thrust::counting_iterator<difference_type>(input_size),
                           output_offsets.begin(), first1, output_indices.begin());
    
        // compute max-scan over the output indices, filling in the holes
        thrust::inclusive_scan(output_indices.begin(), output_indices.end(), output_indices.begin(), thrust::maximum<difference_type>());
    
        // gather input values according to index array (output = first2[output_indices])
        OutputIterator output_end = output; thrust::advance(output_end, output_size);
        thrust::gather(output_indices.begin(), output_indices.end(), first2, output);
    
        // return output + output_size
        thrust::advance(output, output_size);
    
        return output;
    }
    
    /**************************/
    /* STRIDED RANGE OPERATOR */
    /**************************/
    template <typename Iterator>
    class strided_range
    {
        public:
    
        typedef typename thrust::iterator_difference<Iterator>::type difference_type;
    
        struct stride_functor : public thrust::unary_function<difference_type,difference_type>
        {
            difference_type stride;
    
            stride_functor(difference_type stride)
                : stride(stride) {}
    
            __host__ __device__
            difference_type operator()(const difference_type& i) const
            {
                return stride * i;
            }
        };
    
        typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
        typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
        typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
    
        // type of the strided_range iterator
        typedef PermutationIterator iterator;
    
        // construct strided_range for the range [first,last)
        strided_range(Iterator first, Iterator last, difference_type stride)
            : first(first), last(last), stride(stride) {}
    
        iterator begin(void) const
        {
            return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
        }
    
        iterator end(void) const
        {
            return begin() + ((last - first) + (stride - 1)) / stride;
        }
    
        protected:
        Iterator first;
        Iterator last;
        difference_type stride;
    };
    
    /********/
    /* MAIN */
    /********/
    int main(){
    
        /**************************/
        /* SETTING UP THE PROBLEM */
        /**************************/
    
        const int Nrows = 10;           // --- Number of objects
        const int Ncols =  3;           // --- Number of centroids  
    
        thrust::device_vector<int> d_sequence(Nrows * Ncols);
        thrust::device_vector<int> d_counts(Ncols, Nrows);
        thrust::sequence(d_sequence.begin(), d_sequence.begin() + Ncols);
        expand(d_counts.begin(), d_counts.end(), d_sequence.begin(), 
            thrust::make_permutation_iterator(
                                    d_sequence.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)));
    
        printf("\n\nCentroid indices\n");
        for(int i = 0; i < Nrows; i++) {
            std::cout << " [ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_sequence[i * Ncols + j] << " ";
            std::cout << "]\n";
        }
    
        return 0;
    }
    
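    As a minimal usage sketch (assuming the expand template defined above is in scope; the names counts, values and out are illustrative only), replicating each of three values three consecutive times looks like this:

    // counts = {3, 3, 3}: each value is repeated 3 times consecutively
    thrust::device_vector<int> counts(3, 3);
    thrust::device_vector<int> values(3);
    values[0] = 10; values[1] = 20; values[2] = 30;

    thrust::device_vector<int> out(9);
    expand(counts.begin(), counts.end(), values.begin(), out.begin());
    // out now holds: 10 10 10 20 20 20 30 30 30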
  • 2020-12-17 05:56

    As an apparently simpler alternative to CUDA Thrust, I'm posting below a worked example that implements Matlab's classic meshgrid function in plain CUDA.

    In Matlab

    x = [1 2 3];
    y = [4 5 6 7];
    [X, Y] = meshgrid(x, y);
    

    produces

    X =
    
         1     2     3
         1     2     3
         1     2     3
         1     2     3
    

    and

    Y =
    
         4     4     4
         5     5     5
         6     6     6
         7     7     7
    

    X is exactly the four-fold replication of the x array, which is what the OP asks for (and the first case considered in Robert Crovella's answer), while Y is the three-fold consecutive replication of each element of the y array (the second case considered in Robert Crovella's answer).

    Here is the code:

    #include <cstdio>
    
    #include <thrust/pair.h>
    
    #include "Utilities.cuh"
    
    #define BLOCKSIZE_MESHGRID_X    16
    #define BLOCKSIZE_MESHGRID_Y    16
    
    #define DEBUG
    
    /*******************/
    /* MESHGRID KERNEL */
    /*******************/
    template <class T>
    __global__ void meshgrid_kernel(const T * __restrict__ x, size_t Nx, const T * __restrict__ y, size_t Ny, T * __restrict__ X, T * __restrict__ Y)
    {
        unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int tidy = blockIdx.y * blockDim.y + threadIdx.y;
    
        if ((tidx < Nx) && (tidy < Ny)) {   
            X[tidy * Nx + tidx] = x[tidx];
            Y[tidy * Nx + tidx] = y[tidy];
        }
    }
    
    /************/
    /* MESHGRID */
    /************/
    template <class T>
    thrust::pair<T *,T *> meshgrid(const T *x, const unsigned int Nx, const T *y, const unsigned int Ny) {
    
        T *X; gpuErrchk(cudaMalloc((void**)&X, Nx * Ny * sizeof(T)));
        T *Y; gpuErrchk(cudaMalloc((void**)&Y, Nx * Ny * sizeof(T)));
    
        dim3 BlockSize(BLOCKSIZE_MESHGRID_X, BLOCKSIZE_MESHGRID_Y);
        dim3 GridSize (iDivUp(Nx, BLOCKSIZE_MESHGRID_X), iDivUp(Ny, BLOCKSIZE_MESHGRID_Y));
    
        meshgrid_kernel<<<GridSize, BlockSize>>>(x, Nx, y, Ny, X, Y);
    #ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    #endif
    
        return thrust::make_pair(X, Y);
    }
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int Nx = 3;
        const int Ny = 4;
    
        float *h_x = (float *)malloc(Nx * sizeof(float));
        float *h_y = (float *)malloc(Ny * sizeof(float));
    
        float *h_X = (float *)malloc(Nx * Ny * sizeof(float));
        float *h_Y = (float *)malloc(Nx * Ny * sizeof(float));
    
        for (int i = 0; i < Nx; i++) h_x[i] = i;
        for (int i = 0; i < Ny; i++) h_y[i] = i + 4.f;
    
        float *d_x; gpuErrchk(cudaMalloc(&d_x, Nx * sizeof(float)));
        float *d_y; gpuErrchk(cudaMalloc(&d_y, Ny * sizeof(float)));
    
        gpuErrchk(cudaMemcpy(d_x, h_x, Nx * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_y, h_y, Ny * sizeof(float), cudaMemcpyHostToDevice));
    
        thrust::pair<float *, float *> meshgrid_pointers = meshgrid(d_x, Nx, d_y, Ny);
        float *d_X = (float *)meshgrid_pointers.first;
        float *d_Y = (float *)meshgrid_pointers.second;
    
        gpuErrchk(cudaMemcpy(h_X, d_X, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_Y, d_Y, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
    
        for (int j = 0; j < Ny; j++) {
            for (int i = 0; i < Nx; i++) {
                printf("i = %i; j = %i; x = %f; y = %f\n", i, j, h_X[j * Nx + i], h_Y[j * Nx + i]);
            }
        }
    
        return 0;
    
    }
    
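    The code relies on gpuErrchk and iDivUp from the author's Utilities.cuh, which is not shown here. A minimal stand-in with the same behaviour (my own sketch, not the original file) could look like:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // round-up integer division, used to size the launch grid
    inline int iDivUp(int a, int b) { return (a + b - 1) / b; }

    // wrap CUDA API calls and abort with a readable message on failure
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
    {
        if (code != cudaSuccess) {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }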