Replicate a vector multiple times using CUDA Thrust

前端 未结 3 1374
广开言路
广开言路 2020-12-17 05:36

I am trying to solve a problem using CUDA Thrust.

I have a host array with 3 elements. Is it possible, using Thrust, to create a device array of 3

3条回答
  •  隐瞒了意图╮
    2020-12-17 05:51

    Robert Crovella has already answered this question using strided ranges. He has also pointed out the possibility of using the expand operator.

    Below, I'm providing a worked example using the expand operator. Unlike the strided-range approach, it avoids the need for explicit for loops.

    #include <cstdio>
    #include <iostream>

    #include <thrust/advance.h>
    #include <thrust/device_vector.h>
    #include <thrust/distance.h>
    #include <thrust/functional.h>
    #include <thrust/gather.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/iterator_traits.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/reduce.h>
    #include <thrust/scan.h>
    #include <thrust/scatter.h>
    #include <thrust/sequence.h>
    
    using namespace thrust::placeholders;
    
    /*************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX */
    /*************************************/
    /**
     * Unary functor that divides a linear index by a fixed column count,
     * i.e. returns the row an element falls in when each row holds Ncols
     * consecutive elements.
     *
     * @tparam T  Index type; used for both the argument and the result.
     *
     * NOTE: the division is evaluated as-is, so Ncols must be non-zero.
     */
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T, T> {

        T Ncols; // --- Number of columns

        /** @param Ncols  Number of columns per row (the divisor). */
        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

        /**
         * @param i  Linear index.
         * @return   i / Ncols.
         *
         * const-qualified because the call does not modify the functor, which
         * also makes it callable on const instances.
         */
        __host__ __device__ T operator()(T i) const { return i / Ncols; }
    };
    
    /*******************/
    /* EXPAND OPERATOR */
    /*******************/
    template 
    OutputIterator expand(InputIterator1 first1,
                          InputIterator1 last1,
                          InputIterator2 first2,
                          OutputIterator output)
    {
        typedef typename thrust::iterator_difference::type difference_type;
    
        difference_type input_size  = thrust::distance(first1, last1);
        difference_type output_size = thrust::reduce(first1, last1);
    
        // scan the counts to obtain output offsets for each input element
        thrust::device_vector output_offsets(input_size, 0);
        thrust::exclusive_scan(first1, last1, output_offsets.begin()); 
    
        // scatter the nonzero counts into their corresponding output positions
        thrust::device_vector output_indices(output_size, 0);
        thrust::scatter_if(thrust::counting_iterator(0), thrust::counting_iterator(input_size),
                           output_offsets.begin(), first1, output_indices.begin());
    
        // compute max-scan over the output indices, filling in the holes
        thrust::inclusive_scan(output_indices.begin(), output_indices.end(), output_indices.begin(), thrust::maximum());
    
        // gather input values according to index array (output = first2[output_indices])
        OutputIterator output_end = output; thrust::advance(output_end, output_size);
        thrust::gather(output_indices.begin(), output_indices.end(), first2, output);
    
        // return output + output_size
        thrust::advance(output, output_size);
    
        return output;
    }
    
    /**************************/
    /* STRIDED RANGE OPERATOR */
    /**************************/
    template 
    class strided_range
    {
        public:
    
        typedef typename thrust::iterator_difference::type difference_type;
    
        struct stride_functor : public thrust::unary_function
        {
            difference_type stride;
    
            stride_functor(difference_type stride)
                : stride(stride) {}
    
            __host__ __device__
            difference_type operator()(const difference_type& i) const
            {
                return stride * i;
            }
        };
    
        typedef typename thrust::counting_iterator                   CountingIterator;
        typedef typename thrust::transform_iterator TransformIterator;
        typedef typename thrust::permutation_iterator     PermutationIterator;
    
        // type of the strided_range iterator
        typedef PermutationIterator iterator;
    
        // construct strided_range for the range [first,last)
        strided_range(Iterator first, Iterator last, difference_type stride)
            : first(first), last(last), stride(stride) {}
    
        iterator begin(void) const
        {
            return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
        }
    
        iterator end(void) const
        {
            return begin() + ((last - first) + (stride - 1)) / stride;
        }
    
        protected:
        Iterator first;
        Iterator last;
        difference_type stride;
    };
    
    /********/
    /* MAIN */
    /********/
    int main(){
    
        /**************************/
        /* SETTING UP THE PROBLEM */
        /**************************/
    
        const int Nrows = 10;           // --- Number of objects
        const int Ncols =  3;           // --- Number of centroids  
    
        thrust::device_vector d_sequence(Nrows * Ncols);
        thrust::device_vector d_counts(Ncols, Nrows);
        thrust::sequence(d_sequence.begin(), d_sequence.begin() + Ncols);
        expand(d_counts.begin(), d_counts.end(), d_sequence.begin(), 
            thrust::make_permutation_iterator(
                                    d_sequence.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)));
    
        printf("\n\nCentroid indices\n");
        for(int i = 0; i < Nrows; i++) {
            std::cout << " [ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_sequence[i * Ncols + j] << " ";
            std::cout << "]\n";
        }
    
        return 0;
    }
    

提交回复
热议问题