I was wondering if it is possible to sort by keys using Thrust Library without the need of creating a Vector to store the keys (on the fly). For example I have the following two
The original thrust example you linked performed a row-sum on an underlying data set that had row-major storage. Your question is essentially how to do the same thing when the underlying storage is column-major.
We can use essentially the same method, but we must use permutation iterators to convert the underlying column-major storage to row-major storage "on the fly".
For this, we can borrow the functor I described here.
Here is a fully worked example:
$ cat t466.cu
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#define COLS 3
#define ROWS 3
#define DSIZE (COLS*ROWS)
#define INIT 10
#define STEP 10
// convert a linear index to a row index
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
T C; // number of columns
__host__ __device__
linear_index_to_row_index(T C) : C(C) {}
__host__ __device__
T operator()(T i)
{
return i % C;
}
};
struct rm2cm_idx_functor : public thrust::unary_function<int, int>
{
int r;
int c;
rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {};
__host__ __device__
int operator() (int idx) {
unsigned my_r = idx/c;
unsigned my_c = idx%c;
return (my_c * r) + my_r;
}
};
int main(void)
{
int C = COLS; // number of columns
int R = ROWS; // number of rows
thrust::host_vector<int> h_vals(DSIZE);
// initialize data
thrust::sequence(h_vals.begin(), h_vals.end(), INIT, STEP);
thrust::device_vector<int> vals = h_vals;
std::cout << " Initial data: " << std::endl;
thrust::copy(h_vals.begin(), h_vals.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
// allocate storage for row sums and indices
thrust::device_vector<int> row_sums(R);
thrust::device_vector<int> row_indices(R);
// compute row sums by summing values with equal row indices
thrust::reduce_by_key
(thrust::make_permutation_iterator(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(R)), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(R, C))),
thrust::make_permutation_iterator(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(R)) + (R*C), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(R, C)) + (R*C)),
thrust::make_permutation_iterator(vals.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(R, C))),
row_indices.begin(),
row_sums.begin(),
thrust::equal_to<int>(),
thrust::plus<int>());
// print data
thrust::host_vector<int> h_row_sums = row_sums;
std::cout << " Results: " << std::endl;
thrust::copy(h_row_sums.begin(), h_row_sums.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -arch=sm_20 -o t466 t466.cu
$ ./t466
Initial data:
10,20,30,40,50,60,70,80,90,
Results:
120,150,180,
$
Note that I have also changed the linear_index_to_row_index
functor to give me a row index suitably organized for underlying column-major storage (the previous functor returned the index when the underlying storage was assumed to be row-major). This only involved changing the division operation to a modulo operation and pass R
instead of C
to initialize the functor, so note the subtle difference.