Suppose I have
I've found this solution to your question. I used a mix of cuda kernels and thrust primitives. I suppose that your operator has the commutative property on the keys arguments, that is
fun(key1, key2, id) == fun(key2, key1, id)
My solution produce three output arrays (keys1, keys2 and ids) in two steps. In the first step it computes only the number of elements in the output arrays and in the second step it fill the arrays. Basically the algorithm runs two times: in the first time it "simulates" the writing and the second time it actually writes the output. This is a common pattern that I use when the output size depends on the input data.
Here are the steps:
Here is the output of my algorithm:
d_keys: 1 2 3 1 1 2 2 1 1 1
d_segs: 0 0 0 1 2 2 2 3 3 3
d_cpairs: 2 1 0 0 1 0 0 0 0 0
num_pairs: 4
d_addrs: 0 2 3 3 3 4 4 4 4 4
keys1: 1 1 2 1
keys2: 2 3 3 2
ids: 0 0 0 2
Note that the last column in the output is not exactly like you want but if the commutative property hold it's fine.
Here is my complete code. Probably this is not the fastest solution but I think it's simple.
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#define SHOW_VECTOR(V,size) std::cout << #V << ": "; for(int i =0 ; i < size; i++ ){ std::cout << V[i] << " "; } std::cout << std::endl;
#define RAW_CAST(V) thrust::raw_pointer_cast(V.data())
__global__
void count_kernel(const int *d_keys,const int *d_segs,int *d_cpairs, int siz){
int tidx = threadIdx.x+ blockIdx.x*blockDim.x;
if(tidx < siz){
int sum = 0;
int i = tidx+1;
while(d_segs[i] == d_segs[tidx]){
if(d_keys[i] != d_keys[tidx] &&
d_keys[i] != d_keys[i-1]){
sum++;
}
i++;
}
d_cpairs[tidx] = sum;
}
}
__global__
void scatter_kernel(const int *d_keys,
const int *d_segs,
const int *d_addrs,
int *d_keys1,
int *d_keys2,
int *d_ids,
int siz){
int tidx = threadIdx.x+ blockIdx.x*blockDim.x;
if(tidx < siz){
int base_address = d_addrs[tidx];
int j =0;
int i = tidx+1;
while(d_segs[i] == d_segs[tidx]){
if(d_keys[i] != d_keys[tidx] &&
d_keys[i] != d_keys[i-1]){
d_keys1[base_address+j] = d_keys[tidx];
d_keys2[base_address+j] = d_keys[i];
d_ids[base_address+j] = d_segs[i];
j++;
}
i++;
}
}
}
int main(){
int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int idsArray[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);
thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
thrust::host_vector<int> h_segs(idsArray, idsArray+sz1);
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_segs = h_segs;
thrust::device_vector<int> d_cpairs(sz1);
thrust::device_vector<int> d_addrs(sz1);
//sort each segment to group like keys together
thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());
SHOW_VECTOR(d_keys,sz1);
SHOW_VECTOR(d_segs,sz1);
//count the number of pairs produced by each key
count_kernel<<<1,sz1>>>(RAW_CAST(d_keys),RAW_CAST(d_segs),RAW_CAST(d_cpairs),sz1);
SHOW_VECTOR(d_cpairs,sz1);
//determine the total number of pairs
int num_pairs = thrust::reduce(d_cpairs.begin(),d_cpairs.end());
std::cout << "num_pairs: " << num_pairs << std::endl;
//compute the addresses
thrust::exclusive_scan(d_cpairs.begin(),d_cpairs.end(),d_addrs.begin());
thrust::device_vector<int> keys1(num_pairs);
thrust::device_vector<int> keys2(num_pairs);
thrust::device_vector<int> ids(num_pairs);
SHOW_VECTOR(d_addrs,sz1);
//fill the vector with the keys and ids
scatter_kernel<<<1,sz1>>>(RAW_CAST(d_keys),
RAW_CAST(d_segs),
RAW_CAST(d_addrs),
RAW_CAST(keys1),
RAW_CAST(keys2),
RAW_CAST(ids),
sz1);
SHOW_VECTOR(keys1,num_pairs);
SHOW_VECTOR(keys2,num_pairs);
SHOW_VECTOR(ids,num_pairs);
return 0;
}
This turned out to be more involved than I initially thought. My interpretation is that you essentially want to compute all combinations of N things taken two at a time, without repetition, per segment (N is different for each segment). I'll present an answer that I think covers most of the concepts you might want to consider. It's just one possible solution method, of course.
This may not arrive at precisely the solution you want (although I think it's pretty close). My purpose is not to give you a finished code, but to demonstrate a possible algorithmic approach.
Here's a walkthrough of the algorithm:
Sort the keys by segment. This step isn't necessary if you can guarantee that like keys (within a segment) are grouped together. (Your data as presented wouldn't require this step.) We can use back-to-back stable_sort_by_key
to accomplish this. Output:
d_keys:
1,2,3,1,1,2,2,1,1,1,
d_segs:
0,0,0,1,2,2,2,3,3,3,
Reduce the data set to just the unique keys, per segment (remove duplicated keys.) We can do this on the keys and segments together with thrust::unique_copy
, and an appropriate functor to define equality of keys within a segment only. Output:
d_ukeys:
1,2,3,1,1,2,1,
d_usegs:
0,0,0,1,2,2,3,
Compute the length of each segment now that duplicate keys are removed. We can do this with thrust::reduce_by_key
and a constant_iterator
. Output:
d_seg_lens:
3,1,2,1,
Now that we know each segment length, we want to remove any segment (plus it's associate keys) for which the segment length is 1. These segments don't have any key pairs possible. To facilitate this, we also need to compute the starting index of each segment and create a stencil that identifies the segments whose length is one, then we can use remove_if
and scatter_if
to remove the associated segment and key data. Output:
d_seg_idxs:
0,3,4,6,
d_stencil:
0,0,0,1,0,0,1,
and the reduced data:
d_ukeys:
1,2,3,1,2,
d_usegs:
0,0,0,2,2,
d_seg_lens:
3,2,
Our goal at this point will be to create a vector of appropriate length so that it can hold one entry for each combination of N things taken two at a time, where N is the length of each segment. So for the first segment above, it has 3 items, so the combination of 3 things taken 2 at a time requires storage for 3 combinations total. Likewise the second segment above of length 2 has only one unique combination between the 2 keys, therefore storage is needed for only one combination for that segment. We can compute this storage length using the standard combinations formula reduced to combinations of N things taken 2 at a time:
C = (N)*(N-1)/2
we'll use this formula, passed to thrust::transform_reduce
along with our segment lengths, to compute the overall storage length needed. (4 combinations total, for this example).
Once we have the overall length determined, and the necessary vectors of that length allocated, we need to generate the actual combinations that belong to each segment. This again is a multi-step sequence that begins with generating flags to mark (delineate) each "segment" within these vectors. With the flag array, we can use exclusive_scan_by_key
to generate an ordinal sequence identifying the combination that belongs in each position, per segment. Output:
d_flags:
1,0,0,1,
example ordinal sequence:
0,1,2,0
With the ordinal sequence per segment, we now need to generate the actual unique combinations, per segment. This step required some consideration to try and come up with an algorithm to accomplish this in a fixed time (ie. without iteration). The algorithm I came up with is to map each ordinal sequence to a matrix, so for a segment with 4 combinations (which would have 4 things taken 2 at a time, so 6 total combinations, larger than any in this example):
1 2 3
0 C1 C2 C3
1 C4 C5 C6
2
We then "re-map" any combinations (Cx
above) that are below the main diagonal to alternate locations in the matrix, like this:
1 2 3
0 C1 C2 C3
1 C5 C6
2 C4
we can then read off unique combination pairs as the row and column indices of the above specially-crafted matrix. (effectively a linear-to-triangular mapping) So C1 corresponds to the combination of logical key 0 and logical key 1. C6 corresponds to the combination of logical key 1 and logical key 3. This matrix assembly and mapping to row and column "indices" to produce unique combinations is handled by the comb_n
functor passed to thrust::for_each_n
, which is also receiving the segment lengths and the input segment ordinal sequences, and is generating "logical" keys 1 and 2 as output:
d_lkey1:
1,2,2,1,
d_lkey2:
0,0,1,0,
(added note: I believe the approach I am describing here is comparable to what is discussed in this question/answer, which I stumbled upon recently, although they appear to be different at first glance.)
The last transformation step is to convert our logical keys and segments into "actual" keys and segments. Output:
d_key1:
2,3,3,2,
d_key2:
1,1,2,1,
d_aseg:
0,0,0,2,
Here's a complete code that implements the above. It may not do precisely what you want, but I think it's pretty close to what you described, and hopefully will be useful for ideas if you want to do this with thrust:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/for_each.h>
#include <thrust/remove.h>
#include <iostream>
#define MY_DISPLAY
typedef float DataType;
// user supplied functor
struct fun
{
DataType *dptr;
fun(DataType *_dptr) : dptr(_dptr) {};
template <typename T>
__host__ __device__
void operator()(const T d1)
{
int key1 = thrust::get<0>(d1);
int key2 = thrust::get<1>(d1);
int id = thrust::get<2>(d1);
DataType my_val = dptr[key1]*100+dptr[key2]*10+id;
printf("result: %f\n", my_val);
}
};
// for determining "equality" of keys
struct my_unique_eq
{
template <typename T>
__host__ __device__
bool operator()(const T d1, const T d2) const
{
return ((thrust::get<0>(d1) == thrust::get<0>(d2))&&(thrust::get<1>(d1) == thrust::get<1>(d2)));
}
};
// BinaryPredicate for the head flag segment representation
// equivalent to thrust::not2(thrust::project2nd<int,int>()));
// from: https://github.com/thrust/thrust/blob/master/examples/scan_by_key.cu
template <typename HeadFlagType>
struct head_flag_predicate
: public thrust::binary_function<HeadFlagType,HeadFlagType,bool>
{
__host__ __device__
bool operator()(HeadFlagType left, HeadFlagType right) const
{
return !right;
}
};
// find nth combination of c things taken 2 at a time
struct comb_n
{
template <typename T>
__host__ __device__
void operator()(const T &d1)
{ // item c from n choose 2
int c = thrust::get<0>(d1);
int n = thrust::get<1>(d1);
int row = c/(n-1);
int col = c%(n-1);
if (col < row) {
col = (n-2)-col;
row = (n-1)-row;
}
thrust::get<2>(d1) = col+1; // lkey1
thrust::get<3>(d1) = row; // lkey2
}
};
using namespace thrust::placeholders;
typedef thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator > mip;
typedef thrust::zip_iterator< thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> > mzi;
int main(){
// set up sample data
DataType dataArray[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int ids[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
int sz0 = sizeof(dataArray)/sizeof(dataArray[0]);
int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);
thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
thrust::host_vector<int> h_segs(ids, ids+sz1);
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_segs = h_segs;
thrust::host_vector<DataType> h_data(dataArray, dataArray+sz0);
thrust::device_vector<DataType> d_data = h_data;
//sort each segment to group like keys together
thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());
#ifdef MY_DISPLAY
std::cout << "d_keys:" << std::endl;
thrust::copy(d_keys.begin(), d_keys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_segs:" << std::endl;
thrust::copy(d_segs.begin(), d_segs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
//now reduce the data set to just unique keys
thrust::device_vector<int> d_ukeys(sz1);
thrust::device_vector<int> d_usegs(sz1);
mzi ip1 = thrust::unique_copy(thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_segs.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_keys.end(), d_segs.end())), thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin())), my_unique_eq());
int sz2 = ip1 - thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin()));
thrust::device_vector<int> d_seg_lens(sz2);
d_ukeys.resize(sz2);
d_usegs.resize(sz2);
#ifdef MY_DISPLAY
std::cout << "d_ukeys:" << std::endl;
thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_usegs:" << std::endl;
thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
thrust::device_vector<int> d_seg_nums(sz2);
// compute the length of each segment
mip ip2 = thrust::reduce_by_key(d_usegs.begin(), d_usegs.end(), thrust::make_constant_iterator(1), d_seg_nums.begin(), d_seg_lens.begin());
int sz3 = thrust::get<1>(ip2) - d_seg_lens.begin();
d_seg_nums.resize(sz3);
d_seg_lens.resize(sz3);
thrust::device_vector<int> d_seg_idxs(sz3);
// remove segments that have only one unique key
// compute start index of each segment
thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_seg_idxs.begin());
#ifdef MY_DISPLAY
std::cout << "d_seg_lens:" << std::endl;
thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
std::cout << "d_seg_idxs:" << std::endl;
thrust::copy(d_seg_idxs.begin(), d_seg_idxs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// remove if the length of segment is 1
thrust::device_vector<int> d_stencil(d_usegs.size());
thrust::scatter_if(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+d_seg_lens.size(), d_seg_idxs.begin(), d_seg_lens.begin(), d_stencil.begin(), (_1 == 1));
#ifdef MY_DISPLAY
std::cout << "d_stencil:" << std::endl;
thrust::copy(d_stencil.begin(), d_stencil.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
thrust::remove_if(d_usegs.begin(), d_usegs.end(), d_stencil.begin(), (_1 == 1));
thrust::remove_if(d_ukeys.begin(), d_ukeys.end(), d_stencil.begin(), (_1 == 1));
int removals = thrust::remove_if(d_seg_lens.begin(), d_seg_lens.end(),(_1 == 1)) - d_seg_lens.begin();
d_usegs.resize(d_usegs.size()-removals);
d_ukeys.resize(d_ukeys.size()-removals);
d_seg_lens.resize(d_seg_lens.size()-removals);
// compute storage length needed (sum of combinations in each segment)
int sz4 = thrust::transform_reduce(d_seg_lens.begin(), d_seg_lens.end(), ((_1)*(_1 - 1))/2, 0, thrust::plus<int>());
#ifdef MY_DISPLAY
std::cout << "d_ukeys:" << std::endl;
thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_usegs:" << std::endl;
thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
std::cout << "d_seg_lens:" << std::endl;
thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// now create intra-segment index sequence
// first create flag array to mark segments
thrust::device_vector<int> d_flags(sz4);
// compute flag indexes
thrust::device_vector<int> d_flag_idxs(sz3);
thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_flag_idxs.begin());
// scatter flags
thrust::scatter(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+sz3, d_flag_idxs.begin(), d_flags.begin());
#ifdef MY_DISPLAY
std::cout << "d_flags:" << std::endl;
thrust::copy(d_flags.begin(), d_flags.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// use flag array to create intra-segment index sequence
thrust::device_vector<int> d_seg_i_idxs(d_flags.size());
thrust::exclusive_scan_by_key(d_flags.begin(), d_flags.end(), thrust::make_constant_iterator(1), d_seg_i_idxs.begin(), 0, head_flag_predicate<int>());
// convert flags to segment references
thrust::inclusive_scan(d_flags.begin(), d_flags.end(), d_flags.begin());
thrust::transform(d_flags.begin(), d_flags.end(), d_flags.begin(), (_1 - 1));
// for_each - run functor that uses intra-segment index sequence plus segment
// index to create logical combinations
// input required:
// -- logical segment number
// -- intra-segment index
// -- segment size
// output:
// -- logical key1
// -- logical key2
thrust::device_vector<int> d_lkey1(sz4);
thrust::device_vector<int> d_lkey2(sz4);
thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_seg_i_idxs.begin(), thrust::make_permutation_iterator(d_seg_lens.begin(), d_flags.begin()), d_lkey1.begin(), d_lkey2.begin())), sz4, comb_n());
#ifdef MY_DISPLAY
std::cout << "d_lkey1:" << std::endl;
thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_lkey2:" << std::endl;
thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// convert logical keys and logical segments to actual keys/segments
thrust::device_vector<int> d_key1(sz4);
thrust::device_vector<int> d_key2(sz4);
thrust::device_vector<int> d_aseg(sz4);
thrust::copy_n(thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), sz4, d_aseg.begin());
thrust::transform(d_lkey1.begin(), d_lkey1.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey1.begin(), thrust::plus<int>());
thrust::transform(d_lkey2.begin(), d_lkey2.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey2.begin(), thrust::plus<int>());
thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey1.begin()), sz4, d_key1.begin());
thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey2.begin()), sz4, d_key2.begin());
#ifdef MY_DISPLAY
std::cout << "d_lkey1:" << std::endl;
thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_lkey2:" << std::endl;
thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
#ifdef MY_DISPLAY
std::cout << "d_key1:" << std::endl;
thrust::copy(d_key1.begin(), d_key1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_key2:" << std::endl;
thrust::copy(d_key2.begin(), d_key2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// d_flags has already been converted to logical segment sequence
// just need to convert these via lookup to the actual segments
thrust::unique(d_usegs.begin(), d_usegs.end());
thrust::copy_n(thrust::make_permutation_iterator(d_usegs.begin(), d_flags.begin()), sz4, d_aseg.begin());
#ifdef MY_DISPLAY
std::cout << "d_aseg:" << std::endl;
thrust::copy(d_aseg.begin(), d_aseg.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// run desired user function
thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_key1.begin(), d_key2.begin(), d_aseg.begin())), sz4, fun(thrust::raw_pointer_cast(d_data.data())));
std::cout << "Finished." << std::endl;
return 0;
}