问题
I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule
where x_k = a + k*h
.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is: it works wrong when n
is lower than 100000. For an integral from 0
to 10
, the result is ~99
, but when n = 100000
or larger it works fine and the result is ~50
.
What's wrong, guys?
回答1:
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod()
function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
回答2:
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
- Generate the Simpson's quadrature weights;
- Generate the function sampling points;
- Generate the function values;
- Calculate the elementwise product between the quadrature weights and the function values;
- Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1
for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform
.
Steps #4 and #5 can be accomplished together by thrust::inner_product
.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}
来源:https://stackoverflow.com/questions/16134188/simpsons-method-to-integrate-real-valued-functions-with-cuda