I have implemented a 2D median filter in CUDA and the whole program is shown below.
#include \"cuda_runtime.h\"
#include \"cuda_runtime_api.h\"
#include \"de
It seems you share nothing between threads using shared memory, i.e. for 3x3 filter, you read each pixel 9 times from the global memory, which is not necessary. This white paper may provide some ideas on how to using shared memory in a convolution kernel. Hope it help.
http://docs.nvidia.com/cuda/samples/3_Imaging/convolutionSeparable/doc/convolutionSeparable.pdf
I'm answering your last question on the use of shared memory.
As already noticed by Eric, your use of shared memory does not really lead to thread collaboration.
I'm comparing your solution, for the 3x3
case, with a variant of your kernel not using shared memory at all as well as with the Accelereyes solution discussed in 2D median filtering in CUDA: how to efficiently copy global memory to shared memory.
Here is the complete code:
#include <iostream>
#include <fstream>
using namespace std;
#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
const int tx_l = threadIdx.x; // --- Local thread x index
const int ty_l = threadIdx.y; // --- Local thread y index
const int tx_g = blockIdx.x * blockDim.x + tx_l; // --- Global thread x index
const int ty_g = blockIdx.y * blockDim.y + ty_l; // --- Global thread y index
__shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
// --- Fill the shared memory border with zeros
if (tx_l == 0) smem[tx_l] [ty_l+1] = 0; // --- left border
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+1] = 0; // --- right border
if (ty_l == 0) { smem[tx_l+1][ty_l] = 0; // --- upper border
if (tx_l == 0) smem[tx_l] [ty_l] = 0; // --- top-left corner
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l] = 0; // --- top-right corner
} else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2] = 0; // --- bottom border
if (tx_l == 0) smem[tx_l] [ty_l+2] = 0; // --- bottom-left corder
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2] = 0; // --- bottom-right corner
}
// --- Fill shared memory
smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g]; // --- center
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1]; // --- left border
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1]; // --- right border
if ((ty_l == 0)&&(ty_g > 0)) { smem[tx_l+1][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g]; // --- upper border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- top-left corner
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g+1]; // --- top-right corner
} else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) { smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g]; // --- bottom border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+2] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- bottom-left corder
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1]; // --- bottom-right corner
}
__syncthreads();
// --- Pull the 3x3 window in a local array
unsigned short v[9] = { smem[tx_l][ty_l], smem[tx_l+1][ty_l], smem[tx_l+2][ty_l],
smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1], smem[tx_l+2][ty_l+1],
smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2], smem[tx_l+2][ty_l+2] };
// --- Bubble-sort
for (int i = 0; i < 5; i++) {
for (int j = i + 1; j < 9; j++) {
if (v[i] > v[j]) { // swap?
unsigned short tmp = v[i];
v[i] = v[j];
v[j] = tmp;
}
}
}
// --- Pick the middle one
Output_Image[ty_g*Image_Width + tx_g] = v[4];
}
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
__shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill shared memory
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[tid][iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort shared memory to find the median using Bubble Short
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[tid][i];
surround[tid][i]=surround[tid][minval];
surround[tid][minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[tid][4];
__syncthreads();
}
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
unsigned short surround[9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill array private to the threads
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort private array to find the median using Bubble Short
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[i];
surround[i]=surround[minval];
surround[minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[4];
}
/********/
/* MAIN */
/********/
int main()
{
const int Image_Width = 1580;
const int Image_Height = 1050;
// --- Open data file
ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );
// --- Get file length
is.seekg(0, ios::end);
int dataLength = is.tellg();
is.seekg(0, ios::beg);
// --- Read data from file and close file
unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
is.read((char*)Input_Image_Host,dataLength);
is.close();
// --- CUDA warm up
unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
gpuErrchk(cudaFree(forFirstCudaMalloc));
// --- Allocate host and device memory spaces
unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short)));
unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short)));
// --- Copy data from host to device
gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering
// --- Grid and block sizes
const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);
const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
Original_Kernel_Function<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - elapsed time: %3.3f ms \n", time);
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
Original_Kernel_Function_no_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - no shared - elapsed time: %3.3f ms \n", time);
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Optimized kernel function - shared - elapsed time: %3.3f ms \n", time);
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));
// --- Open results file, write results and close the file
ofstream of2; of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw", ios::binary);
of2.write((char*)Output_Image_Host, dataLength);
of2.close();
cout << "\n Press Any Key To Exit..!!";
gpuErrchk(cudaFree(Input_Image));
delete Input_Image_Host;
delete Output_Image_Host;
return 0;
}
Here are the timing results on a Kepler K20c:
1580 x 1050
Original_Kernel_Function = 1.588ms
Original_Kernel_Function_no_shared = 1.278ms
Optimized_Kernel_Function_shared = 1.455ms
2048 x 2048
Original_Kernel_Function = 3.94ms
Original_Kernel_Function_no_shared = 3.118ms
Optimized_Kernel_Function_shared = 3.709ms
4096 x 4096
Original_Kernel_Function = 16.003ms
Original_Kernel_Function_no_shared = 13.735ms
Optimized_Kernel_Function_shared = 14.526ms
8192 x 8192
Original_Kernel_Function = 62.278ms
Original_Kernel_Function_no_shared = 47.484ms
Optimized_Kernel_Function_shared = 57.474ms
Here are the timing results on a GT540M, which is more similar to your card:
1580 x 1050
Original_Kernel_Function = 10.332 ms
Original_Kernel_Function_no_shared = 9.294 ms
Optimized_Kernel_Function_shared = 10.301 ms
2048 x 2048
Original_Kernel_Function = 25.256 ms
Original_Kernel_Function_no_shared = 23.567 ms
Optimized_Kernel_Function_shared = 23.876 ms
4096 x 4096
Original_Kernel_Function = 99.791 ms
Original_Kernel_Function_no_shared = 93.919 ms
Optimized_Kernel_Function_shared = 95.464 ms
8192 x 8192
Original_Kernel_Function = 399.259 ms
Original_Kernel_Function_no_shared = 375.634 ms
Optimized_Kernel_Function_shared = 383.121 ms
As it can be seen, the version not using shared memory seems to be (slightly) convenient in all the cases.
Quickselect median is the fastest linear time algorithm for best case; however, it is hard to implement in CUDA due to memory overhead. The easiest approach for a highly parallel algorithm is to minimize memory overhead. Instead of a partial sort algorithm like Quickselect, do away with memory overhead entirely by using the Torben median algorithm. The Torben algorithm can be significantly slower than other algorithms, but it does not modify the input data. Therefore, there is no need to allocate shared memory.
Finally, for maximum speed, bind the input to a texture, which has the added bonus of managing border extensions. To minimize cache misses, use a row major nested for loop for row major array.
Two hints:
block_width * pixel_width
, i.e. 16 x 2 = 32 contiguous bytes at a time. This will require higher occupancy for latency hiding than reading 128 bytes at a time. You can improve things by using wider blocks (wider blocks are generally better for that very reason). Also make sure your reads are aligned. This is related to the previous point and explained in [this section of the CUDA C Programming Guide][7].EDIT: I moved the rest of the answer here.