2D CUDA median filter optimization

问题

I have implemented a 2D median filter in CUDA and the whole program is shown below.

#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <iostream>  
#include <fstream>   
#include <iomanip>   
#include <windows.h>
#include <io.h>                  
#include <stdio.h>
#include<conio.h>
#include <cstdlib>
#include "cstdlib"
#include <process.h>
#include <stdlib.h>
#include <malloc.h>
#include <ctime>
using namespace std;

#define MEDIAN_DIMENSION  3 // For matrix of 3 x 3. We can Use 5 x 5 , 7 x 7 , 9 x 9......   
#define MEDIAN_LENGTH 9   // Shoul be  MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3

#define BLOCK_WIDTH 16  // Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data "  at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
#define BLOCK_HEIGHT 16// Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data "  at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]

 __global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){

      __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH];

    int iterator;
    const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1;
    int StartPoint=MEDIAN_DIMENSION/2;
    int EndPoint=StartPoint+1;

    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    const int tid=threadIdx.y*blockDim.y+threadIdx.x;   

      if(x>=Image_Width || y>=Image_Height)
        return;

     //Fill surround with pixel value of Image in Matrix Pettern of MEDIAN_DIMENSION x MEDIAN_DIMENSION
            if (x == 0 || x == Image_Width - StartPoint || y == 0
                || y == Image_Height - StartPoint) {
            } else {             
                iterator = 0;
                for (int r = x - StartPoint; r < x + (EndPoint); r++) {
                    for (int c = y - StartPoint; c < y + (EndPoint); c++) {
                        surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r);
                        iterator++;
                    }
                }
//Sort the Surround Array to Find Median. Use Bubble Short  if Matrix oF 3 x 3 Matrix 
                    //You can use Insertion commented below to Short Bigger Dimension Matrix  

                              ////      bubble short //

                    for ( int i=0; i<Half_Of_MEDIAN_LENGTH; ++i)
                    {     
                        // Find position of minimum element
                        int min=i;
                        for ( int l=i+1; l<MEDIAN_LENGTH; ++l)
                            if (surround[tid][l] <surround[tid][min] )
                                min=l;
                        // Put found minimum element in its place
                        unsigned short  temp= surround[tid][i];
                        surround[tid][i]=surround[tid][min];
                        surround[tid][min]=temp;
                    }//bubble short  end

                    //////insertion sort start   //

                    /*int t,j,i;
                    for ( i = 1 ; i< MEDIAN_LENGTH ; i++) {
                        j = i;
                        while ( j > 0 && surround[tid][j] < surround[tid][j-1]) {
                            t= surround[tid][j];
                            surround[tid][j]= surround[tid][j-1];
                            surround[tid][j-1] = t;
                            j--;
                        }
                    }*/

                    ////insertion sort end   



                    *(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1];   // it will give value of surround[tid][4] as Median Value if use 3 x 3 matrix
                        __syncthreads();
            }  
}

  int main( int argc, const char** argv )
{
    int dataLength;
    int p1;
    unsigned short* Host_ImageData = NULL;
    ifstream is; // Read File 
    is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary );

    // get length of file:
    is.seekg (0, ios::end);
    dataLength = is.tellg();
    is.seekg (0, ios::beg);

    Host_ImageData = new  unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
    is.read ((char*)Host_ImageData,dataLength);
    is.close();

    int Image_Width = 1580;
    int Image_Height = 1050;

    unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength);
    unsigned short *Device_ImageData = NULL;

    /////////////////////////////
    // As First time cudaMalloc take more time  for memory alocation, i dont want to cosider this time in my process. 
    //So Please Ignore Code For Displaying First CudaMelloc Time
    clock_t begin = clock();
    unsigned short *forFirstCudaMalloc = NULL;
    cudaMalloc( (void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short) );
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"First CudaMelloc time = "<<elapsed_secs<<"  Second\n" ;
    cudaFree( forFirstCudaMalloc );
    ////////////////////////////

    //Actual Process Starts From Here 
    clock_t beginOverAll = clock();   //
    cudaMalloc( (void**)&Device_ImageData, dataLength * sizeof(unsigned short) ); 
    cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering

    int x = static_cast<int>(ceilf(static_cast<float>(1580.0) /BLOCK_WIDTH));
    int y = static_cast<int>(ceilf(static_cast<float>(1050.0) /BLOCK_HEIGHT));

    const dim3 grid (x, y, 1);      
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 

    begin = clock();

    MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height);
    cudaDeviceSynchronize();

    end = clock();
    elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"Process time = "<<elapsed_secs<<"  Second\n" ;

    cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copying Back Device Data To Host Memory To write In file After Filter Done

    clock_t endOverall = clock();
    elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC;
    cout<<"Complete Time  = "<<elapsed_secs<<"  Second\n" ;

    ofstream of2;   //Write Filtered Image Into File
    of2.open("D:\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Host_ResultData,dataLength);
    of2.close();
    cout<<"\nEnd of Writing File.  Press Any Key To Exit..!!";
    cudaFree(Device_ImageData);
    delete Host_ImageData;
    delete Host_ResultData;

    getch();
    return 0;
}

Here is the link for the file I use. I used ImajeJ to store the image in "raw" format and the same for reading the "raw" Image. My image pixel is 16 bit, unsigned short. The width of the image is 1580 and the height is 1050.

I strongly believe that the filter can be made more efficient and fast by using proper CUDA optimization.

Indeed, I'm running on a GeForce GT 520M card and the timings are the following

1) For MEDIAN_DIMENSION of 3 x 3 = 0.027 seconds

2) For MEDIAN_DIMENSION of 5 x 5 = 0.206 seconds

3) For MEDIAN_DIMENSION of 7 x 7 = 1.11 seconds

4) For MEDIAN_DIMENSION of 9 x 9 = 4.931 seconds

As you can see, as we increase MEDIAN_DIMENSION, the time increases very much and I have applications where I generally use higher MEDIAN_DIMENSION like 7 x 7 and 9 x 9. I think that, by using Cuda, even for 9 x 9 the time should be less than 1 second.

Since I think that the sorting part is taking most of the time here, can we make the sorting part of the algorithm faster?

Can we use grid and block more efficiently? Can I use larger BLOCK_WIDTH and BLOCK_HEIGHT (like 32 and 32) and still not hit the maximum __shared__ memory limit which is 4Kb for my device?

Can __shared__ memory be used more efficiently?

Any help will be appreciated.

Thanks in advance.

回答1:

I'm answering your last question on the use of shared memory.

As already noticed by Eric, your use of shared memory does not really lead to thread collaboration.

I'm comparing your solution, for the 3x3 case, with a variant of your kernel not using shared memory at all as well as with the Accelereyes solution discussed in 2D median filtering in CUDA: how to efficiently copy global memory to shared memory.

Here is the complete code:

#include <iostream>  
#include <fstream>   

using namespace std;

#define BLOCK_WIDTH 16 
#define BLOCK_HEIGHT 16

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
    const int tx_l = threadIdx.x;                           // --- Local thread x index
    const int ty_l = threadIdx.y;                           // --- Local thread y index

    const int tx_g = blockIdx.x * blockDim.x + tx_l;        // --- Global thread x index
    const int ty_g = blockIdx.y * blockDim.y + ty_l;        // --- Global thread y index

    __shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];

    // --- Fill the shared memory border with zeros
    if (tx_l == 0)                      smem[tx_l]  [ty_l+1]    = 0;    // --- left border
    else if (tx_l == BLOCK_WIDTH-1)     smem[tx_l+2][ty_l+1]    = 0;    // --- right border
    if (ty_l == 0) {                    smem[tx_l+1][ty_l]      = 0;    // --- upper border
        if (tx_l == 0)                  smem[tx_l]  [ty_l]      = 0;    // --- top-left corner
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l]      = 0;    // --- top-right corner
        }   else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2]  = 0;    // --- bottom border
        if (tx_l == 0)                  smem[tx_l]  [ty_l+2]    = 0;    // --- bottom-left corder
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2]    = 0;    // --- bottom-right corner
    }

    // --- Fill shared memory
                                                                    smem[tx_l+1][ty_l+1] =                           Input_Image[ty_g*Image_Width + tx_g];      // --- center
    if ((tx_l == 0)&&((tx_g > 0)))                                      smem[tx_l]  [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1];      // --- left border
    else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))         smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1];      // --- right border
    if ((ty_l == 0)&&(ty_g > 0)) {                                      smem[tx_l+1][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g];    // --- upper border
            if ((tx_l == 0)&&((tx_g > 0)))                                  smem[tx_l]  [ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g-1];  // --- top-left corner
            else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g+1];  // --- top-right corner
         } else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) {  smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g];    // --- bottom border
         if ((tx_l == 0)&&((tx_g > 0)))                                 smem[tx_l]  [ty_l+2] = Input_Image[(ty_g-1)*Image_Width + tx_g-1];  // --- bottom-left corder
        else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1];  // --- bottom-right corner
    }
    __syncthreads();

    // --- Pull the 3x3 window in a local array
    unsigned short v[9] = { smem[tx_l][ty_l],   smem[tx_l+1][ty_l],     smem[tx_l+2][ty_l],
                            smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1],   smem[tx_l+2][ty_l+1],
                            smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2],   smem[tx_l+2][ty_l+2] };    

    // --- Bubble-sort
    for (int i = 0; i < 5; i++) {
        for (int j = i + 1; j < 9; j++) {
            if (v[i] > v[j]) { // swap?
                unsigned short tmp = v[i];
                v[i] = v[j];
                v[j] = tmp;
            }
         }
    }

    // --- Pick the middle one
    Output_Image[ty_g*Image_Width + tx_g] = v[4];
}

/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];

    int iterator;

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;
    const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   

    if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;

    // --- Fill shared memory
    iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[tid][iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Sort shared memory to find the median using Bubble Short
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[tid][i];
        surround[tid][i]=surround[tid][minval];
        surround[tid][minval]=temp;
    }

    // --- Pick the middle one
    Output_Image[(y*Image_Width)+x]=surround[tid][4]; 

    __syncthreads();

}

/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    unsigned short surround[9];

    int iterator;

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;
    const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   

    if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;

    // --- Fill array private to the threads
    iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Sort private array to find the median using Bubble Short
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[i];
        surround[i]=surround[minval];
        surround[minval]=temp;
    }

    // --- Pick the middle one
    Output_Image[(y*Image_Width)+x]=surround[4]; 

}

/********/
/* MAIN */
/********/
int main()
{
    const int Image_Width = 1580;
    const int Image_Height = 1050;

    // --- Open data file
    ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );

    // --- Get file length
    is.seekg(0, ios::end);
    int dataLength = is.tellg();
    is.seekg(0, ios::beg);

    // --- Read data from file and close file
    unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
    is.read((char*)Input_Image_Host,dataLength);
    is.close();

    // --- CUDA warm up
    unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
    gpuErrchk(cudaFree(forFirstCudaMalloc));

    // --- Allocate host and device memory spaces 
    unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
    unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short))); 
    unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short))); 

    // --- Copy data from host to device
    gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering

    // --- Grid and block sizes
    const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);      
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 

    /****************************/
    /* ORIGINAL KERNEL FUNCTION */
    /****************************/
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
    Original_Kernel_Function<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Original kernel function - elapsed time:  %3.3f ms \n", time);

    /***********************************************/
    /* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
    /***********************************************/
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
    Original_Kernel_Function_no_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Original kernel function - no shared - elapsed time:  %3.3f ms \n", time);

    /**********************************************/
    /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
    /**********************************************/
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
    Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Optimized kernel function - shared - elapsed time:  %3.3f ms \n", time);

    // --- Copy results back to the host
    gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));

    // --- Open results file, write results and close the file
    ofstream of2;     of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Output_Image_Host, dataLength);
    of2.close();

    cout << "\n Press Any Key To Exit..!!";
    gpuErrchk(cudaFree(Input_Image));

    delete Input_Image_Host;
    delete Output_Image_Host;

    return 0;
}

Here are the timing results on a Kepler K20c:

1580 x 1050
Original_Kernel_Function             = 1.588ms
Original_Kernel_Function_no_shared   = 1.278ms
Optimized_Kernel_Function_shared     = 1.455ms

2048 x 2048
Original_Kernel_Function             = 3.94ms
Original_Kernel_Function_no_shared   = 3.118ms
Optimized_Kernel_Function_shared     = 3.709ms

4096 x 4096
Original_Kernel_Function             = 16.003ms
Original_Kernel_Function_no_shared   = 13.735ms
Optimized_Kernel_Function_shared     = 14.526ms

8192 x 8192
Original_Kernel_Function             = 62.278ms
Original_Kernel_Function_no_shared   = 47.484ms
Optimized_Kernel_Function_shared     = 57.474ms

Here are the timing results on a GT540M, which is more similar to your card:

1580 x 1050
Original_Kernel_Function             = 10.332 ms
Original_Kernel_Function_no_shared   =  9.294 ms
Optimized_Kernel_Function_shared     = 10.301 ms

2048 x 2048
Original_Kernel_Function             = 25.256 ms
Original_Kernel_Function_no_shared   = 23.567 ms
Optimized_Kernel_Function_shared     = 23.876 ms

4096 x 4096
Original_Kernel_Function             = 99.791 ms
Original_Kernel_Function_no_shared   = 93.919 ms
Optimized_Kernel_Function_shared     = 95.464 ms

8192 x 8192
Original_Kernel_Function             = 399.259 ms
Original_Kernel_Function_no_shared   = 375.634 ms
Optimized_Kernel_Function_shared     = 383.121 ms

As it can be seen, the version not using shared memory seems to be (slightly) convenient in all the cases.

回答2:

It seems you share nothing between threads using shared memory, i.e. for 3x3 filter, you read each pixel 9 times from the global memory, which is not necessary. This white paper may provide some ideas on how to using shared memory in a convolution kernel. Hope it help.

http://docs.nvidia.com/cuda/samples/3_Imaging/convolutionSeparable/doc/convolutionSeparable.pdf

回答3:

Quickselect median is the fastest linear time algorithm for best case; however, it is hard to implement in CUDA due to memory overhead. The easiest approach for a highly parallel algorithm is to minimize memory overhead. Instead of a partial sort algorithm like Quickselect, do away with memory overhead entirely by using the Torben median algorithm. The Torben algorithm can be significantly slower than other algorithms, but it does not modify the input data. Therefore, there is no need to allocate shared memory.

Finally, for maximum speed, bind the input to a texture, which has the added bonus of managing border extensions. To minimize cache misses, use a row major nested for loop for row major array.

回答4:

Two hints:

Change the order of your double loop iterations: First iterate in y (outer loop), then in x (inner loop). This is the most important fix, because it applies to any double loop you will ever implement. You want to make sure that successive reads are as close to each other as possible (there are different reasons for this on different devices, e.g. for single-threaded CPU code its most important for caching, on GPUs its most important for coalesced memory access, and maybe caching). Since you are iterating over rows first, you have 0 coalescing right now, effectively sending a single 2-byte (pixel width) request per pixel. Make sure to read this thread on the matter, even though it only explains the CPU side of things.
Make sure, your reads are coalesced. In your example, even if you fixed your loop, you would only be reading block_width * pixel_width, i.e. 16 x 2 = 32 contiguous bytes at a time. This will require higher occupancy for latency hiding than reading 128 bytes at a time. You can improve things by using wider blocks (wider blocks are generally better for that very reason). Also make sure your reads are aligned. This is related to the previous point and explained in [this section of the CUDA C Programming Guide][7].

EDIT: I moved the rest of the answer here.

来源：https://stackoverflow.com/questions/19634328/2d-cuda-median-filter-optimization

标签

c++

performance

optimization

cuda

median