I am measuring the difference between standard global memory access and 1D texture access. To do so I have created two kernels:
__global__ void texture1D(float* doarray, int size)
{
    int index;
    //calculate each thread global index
    index = blockIdx.x * blockDim.x + threadIdx.x;
    //fetch global memory through texture reference
    doarray[index] = tex1Dfetch(texreference, index);
    return;
}

__global__ void standard1D(float* diarray, float* doarray, int size)
{
    int index;
    //calculate each thread global index
    index = blockIdx.x * blockDim.x + threadIdx.x;
    //fetch global memory directly
    doarray[index] = diarray[index];
    return;
}
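For reference, texreference is assumed here to be a file-scope texture reference declared roughly as below; its actual declaration is not shown in the snippets above. (Texture references of this kind are deprecated in recent CUDA toolkits in favor of texture objects.)

//assumed declaration: 1D texture reference over float elements, bound later to linear memory
texture<float, 1, cudaReadModeElementType> texreference;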
Then, I call each kernel, measuring the time it takes:
//copy array from host to device memory
cudaMemcpy(diarray, harray, sizeof(float)*size, cudaMemcpyHostToDevice);

checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventRecord(startEvent, 0) );

//bind texture reference with linear memory
cudaBindTexture(0, texreference, diarray, sizeof(float)*size);

//execute device kernel
texture1D<<<(int)ceil((float)size/threadSize), threadSize>>>(doarray, size);

//unbind texture reference to free resource
cudaUnbindTexture(texreference);

checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );

//copy result array from device to host memory
cudaMemcpy(horray, doarray, sizeof(float)*size, cudaMemcpyDeviceToHost);

//check result
checkResutl(horray, harray, size);

cudaEvent_t startEvent2, stopEvent2;
checkCuda( cudaEventCreate(&startEvent2) );
checkCuda( cudaEventCreate(&stopEvent2) );
checkCuda( cudaEventRecord(startEvent2, 0) );

//execute device kernel with direct global memory access
standard1D<<<(int)ceil((float)size/threadSize), threadSize>>>(diarray, doarray, size);

checkCuda( cudaEventRecord(stopEvent2, 0) );
checkCuda( cudaEventSynchronize(stopEvent2) );

//copy back to CPU
cudaMemcpy(horray, doarray, sizeof(float)*size, cudaMemcpyDeviceToHost);
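The checkCuda helper is not defined in these snippets; it is assumed to be the usual error-checking wrapper around CUDA runtime calls, roughly:

#include <cstdio>
#include <cassert>

//assumed error-checking wrapper: prints the error string and asserts on failure
inline cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
    return result;
}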
and print results:
float time, time2;

checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
checkCuda( cudaEventElapsedTime(&time2, startEvent2, stopEvent2) );

printf("Texture bandwidth (GB/s): %f\n", bytes * 1e-6 / time);
printf("Standard bandwidth (GB/s): %f\n", bytes * 1e-6 / time2);
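Here bytes is assumed to count one read and one write of the whole array; since cudaEventElapsedTime reports milliseconds, bytes * 1e-6 / time then yields GB/s. A possible definition:

//assumed definition of bytes: one float read plus one float write per element
const unsigned int bytes = 2 * size * sizeof(float);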
It turns out that, no matter the size of the array I am allocating (size), the standard bandwidth is always much higher. Is that how it is supposed to be, or am I screwing it up at some point? My understanding of texture memory access was that it can speed up global memory access.
I have made a comparison between global memory and texture memory (used for caching purposes only, not for filtering) for the interpolation of a 1D complex-valued function.

The kernels I'm comparing are the 4 reported below: 2 using global memory and 2 using texture memory. They are distinguished by the way the complex values are accessed (as one float2 or as two separate floats). I will post the full Visual Studio 2010 project somewhere in case anyone would like to offer criticism or perform their own testing.
__global__ void linear_interpolation_kernel_function_GPU(float* __restrict__ result_d, const float* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j < N)
    {
        float reg_x_out = x_out_d[j/2] + M/2;

        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);

        float dk   = data_d[2*k + (j&1)];
        float dkp1 = data_d[2*k + 2 + (j&1)];

        result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}

__global__ void linear_interpolation_kernel_function_GPU_alternative(float2* __restrict__ result_d, const float2* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j < N)
    {
        float reg_x_out = x_out_d[j] + M/2;

        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);

        float2 dk   = data_d[k];
        float2 dkp1 = data_d[k+1];

        result_d[j].x = a * dkp1.x + (-dk.x * a + dk.x);
        result_d[j].y = a * dkp1.y + (-dk.y * a + dk.y);
    }
}

__global__ void linear_interpolation_kernel_function_GPU_texture(float2* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j < N)
    {
        float reg_x_out = x_out_d[j] + M/2;

        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);

        float2 dk   = tex1Dfetch(data_d_texture, k);
        float2 dkp1 = tex1Dfetch(data_d_texture, k+1);

        result_d[j].x = a * dkp1.x + (-dk.x * a + dk.x);
        result_d[j].y = a * dkp1.y + (-dk.y * a + dk.y);
    }
}

__global__ void linear_interpolation_kernel_function_GPU_texture_alternative(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j < N)
    {
        float reg_x_out = x_out_d[j/2] + M/4;

        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);

        float dk   = tex1Dfetch(data_d_texture2, 2*k + (j&1));
        float dkp1 = tex1Dfetch(data_d_texture2, 2*k + 2 + (j&1));

        result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}
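The texture references data_d_texture (float2 fetches) and data_d_texture2 (float fetches) are declared and bound outside the kernels; that part is not in the listing above, but it is assumed to look roughly as follows, with data_d holding M complex samples:

//assumed file-scope texture references for the two texture kernels
texture<float2, 1, cudaReadModeElementType> data_d_texture;     //complex samples fetched as float2
texture<float,  1, cudaReadModeElementType> data_d_texture2;    //same data fetched as interleaved floats

//assumed host-side binding before launching the texture kernels
cudaBindTexture(NULL, data_d_texture,  data_d,          M * sizeof(float2));
cudaBindTexture(NULL, data_d_texture2, (float*)data_d,  2 * M * sizeof(float));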
I have considered 4 different GPUs, namely a GeForce GT540M (cc 2.1), a Tesla C2050 (cc 2.0), a Kepler K20c (cc 3.5) and a GT210 (cc 1.2). The results are reported in the figures below. As can be seen, using textures as a cache on the older compute capabilities improves over the use of global memory, while the two solutions are pretty much equivalent on the newest architecture.
Of course, this example is not exhaustive, and in practice there may be other cases where one approach or the other should be preferred for particular applications.
p.s. The processing times are in [ms] and not in [s] as indicated in the figure labels.