The efficiency and performance of ILP for the NVIDIA Kepler architecture

问题

Quoting the "Kepler Tuning Guide" provided by NVIDIA:

Also note that Kepler GPUs can utilize ILP in place of thread/warp-level parallelism (TLP) more readily than Fermi GPUs can.

In my opinion, the following code snippet

a = .....;
a2 = f(a); 
a3 = g(a2);

can be improved as follows

a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);

So in my projects, I have a section of code as follows (example 1)

if(x < src.cols && y < src.rows)
{
    if(!mask(y,x))
    {
        src.ptr(y)[x] = make_short4(0,0,0,0);
    }
}

and I rewrite it as follows (example2)

if(x < src.cols && y < src.rows)
{
    if(!mask(y,x))
    {
        short4 t;
        t.x = 0;
        t.y = 0;
        t.z = 0;
        t.w = 0;
        src.ptr(y)[x].x = t.x;
        src.ptr(y)[x].y = t.y;
        src.ptr(y)[x].z = t.z;
        src.ptr(y)[x].w = t.w;  
     }
}

In the Kepler architecture, the example2 will be more efficient and exhibit better performance than example1, is that right?

回答1:

A good explanation on Instruction Level Parallelism (ILP) can be found at CUDA Performance: Maximizing Instruction-Level Parallelism.

It has been pointed out by Robert Crovella and talonmies, and it has been recognized by yourself, that your example above does not reach ILP.

Concerning how implementing ILP, I'm showing below the classical example, translated from the PyCUDA code at numbapro-examples, which I have tested for a Fermi and for a Kepler GPU. Please, notice that for the latter case I have not observed relevant speedups.

THE CODE

#include <stdio.h>
#include <time.h>

#define BLOCKSIZE 64

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {

    int i = threadIdx.x + blockIdx.x * blockDim.x;

    d_c[i] = d_a[i] + d_b[i];

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    int k = j + stride;
    float ak = d_a[k];
    float bk = d_b[k];

    int l = k + stride;
    float al = d_a[l];
    float bl = d_b[l];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;
    float ck = ak + bk;
    float cl = al + bl;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;
    d_c[k] = ck;
    d_c[l] = cl;

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    int k = j + stride;
    float ak = d_a[k];
    float bk = d_b[k];

    int l = k + stride;
    float al = d_a[l];
    float bl = d_b[l];

    int m = l + stride;
    float am = d_a[m];
    float bm = d_b[m];

    int n = m + stride;
    float an = d_a[n];
    float bn = d_b[n];

    int p = n + stride;
    float ap = d_a[p];
    float bp = d_b[p];

    int q = p + stride;
    float aq = d_a[q];
    float bq = d_b[q];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;
    float ck = ak + bk;
    float cl = al + bl;
    float cm = am + bm;
    float cn = an + bn;
    float cp = ap + bp;
    float cq = aq + bq;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;
    d_c[k] = ck;
    d_c[l] = cl;
    d_c[m] = cm;
    d_c[n] = cn;
    d_c[p] = cp;
    d_c[q] = cq;

}

/********/
/* MAIN */
/********/
void main() {

    float timing;
    cudaEvent_t start, stop;

    const int N = 65536*4; // --- ASSUMPTION: N can be divided by BLOCKSIZE

    float* a = (float*)malloc(N*sizeof(float));
    float* b = (float*)malloc(N*sizeof(float));
    float* c = (float*)malloc(N*sizeof(float));
    float* c_ref = (float*)malloc(N*sizeof(float));

    srand(time(NULL));
    for (int i=0; i<N; i++) {

        a[i] = rand() / RAND_MAX;
        b[i] = rand() / RAND_MAX;
        c_ref[i] = a[i] + b[i];

    }

    float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
    float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
    float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
    float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
    float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
    float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));

    gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));

    /******************/
    /* ILP0 TEST CASE */
    /******************/
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP0:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP2 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP2:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP4 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP4:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP8 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP8:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("%f %f\n",c[i],c_ref[i]);
            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

}

PERFORMANCE

Card                    Kernel          Time [ms]            Speedup
GeForce GT540M          ILP0            4.609                1
      "                 ILP2            2.666                1.72
      "                 ILP4            1.675                2.76
      "                 ILP8            1.477                3.12

Kepler K20c             ILP0            0.045                
      "                 ILP2            0.043                
      "                 ILP4            0.043                
      "                 ILP8            0.042

来源：https://stackoverflow.com/questions/24771730/the-efficiency-and-performance-of-ilp-for-the-nvidia-kepler-architecture

标签

cuda

kepler