I am interested in obtaining the number of nanoseconds it would take to execute 1 double precision FLOP on a GeForce GTX 550 Ti.
In order to do that, I am following this:
Compute capability 2.1 devices have a double precision throughput of 4 operations per cycle (8 if doing DFMA). This assumes all 32 threads in the dispatched warp are active.
4 ops/cycle/SM * 4 SMs * 1800 MHz * 2 ops/DFMA = 57.6 GFLOPS double
The calculation assumes all threads in a warp are active.
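At that peak rate, the figure the question asks for follows directly (a theoretical lower bound, not a measured number):
1 / 57.6e9 FLOPS ≈ 0.017 ns per double precision FLOP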
The code in your question contains two dependent operations that could be fused into a DFMA. Use cuobjdump -sass to examine the assembly. If you launch multiple warps on the same SM, the test turns into a measure of dependent instruction throughput, not latency.
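For example, something like this (assuming the source file is called bench.cu; adjust the names to suit):
nvcc -arch=sm_21 -cubin -o bench.cubin bench.cu
cuobjdump -sass bench.cubin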
You should be aware that there is a problem in the design of the kernel which means that whatever measurements you make with this code will bear no relationship to double precision instruction throughput.
Because the result of the computational loop containing all the double precision operations is not used in a memory write, it is being removed by compiler optimisation. The CUDA 5 compiler emits the following PTX for your kernel:
.visible .entry _Z12bench_singlePf(
    .param .u32 _Z12bench_singlePf_param_0
)
{
    .local .align 8 .b8 __local_depot0[8];
    .reg .b32 %SP;
    .reg .b32 %SPL;
    .reg .s32 %r<16>;
    mov.u32 %SPL, __local_depot0;
    cvta.local.u32 %SP, %SPL;
    add.u32 %r3, %SP, 0;
    .loc 2 13 1
    cvta.to.local.u32 %r4, %r3;
    // inline asm
    mov.u32 %r1, %clock;
    // inline asm
    // inline asm
    mov.u32 %r2, %clock;
    // inline asm
    st.local.v2.u32 [%r4], {%r2, %r1};
    cvta.const.u32 %r5, $str;
    // Callseq Start 0
    {
        .reg .b32 temp_param_reg;
        .param .b32 param0;
        st.param.b32 [param0+0], %r5;
        .param .b32 param1;
        st.param.b32 [param1+0], %r3;
        .param .b32 retval0;
        .loc 2 13 1
        call.uni (retval0),
        vprintf,
        (
            param0,
            param1
        );
        ld.param.b32 %r6, [retval0+0];
    }
    // Callseq End 0
    .loc 2 14 1
    sub.s32 %r7, %r2, %r1;
    cvta.const.u32 %r8, $str1;
    st.local.u32 [%r4], %r7;
    // Callseq Start 1
    {
        .reg .b32 temp_param_reg;
        .param .b32 param0;
        st.param.b32 [param0+0], %r8;
        .param .b32 param1;
        st.param.b32 [param1+0], %r3;
        .param .b32 retval0;
        .loc 2 14 1
        call.uni (retval0),
        vprintf,
        (
            param0,
            param1
        );
        ld.param.b32 %r9, [retval0+0];
    }
    // Callseq End 1
    .loc 2 15 2
    ret;
}
The two clock load instructions are adjacent, and the only other code is the calls to printf. There is no computational loop in that PTX.
You should redesign your kernel so that the compiler can't deduce that the loop result is unused and optimise it away. One approach would be something like this:
__global__
void bench_single(float *data, int flag=0)
{
    int i;
    double x = 1.;
    clock_t start, end;

    start = clock();
    for(i=0; i<1000000; i++) {
        // a dependent multiply-add chain: each iteration consumes the
        // result of the previous one, so no iteration can be elided
        x = x * 2.388415813 + 1.253314137;
    }
    end = clock();

    printf("End and start %d - %d\n", end, start);
    printf("Finished in %d cycles\n", end-start);

    // conditional store whose predicate the compiler cannot evaluate
    // at compile time, which keeps the loop live
    if (flag) {
        data[blockIdx.x] = x;
    }
}
The conditional write at the end of the kernel prevents the compiler from optimising the loop away, so now the compiler emits this PTX:
.visible .entry _Z12bench_singlePfi(
    .param .u32 _Z12bench_singlePfi_param_0,
    .param .u32 _Z12bench_singlePfi_param_1
)
{
    .local .align 8 .b8 __local_depot0[8];
    .reg .b32 %SP;
    .reg .b32 %SPL;
    .reg .pred %p<3>;
    .reg .f32 %f<2>;
    .reg .s32 %r<28>;
    .reg .f64 %fd<44>;
    mov.u32 %SPL, __local_depot0;
    cvta.local.u32 %SP, %SPL;
    ld.param.u32 %r6, [_Z12bench_singlePfi_param_0];
    ld.param.u32 %r7, [_Z12bench_singlePfi_param_1];
    add.u32 %r10, %SP, 0;
    .loc 2 13 1
    cvta.to.local.u32 %r1, %r10;
    // inline asm
    mov.u32 %r8, %clock;
    // inline asm
    mov.f64 %fd43, 0d3FF0000000000000;
    mov.u32 %r27, 1000000;
BB0_1:
    .loc 2 10 1
    fma.rn.f64 %fd4, %fd43, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd5, %fd4, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd6, %fd5, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd7, %fd6, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd8, %fd7, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd9, %fd8, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd10, %fd9, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd11, %fd10, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd12, %fd11, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd13, %fd12, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd14, %fd13, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd15, %fd14, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd16, %fd15, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd17, %fd16, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd18, %fd17, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd19, %fd18, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd20, %fd19, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd21, %fd20, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd22, %fd21, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd23, %fd22, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd24, %fd23, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd25, %fd24, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd26, %fd25, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd27, %fd26, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd28, %fd27, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd29, %fd28, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd30, %fd29, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd31, %fd30, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd32, %fd31, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd33, %fd32, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd34, %fd33, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd35, %fd34, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd36, %fd35, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd37, %fd36, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd38, %fd37, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd39, %fd38, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd40, %fd39, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd41, %fd40, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd42, %fd41, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    fma.rn.f64 %fd43, %fd42, 0d40031B79BFF0AC8C, 0d3FF40D931FE078AF;
    .loc 2 9 1
    add.s32 %r27, %r27, -40;
    setp.ne.s32 %p1, %r27, 0;
    @%p1 bra BB0_1;
    cvta.to.global.u32 %r5, %r6;
    // inline asm
    mov.u32 %r11, %clock;
    // inline asm
    .loc 2 13 1
    st.local.v2.u32 [%r1], {%r11, %r8};
    cvta.const.u32 %r12, $str;
    // Callseq Start 0
    {
        .reg .b32 temp_param_reg;
        .param .b32 param0;
        st.param.b32 [param0+0], %r12;
        .param .b32 param1;
        st.param.b32 [param1+0], %r10;
        .param .b32 retval0;
        .loc 2 13 1
        call.uni (retval0),
        vprintf,
        (
            param0,
            param1
        );
        ld.param.b32 %r14, [retval0+0];
    }
    // Callseq End 0
    .loc 2 14 1
    sub.s32 %r15, %r11, %r8;
    cvta.const.u32 %r16, $str1;
    st.local.u32 [%r1], %r15;
    // Callseq Start 1
    {
        .reg .b32 temp_param_reg;
        .param .b32 param0;
        st.param.b32 [param0+0], %r16;
        .param .b32 param1;
        st.param.b32 [param1+0], %r10;
        .param .b32 retval0;
        .loc 2 14 1
        call.uni (retval0),
        vprintf,
        (
            param0,
            param1
        );
        ld.param.b32 %r17, [retval0+0];
    }
    // Callseq End 1
    .loc 2 16 1
    setp.eq.s32 %p2, %r7, 0;
    @%p2 bra BB0_4;
    .loc 2 17 1
    cvt.rn.f32.f64 %f1, %fd43;
    mov.u32 %r18, %ctaid.x;
    shl.b32 %r19, %r18, 2;
    add.s32 %r20, %r5, %r19;
    st.global.f32 [%r20], %f1;
BB0_4:
    .loc 2 19 2
    ret;
}
Note that there is now a long stream of double precision fused multiply-add instructions where the compiler has partially unrolled the loop.
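A quick sanity check, readable straight off the PTX above (the trip count comes from the mov.u32 %r27, 1000000, the unroll factor of 40 from the add.s32 %r27, %r27, -40 counter update, and 2 FLOPs per DFMA from the throughput quote at the top):
1,000,000 iterations / 40 FMAs per trip = 25,000 trips of the unrolled loop
1,000,000 FMAs * 2 FLOPs/FMA = 2,000,000 double precision FLOPs per thread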
As Greg Smith pointed out, you shouldn't expect to get a real measure of instruction throughput until you have enough warps running on a given SM to overcome instruction scheduling latency. That probably means you will want to run at least one large block. Also note that the printf calls will have a large negative influence on throughput; you will get a more representative number if only one thread per block writes out its result, or (better still) stores it to global memory. Run a large number of blocks and you will get a set of measurements you can average. As a final check, you should also disassemble the object code with cuobjdump to ensure that the assembler doesn't move the clock read instructions around, otherwise the in-kernel timing you rely on will break. Older versions of the assembler had a habit of instruction reordering that could break the functionality of a series of clock reads inserted into kernel C code or PTX.
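A minimal host-side harness in that spirit might look like the following sketch; the kernel is the bench_single shown above, while the grid and block dimensions are illustrative assumptions, not tested values:
#include <cstdio>

int main(void)
{
    const int nblocks = 64;      // many blocks -> many timing samples to average
    const int nthreads = 256;    // 8 warps per block, to cover scheduling latency
    float *data;

    cudaMalloc((void **)&data, nblocks * sizeof(float));

    // flag=0 means the store never executes at runtime, but the compiler
    // cannot prove that at compile time, so the FMA loop survives.
    // Note: every thread printfs in the kernel as written; see the advice
    // above about restricting output to one thread per block.
    bench_single<<<nblocks, nthreads>>>(data, 0);
    cudaDeviceSynchronize();

    cudaFree(data);
    return 0;
}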