How to measure the elapsed time below nanosecond for x86?

后端 未结 3 1162
深忆病人
深忆病人 2020-12-30 18:05

I have searched for and used many approaches for measuring elapsed time. There are many questions on this topic. For example, this question is very good but when you nee

相关标签:
3条回答
  • 2020-12-30 18:37

    I recommend using this method for x86 micro-architecture.

    NOTE:

    1. NUM_LOOP is the number of times your code is repeated; repeating it improves accuracy because only the best (smallest) time is recorded.
    2. ttbest_rdtsc must be initialized to a value bigger than the worst possible time; I recommend initializing it to the largest value you can.

    3. I used OVERAL_TIME as an additional stopping rule (you might not want it). I used this code for many kernels, and in some cases NUM_LOOP was very big and I didn't want to change it, so OVERAL_TIME limits the iterations by stopping after a specific accumulated time.

    UPDATE: The whole program is this:

    #include <stdio.h>
    #include <x86intrin.h>
    
    #define NUM_LOOP 100 // number of timed repetitions; the smallest time is kept to filter out overheads such as cache misses, etc.
    
    /*
     * Times the code placed between the two _rdtsc() reads NUM_LOOP times and
     * prints the smallest difference observed between the two counter reads.
     */
    int main(void)
    {
        // ttbest_rdtsc starts at a large sentinel so that any smaller sample replaces it.
        long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc, ttbest_rdtsc = 99999999999999999;
        int do_while = 0;
        do{
    
            t1_rdtsc = _rdtsc();
                //put your code here
            t2_rdtsc = _rdtsc();
    
            ttotal_rdtsc = t2_rdtsc - t1_rdtsc;
    
            //store the smallest time:
            if (ttotal_rdtsc<ttbest_rdtsc)
                ttbest_rdtsc = ttotal_rdtsc;
    
        // Pre-increment so the body runs exactly NUM_LOOP times, matching the
        // repetition count printed below (a post-increment runs NUM_LOOP + 1 times).
        }while (++do_while < NUM_LOOP); 
    
        printf("\nthe best is %lld in %d repetitions\n", ttbest_rdtsc, NUM_LOOP );
    
        return 0;
    }
    

    I have since changed it to the following and put it in a header for myself, so that I can use it easily in my programs.

    #include <x86intrin.h>
    /* do_while is an alias for NUM_LOOP. NUM_LOOP is not defined in this snippet, so it must be defined before these lines are used. */
    #define do_while NUM_LOOP
    /* Initial value of overal_time, the bound that the accumulated ttime is compared against in end_rdtsc. */
    #define OVERAL_TIME 999999999
    /*
     * File-scope state shared by begin_rdtsc / end_rdtsc:
     *   t1_rdtsc, t2_rdtsc - counter values read before and after the measured code
     *   ttotal_rdtsc       - difference of the two reads for the current iteration
     *   ttbest_rdtsc       - smallest difference so far; starts at a large sentinel so any smaller sample replaces it
     *   elapsed            - value of (do_while - elapsed_rdtsc) at the moment the best time was recorded
     *   elapsed_rdtsc      - countdown that starts at do_while and is post-decremented in the loop condition
     *   overal_time, ttime - bound and running sum of the per-iteration differences
     */
    long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc, ttbest_rdtsc = 99999999999999999, elapsed, elapsed_rdtsc=do_while, overal_time = OVERAL_TIME, ttime=0;
    /*
     * Opens a do-while loop and reads the counter into t1_rdtsc.
     * The loop is left open on purpose: end_rdtsc must follow to close it.
     */
    #define begin_rdtsc\
                    do{\
                        t1_rdtsc=_rdtsc();
    
    /*
     * Reads the counter again, updates ttbest_rdtsc/elapsed when the new
     * difference is smaller, adds the difference to ttime, and closes the loop
     * opened by begin_rdtsc. The loop repeats while elapsed_rdtsc (tested, then
     * decremented) is non-zero and ttime is still below overal_time. Afterwards
     * the best time, the iteration it was seen in, and the repetition count are
     * printed.
     * NOTE(review): printf is used but only <x86intrin.h> is included here;
     * the including file presumably provides <stdio.h> - confirm.
     */
    #define end_rdtsc\
                        t2_rdtsc=_rdtsc();\
                        ttotal_rdtsc=t2_rdtsc-t1_rdtsc;\
                        if (ttotal_rdtsc<ttbest_rdtsc){\
                            ttbest_rdtsc = ttotal_rdtsc;\
                            elapsed=(do_while-elapsed_rdtsc);}\
                        ttime+=ttotal_rdtsc;\
                    }while (elapsed_rdtsc-- && (ttime<overal_time));\
                    printf("\nthe best is %lld in %lldth iteration and %lld repetitions\n", ttbest_rdtsc, elapsed, (do_while-elapsed_rdtsc));
    

    How to use this method? Well, it is very simple!

    /*
     * Usage example: the code to be measured goes between begin_rdtsc and
     * end_rdtsc, which together expand to the timing loop and the final report.
     */
    int main()
    {
        //before the section
        begin_rdtsc
           //put your code here to measure the clocks.
        end_rdtsc
        return 0; // the terminating semicolon is required for this to compile
    }
    

    Be creative, You can change it to measure the speedup in your program, etc. An example of the output is:

    the best is 9600 in 384751th iteration and 569179 repetitions
    

    My tested code took 9600 clock cycles at best; that best time was recorded in the 384751th iteration, and my code was run 569179 times.

    I have tested them on GCC and Clang.

    0 讨论(0)
  • 2020-12-30 18:50

    If you have a problem with the auto-vectorizer and want to limit it, just add an asm("#something"); after your begin_rdtsc; it will separate the do-while loop from your code. I just checked, and with this change your posted code, which the auto-vectorizer had been unable to vectorize, was vectorized. I changed your macro; you can use it:

    /*
     * Variant of begin_rdtsc / end_rdtsc that stores every sample in
     * ttotal_rdtsc[] and searches for the minimum only after the timing loop.
     * Relies on do_while and OVERAL_TIME being defined before this point.
     */
    long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc[do_while], ttbest_rdtsc = 99999999999999999, elapsed,  elapsed_rdtsc=do_while, overal_time = OVERAL_TIME, ttime=0;
    int ii=0;
        /*
         * Opens the timing loop and reads the start counter. The asm statement
         * only emits an assembler comment; it is placed here as a marker between
         * the loop bookkeeping and the measured code. __asm__ is used instead of
         * asm so the macro also compiles in strict ISO modes (e.g. -std=c11).
         */
        #define begin_rdtsc\
                        do{\
                            __asm__("#mmmmmmmmmmm");\
                            t1_rdtsc=_rdtsc();
    
        /*
         * Reads the end counter, stores the sample and closes the loop opened by
         * begin_rdtsc. The condition uses pre-increment so exactly do_while
         * samples are written, indices 0 .. do_while-1; a post-increment would
         * write one element past the end of ttotal_rdtsc. The smallest sample is
         * then selected and printed.
         */
        #define end_rdtsc\
                            t2_rdtsc=_rdtsc();\
                            __asm__("#mmmmmmmmmmm");\
                            ttotal_rdtsc[ii]=t2_rdtsc-t1_rdtsc;\
                        }while (++ii<do_while);\
                        for(ii=0; ii<do_while; ii++){\
                            if (ttotal_rdtsc[ii]<ttbest_rdtsc){\
                                ttbest_rdtsc = ttotal_rdtsc[ii];}}\
                        printf("\nthe best is %lld in %lld iteration\n", ttbest_rdtsc, elapsed_rdtsc);
    
    0 讨论(0)
  • 2020-12-30 19:01

    I have developed my first answer further and arrived at this solution. However, I am still looking for a better one, because it is very important to measure the time accurately and with the least impact on the measured code. I put this part in a header file and include it in the main program files.

    //Header file header.h
    // Timing macros: wrap the code to measure between begin_rdtsc and end_rdtsc.
    // The code is run `count` times, every sample is stored, and the smallest
    // counter difference is printed afterwards.
    #include <stdio.h>      // printf, used by end_rdtsc
    #include <x86intrin.h>  // _rdtsc
    #define count 1000 // number of repetition 
    // OVERAL_TIME initializes overal_time below; provide a default so this
    // header compiles on its own when the includer has not defined it.
    #ifndef OVERAL_TIME
    #define OVERAL_TIME 999999999
    #endif
    // ttotal_rdtsc holds one sample per repetition; ttbest_rdtsc starts at a
    // large sentinel so that any smaller sample replaces it.
    long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc[count], ttbest_rdtsc = 99999999999999999, elapsed,  elapsed_rdtsc=count, overal_time = OVERAL_TIME, ttime=0;
    int ii=0;
    // Opens the timing loop and reads the start counter; end_rdtsc must follow
    // to close the loop.
    #define begin_rdtsc\
                        do{\
                            t1_rdtsc=_rdtsc();
    
    // Reads the end counter, stores the sample and closes the loop. The
    // condition uses pre-increment so exactly `count` samples are written,
    // indices 0 .. count-1; a post-increment would write one element past the
    // end of ttotal_rdtsc. The scan loop is bounded by `count`, the only
    // repetition constant this header defines.
    #define end_rdtsc\
                            t2_rdtsc=_rdtsc();\
                            ttotal_rdtsc[ii]=t2_rdtsc-t1_rdtsc;\
                        }while (++ii<count);\
                        for(ii=0; ii<count; ii++){\
                            if (ttotal_rdtsc[ii]<ttbest_rdtsc){\
                                ttbest_rdtsc = ttotal_rdtsc[ii];}}\
                        printf("\nthe best is %lld in %lldth iteration \n", ttbest_rdtsc, elapsed_rdtsc);
    
    //Main program
    #include "header.h"
    .
    .
    .
    /*
     * Usage example: the code to be measured goes between begin_rdtsc and
     * end_rdtsc, which together expand to the timing loop and the final report.
     */
    int main()
    {
        //before the section
        begin_rdtsc
           //put your code here to measure the clocks.
        end_rdtsc
        return 0; // the terminating semicolon is required for this to compile
    }
    
    0 讨论(0)
提交回复
热议问题