Significant FMA performance anomaly experienced in the Intel Broadwell processor

前端未结

关注

 2  894

Code1:

vzeroall
mov             rcx, 1000000
startLabel1:
vfmadd231ps     ymm0, ymm0, ymm0
vfmadd231ps     ymm1, ymm1, ymm1
vfmadd231ps     ymm2, ym


                      
              相关标签:


      
      
        
          2条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  情深已故        
                
              
                            
                2020-12-05 00:43
              
            
            
                                                                       
Updated

I've got no explanation for you, since I'm on Haswell, but I do have code to share that might help you or someone else with Broadwell or Skylake hardware isolate your problem. If you could please run it on your machine and share the results, we could gain an insight into what's happening to your machine.

Intro

Recent Intel Core i7 processors have 7 performance monitor counters (PMCs), 3 fixed-function and 4 general-purpose, that may be used to profile code. The fixed-function PMCs are:


Instructions retired
Unhalted core cycles (Clock ticks including the effects of TurboBoost)
Unhalted Reference cycles (Fixed-frequency clock ticks)


The ratio of core:reference clock cycles determines the relative speedup or slowdown from dynamic frequency scaling.

Although software exists (see comments below) that accesses these counters, I did not know them and still find them to be insufficiently fine-grained.

I therefore wrote myself a Linux kernel module, perfcount, over the past few days to grant me access to the Intel performance counter monitors, and a userspace testbench and library for your code that wraps your FMA code around calls to my LKM. Instructions for how to reproduce my setup will follow.

My testbench source code is below. It warms up, then runs your code several times, testing it over a long list of metrics. I changed your loop count to 1 billion. Because only 4 general-purpose PMCs can be programmed at once, I do the measurements 4 at a time.

perfcountdemo.c

/* Includes */
#include "libperfcount.h"
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/* Function prototypes */
void code1(void);
void code2(void);
void code3(void);
void code4(void);
void code5(void);

/* Global variables */
void ((*FN_TABLE[])(void)) = {
    code1,
    code2,
    code3,
    code4,
    code5
};


/**
 * Code snippets to bench
 */

void code1(void){
    asm volatile(
    ".intel_syntax noprefix\n\t"
    "vzeroall\n\t"
    "mov             rcx, 1000000000\n\t"
    "LstartLabel1:\n\t"
    "vfmadd231ps     %%ymm0, %%ymm0, %%ymm0\n\t"
    "vfmadd231ps     ymm1, ymm1, ymm1\n\t"
    "vfmadd231ps     ymm2, ymm2, ymm2\n\t"
    "vfmadd231ps     ymm3, ymm3, ymm3\n\t"
    "vfmadd231ps     ymm4, ymm4, ymm4\n\t"
    "vfmadd231ps     ymm5, ymm5, ymm5\n\t"
    "vfmadd231ps     ymm6, ymm6, ymm6\n\t"
    "vfmadd231ps     ymm7, ymm7, ymm7\n\t"
    "vfmadd231ps     ymm8, ymm8, ymm8\n\t"
    "vfmadd231ps     ymm9, ymm9, ymm9\n\t"
    "vpaffffd          ymm10, ymm10, ymm10\n\t"
    "vpaffffd          ymm11, ymm11, ymm11\n\t"
    "vpaffffd          ymm12, ymm12, ymm12\n\t"
    "vpaffffd          ymm13, ymm13, ymm13\n\t"
    "vpaffffd          ymm14, ymm14, ymm14\n\t"
    "dec             rcx\n\t"
    "jnz             LstartLabel1\n\t"
    ".att_syntax noprefix\n\t"
    : /* No outputs we care about */
    : /* No inputs we care about */
    : "xmm0",  "xmm1",  "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
      "xmm8",  "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
      "rcx",
      "memory"
    );
}
void code2(void){

}
void code3(void){

}
void code4(void){

}
void code5(void){

}



/* Test Schedule */
const char* const SCHEDULE[] = {
    /* Batch */
    "uops_issued.any",
    "uops_issued.any<1",
    "uops_issued.any>=1",
    "uops_issued.any>=2",
    /* Batch */
    "uops_issued.any>=3",
    "uops_issued.any>=4",
    "uops_issued.any>=5",
    "uops_issued.any>=6",
    /* Batch */
    "uops_executed_port.port_0",
    "uops_executed_port.port_1",
    "uops_executed_port.port_2",
    "uops_executed_port.port_3",
    /* Batch */
    "uops_executed_port.port_4",
    "uops_executed_port.port_5",
    "uops_executed_port.port_6",
    "uops_executed_port.port_7",
    /* Batch */
    "resource_stalls.any",
    "resource_stalls.rs",
    "resource_stalls.sb",
    "resource_stalls.rob",
    /* Batch */
    "uops_retired.all",
    "uops_retired.all<1",
    "uops_retired.all>=1",
    "uops_retired.all>=2",
    /* Batch */
    "uops_retired.all>=3",
    "uops_retired.all>=4",
    "uops_retired.all>=5",
    "uops_retired.all>=6",
    /* Batch */
    "inst_retired.any_p",
    "inst_retired.any_p<1",
    "inst_retired.any_p>=1",
    "inst_retired.any_p>=2",
    /* Batch */
    "inst_retired.any_p>=3",
    "inst_retired.any_p>=4",
    "inst_retired.any_p>=5",
    "inst_retired.any_p>=6",
    /* Batch */
    "idq_uops_not_delivered.core",
    "idq_uops_not_delivered.core<1",
    "idq_uops_not_delivered.core>=1",
    "idq_uops_not_delivered.core>=2",
    /* Batch */
    "idq_uops_not_delivered.core>=3",
    "idq_uops_not_delivered.core>=4",
    "rs_events.empty",
    "idq.empty",
    /* Batch */
    "idq.mite_all_uops",
    "idq.mite_all_uops<1",
    "idq.mite_all_uops>=1",
    "idq.mite_all_uops>=2",
    /* Batch */
    "idq.mite_all_uops>=3",
    "idq.mite_all_uops>=4",
    "move_elimination.int_not_eliminated",
    "move_elimination.simd_not_eliminated",
    /* Batch */
    "lsd.uops",
    "lsd.uops<1",
    "lsd.uops>=1",
    "lsd.uops>=2",
    /* Batch */
    "lsd.uops>=3",
    "lsd.uops>=4",
    "ild_stall.lcp",
    "ild_stall.iq_full",
    /* Batch */
    "br_inst_exec.all_branches",
    "br_inst_exec.0x81",
    "br_inst_exec.0x82",
    "icache.misses",
    /* Batch */
    "br_misp_exec.all_branches",
    "br_misp_exec.0x81",
    "br_misp_exec.0x82",
    "fp_assist.any",
    /* Batch */
    "cpu_clk_unhalted.core_clk",
    "cpu_clk_unhalted.ref_xclk",
    "baclears.any"

};
const int NUMCOUNTS = sizeof(SCHEDULE)/sizeof(*SCHEDULE);


/**
 * Main
 */

int main(int argc, char* argv[]){
    int i;

    /**
     * Initialize
     */

    pfcInit();
    if(argc <= 1){
        pfcDumpEvents();
        exit(1);
    }
    pfcPinThread(3);


    /**
     * Arguments are:
     * 
     *     perfcountdemo #codesnippet
     * 
     * There is a schedule of configuration that is followed.
     */

    void (*fn)(void) = FN_TABLE[strtoull(argv[1], NULL, 0)];
    static const uint64_t ZERO_CNT[7] = {0,0,0,0,0,0,0};
    static const uint64_t ZERO_CFG[7] = {0,0,0,0,0,0,0};

    uint64_t cnt[7]                   = {0,0,0,0,0,0,0};
    uint64_t cfg[7]                   = {2,2,2,0,0,0,0};

    /* Warmup */
    for(i=0;i<10;i++){
        fn();
    }

    /* Run master loop */
    for(i=0;i<NUMCOUNTS;i+=4){
        /* Configure counters */
        const char* sched0 = i+0 < NUMCOUNTS ? SCHEDULE[i+0] : "";
        const char* sched1 = i+1 < NUMCOUNTS ? SCHEDULE[i+1] : "";
        const char* sched2 = i+2 < NUMCOUNTS ? SCHEDULE[i+2] : "";
        const char* sched3 = i+3 < NUMCOUNTS ? SCHEDULE[i+3] : "";
        cfg[3] = pfcParseConfig(sched0);
        cfg[4] = pfcParseConfig(sched1);
        cfg[5] = pfcParseConfig(sched2);
        cfg[6] = pfcParseConfig(sched3);

        pfcWrConfigCnts(0, 7, cfg);
        pfcWrCountsCnts(0, 7, ZERO_CNT);
        pfcRdCountsCnts(0, 7, cnt);
        /* ^ Should report 0s, and launch the counters. */
        /************** Hot section **************/
        fn();
        /************ End Hot section ************/
        pfcRdCountsCnts(0, 7, cnt);
        pfcWrConfigCnts(0, 7, ZERO_CFG);
        /* ^ Should clear the counter config and disable them. */

        /**
         * Print the lovely results
         */

        printf("Instructions Issued                : %20llu\n", cnt[0]);
        printf("Unhalted core cycles               : %20llu\n", cnt[1]);
        printf("Unhalted reference cycles          : %20llu\n", cnt[2]);
        printf("%-35s: %20llu\n", sched0, cnt[3]);
        printf("%-35s: %20llu\n", sched1, cnt[4]);
        printf("%-35s: %20llu\n", sched2, cnt[5]);
        printf("%-35s: %20llu\n", sched3, cnt[6]);
    }

    /**
     * Close up shop
     */

    pfcFini();
}


On my machine, I got the following results:

Haswell Core i7-4700MQ

> ./perfcountdemo 0
Instructions Issued                :          17000001807
Unhalted core cycles               :           5305920785
Unhalted reference cycles          :           4245764952
uops_issued.any                    :          16000811079
uops_issued.any<1                  :           1311417889
uops_issued.any>=1                 :           4000292290
uops_issued.any>=2                 :           4000229358
Instructions Issued                :          17000001806
Unhalted core cycles               :           5303822082
Unhalted reference cycles          :           4243345896
uops_issued.any>=3                 :           4000156998
uops_issued.any>=4                 :           4000110067
uops_issued.any>=5                 :                    0
uops_issued.any>=6                 :                    0
Instructions Issued                :          17000001811
Unhalted core cycles               :           5314227923
Unhalted reference cycles          :           4252020624
uops_executed_port.port_0          :           5016261477
uops_executed_port.port_1          :           5036728509
uops_executed_port.port_2          :                 5282
uops_executed_port.port_3          :                12481
Instructions Issued                :          17000001816
Unhalted core cycles               :           5329351248
Unhalted reference cycles          :           4265809728
uops_executed_port.port_4          :                 7087
uops_executed_port.port_5          :           4946019835
uops_executed_port.port_6          :           1000228324
uops_executed_port.port_7          :                 1372
Instructions Issued                :          17000001816
Unhalted core cycles               :           5325153463
Unhalted reference cycles          :           4261060248
resource_stalls.any                :           1322734589
resource_stalls.rs                 :            844250210
resource_stalls.sb                 :                    0
resource_stalls.rob                :                    0
Instructions Issued                :          17000001814
Unhalted core cycles               :           5327823817
Unhalted reference cycles          :           4262914728
uops_retired.all                   :          16000445793
uops_retired.all<1                 :            687284798
uops_retired.all>=1                :           4646263984
uops_retired.all>=2                :           4452324050
Instructions Issued                :          17000001809
Unhalted core cycles               :           5311736558
Unhalted reference cycles          :           4250015688
uops_retired.all>=3                :           3545695253
uops_retired.all>=4                :           3341664653
uops_retired.all>=5                :                 1016
uops_retired.all>=6                :                    1
Instructions Issued                :          17000001871
Unhalted core cycles               :           5477215269
Unhalted reference cycles          :           4383891984
inst_retired.any_p                 :          17000001871
inst_retired.any_p<1               :            891904306
inst_retired.any_p>=1              :           4593972062
inst_retired.any_p>=2              :           4441024510
Instructions Issued                :          17000001835
Unhalted core cycles               :           5377202052
Unhalted reference cycles          :           4302895152
inst_retired.any_p>=3              :           3555852364
inst_retired.any_p>=4              :           3369559466
inst_retired.any_p>=5              :            999980244
inst_retired.any_p>=6              :                    0
Instructions Issued                :          17000001826
Unhalted core cycles               :           5349373678
Unhalted reference cycles          :           4280991912
idq_uops_not_delivered.core        :              1580573
idq_uops_not_delivered.core<1      :           5354931839
idq_uops_not_delivered.core>=1     :               471248
idq_uops_not_delivered.core>=2     :               418625
Instructions Issued                :          17000001808
Unhalted core cycles               :           5309687640
Unhalted reference cycles          :           4248083976
idq_uops_not_delivered.core>=3     :               280800
idq_uops_not_delivered.core>=4     :               247923
rs_events.empty                    :                    0
idq.empty                          :               649944
Instructions Issued                :          17000001838
Unhalted core cycles               :           5392229041
Unhalted reference cycles          :           4315704216
idq.mite_all_uops                  :              2496139
idq.mite_all_uops<1                :           5397877484
idq.mite_all_uops>=1               :               971582
idq.mite_all_uops>=2               :               595973
Instructions Issued                :          17000001822
Unhalted core cycles               :           5347205506
Unhalted reference cycles          :           4278845208
idq.mite_all_uops>=3               :               394011
idq.mite_all_uops>=4               :               335205
move_elimination.int_not_eliminated:                    0
move_elimination.simd_not_eliminated:                    0
Instructions Issued                :          17000001812
Unhalted core cycles               :           5320621549
Unhalted reference cycles          :           4257095280
lsd.uops                           :          15999287982
lsd.uops<1                         :           1326629729
lsd.uops>=1                        :           3999821996
lsd.uops>=2                        :           3999821996
Instructions Issued                :          17000001813
Unhalted core cycles               :           5320533147
Unhalted reference cycles          :           4257105096
lsd.uops>=3                        :           3999823498
lsd.uops>=4                        :           3999823498
ild_stall.lcp                      :                    0
ild_stall.iq_full                  :                 3468
Instructions Issued                :          17000001813
Unhalted core cycles               :           5323278281
Unhalted reference cycles          :           4258969200
br_inst_exec.all_branches          :           1000016626
br_inst_exec.0x81                  :           1000016616
br_inst_exec.0x82                  :                    0
icache.misses                      :                  294
Instructions Issued                :          17000001812
Unhalted core cycles               :           5315098728
Unhalted reference cycles          :           4253082504
br_misp_exec.all_branches          :                    5
br_misp_exec.0x81                  :                    2
br_misp_exec.0x82                  :                    0
fp_assist.any                      :                    0
Instructions Issued                :          17000001819
Unhalted core cycles               :           5338484610
Unhalted reference cycles          :           4271432976
cpu_clk_unhalted.core_clk          :           5338494250
cpu_clk_unhalted.ref_xclk          :            177976806
baclears.any                       :                    1
                                   :                    0


We may see that on Haswell, everything is well-oiled. I'll make a few notes from the above stats:


Instructions issued is incredibly consistent for me. It's always around 17000001800, which is a good sign: It means we can make a very good estimate of our overhead. Idem for the other fixed-function counters. The fact that they all match reasonably well means that the tests in batches of 4 are apples-to-apples comparisons.
With a ratio of core:reference cycles of around 5305920785/4245764952, we get an average frequency scaling of ~1.25; This jives well with my observations that my core clocked up from 2.4 GHz to 3.0 GHz. cpu_clk_unhalted.core_clk/(10.0*cpu_clk_unhalted.ref_xclk) gives just under 3 GHz too.
The ratio of instructions issued to core cycles gives the IPC, 17000001807/5305920785 ~ 3.20, which is also about right: 2 FMA+1 VPADDD every clock cycle for 4 clock cycles, and 2 extra loop control instructions every 5th clock cycle that go in parallel.
uops_issued.any: The number of instructions issued is ~17B, but the number of uops issued is ~16B. That's because the two instructions for loop control are fusing together; Good sign. Moreover, around 1.3B clock cycles out of 5.3B (25% of the time), no uops were issued, while the near-totality of the rest of the time (4B clock cycles), 4 uops issued at a time.
uops_executed_port.port_[0-7]: Port saturation. We're in good health. Of the 16B post-fusion uops, Ports 0, 1 and 5 ate 5B uops each over 5.3B cycles (Which means they were distributed optimally: Float, float, int respectively), Port 6 ate 1B (the fused dec-branch op), and ports 2, 3, 4 and 7 ate negligible amounts by comparison.
resource_stalls: 1.3B of them occurred, 2/3 of which were due to the reservation station (RS) and the other third to unknown causes.
From the cumulative distribution we built with our comparisons on uops_retired.all and inst_retired.all, we know we are retiring 4 uops 60% of the time, 0 uops 13% of the time and 2 uops the rest of the time, with negligible amounts otherwise.
(Numerous *idq* counts): The IDQ only rarely holds us up.
lsd: The Loop Stream Detector is working; Nearly 16B fused uops were supplied to the frontend from it.
ild: Instruction length decoding is not the bottleneck, and not a single length-changing prefix is encountered.
br_inst_exec/br_misp_exec: Branch misprediction is a negligible problem.
icache.misses: Negligible.
fp_assist: Negligible. Denormals not encountered. (I believe that without DAZ denormals-are-zero flushing, they'd require an assist, which should register here)


So on Intel Haswell it's smooth sailing. If you could run my suite on your machines, that would be great.

Instructions for Reproduction


Rule #1: Inspect all my code before doing anything with it. Never blindly trust strangers on the Internet.
Grab perfcountdemo.c, libperfcount.c and libperfcount.h, put them in the same directory and compile them together.
Grab perfcount.c and Makefile, put them in the same directory, and make the kernel module.
Reboot your machine with the GRUB boot flags nmi_watchdog=0 modprobe.blacklist=iTCO_wdt,iTCO_vendor_support. The NMI watchdog will tamper with the unhalted-core-cycle counter otherwise.
insmod perfcount.ko the module. dmesg | tail -n 10 should say it successfully loaded and say there are 3 Ff counters and 4 Gp counters, or else give a reason for failing to do so.
Run my application, preferably while the rest of the system is not under load. Try also changing in perfcountdemo.c the core to which you restrict your affinity by changing the argument to pfcPinThread().
Edit in here the results.

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  一生所求        
                
              
                            
                2020-12-05 00:51
              
            
            
                                                                       
Update: previous version contained a 6 VPADDD instructions (vs 5 in the question), and the extra VPADDD caused imbalance on Broadwell. After it was fixed, Haswell, Broadwell and Skylake issue almost the same number of uops to ports 0, 1 and 5.

There is no port contamination, but uops are scheduled suboptimally, with the majority of uops going to Port 5 on Broadwell, and making it the bottleneck before Ports 0 and 1 are saturated.

To demonstrate what is going on, I suggest to (ab)use the demo on PeachPy.IO:


Open www.peachpy.io in Google Chrome (it wouldn't work in other browsers).
Replace the default code (which implements SDOT function) with the code below, which is literally your example ported to PeachPy syntax:

n = Argument(size_t)
x = Argument(ptr(const_float_))
incx = Argument(size_t)
y = Argument(ptr(const_float_))
incy = Argument(size_t)

with Function("sdot", (n, x, incx, y, incy)) as function:
    reg_n = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_n, n)

    VZEROALL()

    with Loop() as loop:
        for i in range(15):
            ymm_i = YMMRegister(i)
            if i < 10:
                VFMADD231PS(ymm_i, ymm_i, ymm_i)
            else:
                VPADDD(ymm_i, ymm_i, ymm_i)
        DEC(reg_n)
        JNZ(loop.begin)

    RETURN()

I have a number of machines on different microarchitectures as a backend for PeachPy.io. Choose Intel Haswell, Intel Broadwell, or Intel Skylake and press "Quick Run". The system will compile your code, upload it to server, and visualize performance counters collected during execution.
Here is the uops distribution over execution ports on Intel Haswell:





And here is the same plot from Intel Broadwell:





Apparently, whatever was the flaw in uops scheduler, it was fixed in Intel Skylake, because port pressure on that machine is the same as on Haswell.

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复