Significant FMA performance anomaly experienced in the Intel Broadwell processor

  • Code1:

    vzeroall
    mov             rcx, 1000000
    startLabel1:
    vfmadd231ps     ymm0, ymm0, ymm0
    vfmadd231ps     ymm1, ymm1, ymm1
    vfmadd231ps     ymm2, ymm2, ymm2
    vfmadd231ps     ymm3, ymm3, ymm3
    vfmadd231ps     ymm4, ymm4, ymm4
    vfmadd231ps     ymm5, ymm5, ymm5
    vfmadd231ps     ymm6, ymm6, ymm6
    vfmadd231ps     ymm7, ymm7, ymm7
    vfmadd231ps     ymm8, ymm8, ymm8
    vfmadd231ps     ymm9, ymm9, ymm9
    vpaddd          ymm10, ymm10, ymm10
    vpaddd          ymm11, ymm11, ymm11
    vpaddd          ymm12, ymm12, ymm12
    vpaddd          ymm13, ymm13, ymm13
    vpaddd          ymm14, ymm14, ymm14
    dec             rcx
    jnz             startLabel1
    
  • Code2:

    vzeroall
    mov             rcx, 1000000
    startLabel2:
    vmulps          ymm0, ymm0, ymm0
    vmulps          ymm1, ymm1, ymm1
    vmulps          ymm2, ymm2, ymm2
    vmulps          ymm3, ymm3, ymm3
    vmulps          ymm4, ymm4, ymm4
    vmulps          ymm5, ymm5, ymm5
    vmulps          ymm6, ymm6, ymm6
    vmulps          ymm7, ymm7, ymm7
    vmulps          ymm8, ymm8, ymm8
    vmulps          ymm9, ymm9, ymm9
    vpaddd          ymm10, ymm10, ymm10
    vpaddd          ymm11, ymm11, ymm11
    vpaddd          ymm12, ymm12, ymm12
    vpaddd          ymm13, ymm13, ymm13
    vpaddd          ymm14, ymm14, ymm14
    dec             rcx
    jnz             startLabel2
    
  • Code3 (same as Code2 but with long VEX prefix):

    vzeroall
    mov             rcx, 1000000
    startLabel3:
    byte            0c4h, 0c1h, 07ch, 059h, 0c0h ;long VEX form vmulps ymm0, ymm0, ymm0
    byte            0c4h, 0c1h, 074h, 059h, 0c9h ;long VEX form vmulps ymm1, ymm1, ymm1
    byte            0c4h, 0c1h, 06ch, 059h, 0d2h ;long VEX form vmulps ymm2, ymm2, ymm2
    byte            0c4h, 0c1h, 06ch, 059h, 0dbh ;long VEX form vmulps ymm3, ymm3, ymm3
    byte            0c4h, 0c1h, 05ch, 059h, 0e4h ;long VEX form vmulps ymm4, ymm4, ymm4
    byte            0c4h, 0c1h, 054h, 059h, 0edh ;long VEX form vmulps ymm5, ymm5, ymm5
    byte            0c4h, 0c1h, 04ch, 059h, 0f6h ;long VEX form vmulps ymm6, ymm6, ymm6
    byte            0c4h, 0c1h, 044h, 059h, 0ffh ;long VEX form vmulps ymm7, ymm7, ymm7
    vmulps          ymm8, ymm8, ymm8
    vmulps          ymm9, ymm9, ymm9
    vpaddd          ymm10, ymm10, ymm10
    vpaddd          ymm11, ymm11, ymm11
    vpaddd          ymm12, ymm12, ymm12
    vpaddd          ymm13, ymm13, ymm13
    vpaddd          ymm14, ymm14, ymm14
    dec             rcx
    jnz             startLabel3
    
  • Code4 (same as Code1 but with xmm registers):

    vzeroall
    mov             rcx, 1000000
    startLabel4:
    vfmadd231ps     xmm0, xmm0, xmm0
    vfmadd231ps     xmm1, xmm1, xmm1
    vfmadd231ps     xmm2, xmm2, xmm2
    vfmadd231ps     xmm3, xmm3, xmm3
    vfmadd231ps     xmm4, xmm4, xmm4
    vfmadd231ps     xmm5, xmm5, xmm5
    vfmadd231ps     xmm6, xmm6, xmm6
    vfmadd231ps     xmm7, xmm7, xmm7
    vfmadd231ps     xmm8, xmm8, xmm8
    vfmadd231ps     xmm9, xmm9, xmm9
    vpaddd          xmm10, xmm10, xmm10
    vpaddd          xmm11, xmm11, xmm11
    vpaddd          xmm12, xmm12, xmm12
    vpaddd          xmm13, xmm13, xmm13
    vpaddd          xmm14, xmm14, xmm14
    dec             rcx
    jnz             startLabel4
    
  • Code5 (same as Code1 but with non-zeroing vpsubds):

    vzeroall
    mov             rcx, 1000000
    startLabel5:
    vfmadd231ps     ymm0, ymm0, ymm0
    vfmadd231ps     ymm1, ymm1, ymm1
    vfmadd231ps     ymm2, ymm2, ymm2
    vfmadd231ps     ymm3, ymm3, ymm3
    vfmadd231ps     ymm4, ymm4, ymm4
    vfmadd231ps     ymm5, ymm5, ymm5
    vfmadd231ps     ymm6, ymm6, ymm6
    vfmadd231ps     ymm7, ymm7, ymm7
    vfmadd231ps     ymm8, ymm8, ymm8
    vfmadd231ps     ymm9, ymm9, ymm9
    vpsubd          ymm10, ymm10, ymm11
    vpsubd          ymm11, ymm11, ymm12
    vpsubd          ymm12, ymm12, ymm13
    vpsubd          ymm13, ymm13, ymm14
    vpsubd          ymm14, ymm14, ymm10
    dec             rcx
    jnz             startLabel5
    
  • Code6b: (revised, memory operands for vpaddds only)

    vzeroall
    mov             rcx, 1000000
    startLabel6:
    vfmadd231ps     ymm0, ymm0, ymm0
    vfmadd231ps     ymm1, ymm1, ymm1
    vfmadd231ps     ymm2, ymm2, ymm2
    vfmadd231ps     ymm3, ymm3, ymm3
    vfmadd231ps     ymm4, ymm4, ymm4
    vfmadd231ps     ymm5, ymm5, ymm5
    vfmadd231ps     ymm6, ymm6, ymm6
    vfmadd231ps     ymm7, ymm7, ymm7
    vfmadd231ps     ymm8, ymm8, ymm8
    vfmadd231ps     ymm9, ymm9, ymm9
    vpaddd          ymm10, ymm10, [mem]
    vpaddd          ymm11, ymm11, [mem]
    vpaddd          ymm12, ymm12, [mem]
    vpaddd          ymm13, ymm13, [mem]
    vpaddd          ymm14, ymm14, [mem]
    dec             rcx
    jnz             startLabel6
    
  • Code7: (same as Code1 but vpaddds use ymm15)

    vzeroall
    mov             rcx, 1000000
    startLabel7:
    vfmadd231ps     ymm0, ymm0, ymm0
    vfmadd231ps     ymm1, ymm1, ymm1
    vfmadd231ps     ymm2, ymm2, ymm2
    vfmadd231ps     ymm3, ymm3, ymm3
    vfmadd231ps     ymm4, ymm4, ymm4
    vfmadd231ps     ymm5, ymm5, ymm5
    vfmadd231ps     ymm6, ymm6, ymm6
    vfmadd231ps     ymm7, ymm7, ymm7
    vfmadd231ps     ymm8, ymm8, ymm8
    vfmadd231ps     ymm9, ymm9, ymm9
    vpaddd          ymm10, ymm15, ymm15
    vpaddd          ymm11, ymm15, ymm15
    vpaddd          ymm12, ymm15, ymm15
    vpaddd          ymm13, ymm15, ymm15
    vpaddd          ymm14, ymm15, ymm15
    dec             rcx
    jnz             startLabel7
    
  • Code8: (same as Code7 but uses xmm instead of ymm)

    vzeroall
    mov             rcx, 1000000
    startLabel8:
    vfmadd231ps     xmm0, xmm0, xmm0
    vfmadd231ps     xmm1, xmm1, xmm1
    vfmadd231ps     xmm2, xmm2, xmm2
    vfmadd231ps     xmm3, xmm3, xmm3
    vfmadd231ps     xmm4, xmm4, xmm4
    vfmadd231ps     xmm5, xmm5, xmm5
    vfmadd231ps     xmm6, xmm6, xmm6
    vfmadd231ps     xmm7, xmm7, xmm7
    vfmadd231ps     xmm8, xmm8, xmm8
    vfmadd231ps     xmm9, xmm9, xmm9
    vpaddd          xmm10, xmm15, xmm15
    vpaddd          xmm11, xmm15, xmm15
    vpaddd          xmm12, xmm15, xmm15
    vpaddd          xmm13, xmm15, xmm15
    vpaddd          xmm14, xmm15, xmm15
    dec             rcx
    jnz             startLabel8
    
  • Measured TSC clocks with Turbo and C1E disabled (a minimal C sketch of this kind of measurement appears at the end of this list):

              Haswell        Broadwell                  Skylake
    
    CPUID     306C3, 40661   306D4, 40671               506E3
    
    Code1     ~5000000        ~7730000 ->~54% slower    ~5500000 ->~10% slower
    Code2     ~5000000       ~5000000                  ~5000000
    Code3     ~6000000       ~5000000                  ~5000000
    Code4     ~5000000       ~7730000                  ~5500000
    Code5     ~5000000       ~7730000                  ~5500000
    Code6b    ~5000000       ~8380000                  ~5500000
    Code7     ~5000000       ~5000000                  ~5000000
    Code8     ~5000000       ~5000000                  ~5000000 
    
  • Can somebody explain what happens with Code1 on Broadwell? My guess is that Broadwell somehow contaminates Port1 with the vpaddds in the Code1 case, whereas Haswell uses Port5 only if Port0 and Port1 are full;

  • Do you have any idea how to achieve the ~5000000 clk on Broadwell with FMA instructions?

  • I tried reordering the instructions. Similar behavior was experienced with double and qword;

  • I used Windows 8.1 and Win 10;
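
  • For reference, a minimal C sketch of how a TSC measurement like the ones above can be taken (this is not the original MASM harness; code1 is a hypothetical external symbol standing in for the Code1 loop):

    /* Minimal sketch of a TSC measurement around one of the loops above.      */
    /* Assumes the loop body is assembled separately and exported as           */
    /* void code1(void), a hypothetical symbol, not part of the question.      */
    #include <stdint.h>
    #include <stdio.h>
    #include <x86intrin.h>              /* __rdtsc(); use <intrin.h> with MSVC */

    extern void code1(void);            /* the Code1 loop, assembled elsewhere */

    int main(void)
    {
        uint64_t t0 = __rdtsc();
        code1();                        /* runs the 1,000,000-iteration loop   */
        uint64_t t1 = __rdtsc();
        printf("Code1: %llu TSC clocks\n", (unsigned long long)(t1 - t0));
        return 0;
    }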

    Update:


  • Added Code3 following Marat Dukhan's idea (long VEX prefix);

  • Extended the result table with Skylake experiences;

  • Uploaded a VS2015 Community + MASM sample code here

    Update2:


  • I tried with xmm registers instead of ymm (Code 4). Same result on Broadwell.

    Update3:


  • I added Code5 following Peter Cordes' idea (substituting the vpaddds with other instructions: vpxor, vpor, vpand, vpandn, vpsubd). If the new instruction is not a zeroing idiom (vpxor or vpsubd with the same register), the result is the same on BDW. Sample project updated with Code4 and Code5.

    Update4:


  • I added Code6 following Stephen Canon's idea (memory operands). The result is ~8200000 clks. Sample project updated with Code6;

  • I checked the CPU frequency and possible throttling with the System Stability Test of AIDA64. The frequency is stable and there is no sign of throttling;

    (screenshot: AIDA64 System Stability Test)

  • Intel IACA 2.1 Haswell throughput analysis:

    Intel(R) Architecture Code Analyzer Version - 2.1
    Analyzed File - Assembly.obj
    Binary Format - 64Bit
    Architecture  - HSW
    Analysis Type - Throughput
    
    Throughput Analysis Report
    --------------------------
    Block Throughput: 5.10 Cycles       Throughput Bottleneck: Port0, Port1, Port5
    
    Port Binding In Cycles Per Iteration:
    ---------------------------------------------------------------------------------------
    |  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |  6   |  7   |
    ---------------------------------------------------------------------------------------
    | Cycles | 5.0    0.0  | 5.0  | 0.0    0.0  | 0.0    0.0  | 0.0  | 5.0  | 1.0  | 0.0  |
    ---------------------------------------------------------------------------------------
    
    | Num Of |                    Ports pressure in cycles                     |    |
    |  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |    |
    ---------------------------------------------------------------------------------
    |   1    | 1.0       |     |           |           |     |     |     |     | CP | vfmadd231ps ymm0, ymm0, ymm0
    |   1    |           | 1.0 |           |           |     |     |     |     | CP | vfmadd231ps ymm1, ymm1, ymm1
    |   1    | 1.0       |     |           |           |     |     |     |     | CP | vfmadd231ps ymm2, ymm2, ymm2
    |   1    |           | 1.0 |           |           |     |     |     |     | CP | vfmadd231ps ymm3, ymm3, ymm3
    |   1    | 1.0       |     |           |           |     |     |     |     | CP | vfmadd231ps ymm4, ymm4, ymm4
    |   1    |           | 1.0 |           |           |     |     |     |     | CP | vfmadd231ps ymm5, ymm5, ymm5
    |   1    | 1.0       |     |           |           |     |     |     |     | CP | vfmadd231ps ymm6, ymm6, ymm6
    |   1    |           | 1.0 |           |           |     |     |     |     | CP | vfmadd231ps ymm7, ymm7, ymm7
    |   1    | 1.0       |     |           |           |     |     |     |     | CP | vfmadd231ps ymm8, ymm8, ymm8
    |   1    |           | 1.0 |           |           |     |     |     |     | CP | vfmadd231ps ymm9, ymm9, ymm9
    |   1    |           |     |           |           |     | 1.0 |     |     | CP | vpaddd ymm10, ymm10, ymm10
    |   1    |           |     |           |           |     | 1.0 |     |     | CP | vpaddd ymm11, ymm11, ymm11
    |   1    |           |     |           |           |     | 1.0 |     |     | CP | vpaddd ymm12, ymm12, ymm12
    |   1    |           |     |           |           |     | 1.0 |     |     | CP | vpaddd ymm13, ymm13, ymm13
    |   1    |           |     |           |           |     | 1.0 |     |     | CP | vpaddd ymm14, ymm14, ymm14
    |   1    |           |     |           |           |     |     | 1.0 |     |    | dec rcx
    |   0F   |           |     |           |           |     |     |     |     |    | jnz 0xffffffffffffffaa
    Total Num Of Uops: 16
    
  • I followed jcomeau_ictx's idea and modified Agner Fog's testp.zip (published 2015-12-22). The port usage on the BDW 306D4:

               Clock   Core cyc   Instruct      uop p0     uop p1     uop p5     uop p6 
    Code1:   7734720    7734727   17000001    4983410    5016592    5000001    1000001
    Code2:   5000072    5000072   17000001    5000010    5000014    4999978    1000002
    

    The port distribution is near perfect, as on Haswell. Then I checked the resource stall counters (event 0xa2):

              Clock   Core cyc   Instruct      res.stl.   RS stl.    SB stl.    ROB stl.
    Code1:   7736212    7736213   17000001    3736191    3736143          0          0
    Code2:   5000068    5000072   17000001    1000050     999957          0          0
    

    It seems to me that the difference between Code1 and Code2 comes from the RS stalls. Remark from the Intel SDM: "Cycles stalled due to no eligible RS entry available."

    How can I avoid this stall with FMA?
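
    For reference, the sub-events of event 0xa2 are selected by their umask when the event is programmed into an IA32_PERFEVTSELx MSR. A minimal sketch of that encoding (umask values as listed in the Intel SDM for Haswell/Broadwell; perfevtsel is only an illustrative helper, not part of testp):

    #include <stdint.h>

    /* RESOURCE_STALLS (event 0xA2) umasks on HSW/BDW per the Intel SDM:   */
    /* .ANY = 0x01, .RS = 0x04, .SB = 0x08, .ROB = 0x10                    */
    static uint64_t perfevtsel(uint8_t event, uint8_t umask)
    {
        return (uint64_t)event            /* bits  7:0  event select  */
             | ((uint64_t)umask << 8)     /* bits 15:8  unit mask     */
             | (1ULL << 16)               /* USR: count user mode     */
             | (1ULL << 17)               /* OS:  count kernel mode   */
             | (1ULL << 22);              /* EN:  enable the counter  */
    }
    /* Writing perfevtsel(0xA2, 0x04) to IA32_PERFEVTSEL0 (MSR 0x186) makes */
    /* general-purpose counter 0 count RESOURCE_STALLS.RS.                  */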

    Update5:


  • Code6 changed (now Code6b), as Peter Cordes drew my attention to it: only the vpaddds use memory operands. No effect on HSW and SKL; BDW gets worse.

  • As Marat Dukhan measured, not just vpadd/vpsub/vpand/vpandn/vpxor are affected, but also other Port5-bound instructions like vmovaps, vblendps, vpermps, vshufps, vbroadcastss;

  • As IwillnotexistIdonotexist suggested, I tried other operands. A successful modification is Code7, where all vpaddds use ymm15. This version produces ~5000000 clks on BDW, but only for a while; after ~6 million FMA pairs it reaches the usual ~7730000 clks:

    Clock   Core cyc   Instruct   res.stl.   RS stl.     SB stl.    ROB stl.
    5133724    5110723   17000001    1107998     946376          0          0
    6545476    6545482   17000001    2545453          1          0          0
    6545468    6545471   17000001    2545437      90910          0          0
    5000016    5000019   17000001     999992     999992          0          0
    7671620    7617127   17000003    3614464    3363363          0          0
    7737340    7737345   17000001    3737321    3737259          0          0
    7802916    7747108   17000003    3737478    3735919          0          0
    7928784    7796057   17000007    3767962    3676744          0          0
    7941072    7847463   17000003    3781103    3651595          0          0
    7787812    7779151   17000005    3765109    3685600          0          0
    7792524    7738029   17000002    3736858    3736764          0          0
    7736000    7736007   17000001    3735983    3735945          0          0
    
  • I tried the xmm version of Code7 as Code8. The effect is similar, but the faster runtime is sustained longer. I haven't found a significant difference between a 1.6 GHz i5-5250U and a 3.7 GHz i7-5775C.

  • 16 and 17 were made with HyperThreading disabled. With HTT enabled the effect is less.


  • Updated

    I've got no explanation for you, since I'm on Haswell, but I do have code to share that might help you or someone else with Broadwell or Skylake hardware isolate your problem. If you could run it on your machine and share the results, we could gain insight into what's happening on your machine.

    Intro

    Recent Intel Core i7 processors have 7 performance monitor counters (PMCs), 3 fixed-function and 4 general-purpose, that may be used to profile code. The fixed-function PMCs are:

  • Instructions retired
  • Unhalted core cycles (Clock ticks including the effects of TurboBoost)
  • Unhalted Reference cycles (Fixed-frequency clock ticks)
  • The ratio of core:reference clock cycles determines the relative speedup or slowdown from dynamic frequency scaling.
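
    Once userspace RDPMC access is enabled (CR4.PCE) and the counters are armed (this requires help from a kernel driver), these fixed-function counters can be read directly. A minimal sketch (counter indices per the Intel SDM; not part of the library below):

    #include <stdint.h>

    /* RDPMC with bit 30 of ECX set selects the fixed-function counters:   */
    /* index 0 = instructions retired, 1 = unhalted core cycles,           */
    /* index 2 = unhalted reference cycles.                                */
    static inline uint64_t rdpmc_fixed(uint32_t idx)
    {
        uint32_t lo, hi;
        __asm__ __volatile__("rdpmc" : "=a"(lo), "=d"(hi) : "c"((1u << 30) | idx));
        return ((uint64_t)hi << 32) | lo;
    }
    /* The delta of counter 1 divided by the delta of counter 2 over an    */
    /* interval gives the frequency-scaling factor mentioned above.        */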

    Although software exists (see the comments below) that accesses these counters, I did not know of it, and I still find it insufficiently fine-grained.

    I therefore wrote myself a Linux kernel module, perfcount, over the past few days to grant myself access to the Intel performance counter monitors, plus a userspace testbench and library that wrap your FMA code in calls to my LKM. Instructions for reproducing my setup will follow.

    My testbench source code is below. It warms up, then runs your code several times, testing it over a long list of metrics. I changed your loop count to 1 billion. Because only 4 general-purpose PMCs can be programmed at once, I do the measurements 4 at a time.

    perfcountdemo.c

    /* Includes */
    #include "libperfcount.h"
    #include <ctype.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    
    /* Function prototypes */
    void code1(void);
    void code2(void);
    void code3(void);
    void code4(void);
    void code5(void);
    
    /* Global variables */
    void ((*FN_TABLE[])(void)) = {
        code1,
        code2,
        code3,
        code4,
        code5
    };
    
    
    /**
     * Code snippets to bench
     */
    
    void code1(void){
        asm volatile(
        ".intel_syntax noprefix\n\t"
        "vzeroall\n\t"
        "mov             rcx, 1000000000\n\t"
        "LstartLabel1:\n\t"
        "vfmadd231ps     ymm0, ymm0, ymm0\n\t"
        "vfmadd231ps     ymm1, ymm1, ymm1\n\t"
        "vfmadd231ps     ymm2, ymm2, ymm2\n\t"
        "vfmadd231ps     ymm3, ymm3, ymm3\n\t"
        "vfmadd231ps     ymm4, ymm4, ymm4\n\t"
        "vfmadd231ps     ymm5, ymm5, ymm5\n\t"
        "vfmadd231ps     ymm6, ymm6, ymm6\n\t"
        "vfmadd231ps     ymm7, ymm7, ymm7\n\t"
        "vfmadd231ps     ymm8, ymm8, ymm8\n\t"
        "vfmadd231ps     ymm9, ymm9, ymm9\n\t"
        "vpaddd          ymm10, ymm10, ymm10\n\t"
        "vpaddd          ymm11, ymm11, ymm11\n\t"
        "vpaddd          ymm12, ymm12, ymm12\n\t"
        "vpaddd          ymm13, ymm13, ymm13\n\t"
        "vpaddd          ymm14, ymm14, ymm14\n\t"
        "dec             rcx\n\t"
        "jnz             LstartLabel1\n\t"
        ".att_syntax prefix\n\t"
        : /* No outputs we care about */
        : /* No inputs we care about */
        : "xmm0",  "xmm1",  "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
          "xmm8",  "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
          "rcx",
          "memory"
        );
    }
    void code2(void){
    
    }
    void code3(void){
    
    }
    void code4(void){
    
    }
    void code5(void){
    
    }
    
    
    
    /* Test Schedule */
    const char* const SCHEDULE[] = {
        /* Batch */
        "uops_issued.any",
        "uops_issued.any<1",
        "uops_issued.any>=1",
        "uops_issued.any>=2",
        /* Batch */
        "uops_issued.any>=3",
        "uops_issued.any>=4",
        "uops_issued.any>=5",
        "uops_issued.any>=6",
        /* Batch */
        "uops_executed_port.port_0",
        "uops_executed_port.port_1",
        "uops_executed_port.port_2",
        "uops_executed_port.port_3",
        /* Batch */
        "uops_executed_port.port_4",
        "uops_executed_port.port_5",
        "uops_executed_port.port_6",
        "uops_executed_port.port_7",
        /* Batch */
        "resource_stalls.any",
        "resource_stalls.rs",
        "resource_stalls.sb",
        "resource_stalls.rob",
        /* Batch */
        "uops_retired.all",
        "uops_retired.all<1",
        "uops_retired.all>=1",
        "uops_retired.all>=2",
        /* Batch */
        "uops_retired.all>=3",
        "uops_retired.all>=4",
        "uops_retired.all>=5",
        "uops_retired.all>=6",
        /* Batch */
        "inst_retired.any_p",
        "inst_retired.any_p<1",
        "inst_retired.any_p>=1",
        "inst_retired.any_p>=2",
        /* Batch */
        "inst_retired.any_p>=3",
        "inst_retired.any_p>=4",
        "inst_retired.any_p>=5",
        "inst_retired.any_p>=6",
        /* Batch */
        "idq_uops_not_delivered.core",
        "idq_uops_not_delivered.core<1",
        "idq_uops_not_delivered.core>=1",
        "idq_uops_not_delivered.core>=2",
        /* Batch */
        "idq_uops_not_delivered.core>=3",
        "idq_uops_not_delivered.core>=4",
        "rs_events.empty",
        "idq.empty",
        /* Batch */
        "idq.mite_all_uops",
        "idq.mite_all_uops<1",
        "idq.mite_all_uops>=1",
        "idq.mite_all_uops>=2",
        /* Batch */
        "idq.mite_all_uops>=3",
        "idq.mite_all_uops>=4",
        "move_elimination.int_not_eliminated",
        "move_elimination.simd_not_eliminated",
        /* Batch */
        "lsd.uops",
        "lsd.uops<1",
        "lsd.uops>=1",
        "lsd.uops>=2",
        /* Batch */
        "lsd.uops>=3",
        "lsd.uops>=4",
        "ild_stall.lcp",
        "ild_stall.iq_full",
        /* Batch */
        "br_inst_exec.all_branches",
        "br_inst_exec.0x81",
        "br_inst_exec.0x82",
        "icache.misses",
        /* Batch */
        "br_misp_exec.all_branches",
        "br_misp_exec.0x81",
        "br_misp_exec.0x82",
        "fp_assist.any",
        /* Batch */
        "cpu_clk_unhalted.core_clk",
        "cpu_clk_unhalted.ref_xclk",
        "baclears.any"
    
    };
    const int NUMCOUNTS = sizeof(SCHEDULE)/sizeof(*SCHEDULE);
    
    
    /**
     * Main
     */
    
    int main(int argc, char* argv[]){
        int i;
    
        /**
         * Initialize
         */
    
        pfcInit();
        if(argc <= 1){
            pfcDumpEvents();
            exit(1);
        }
        pfcPinThread(3);
    
    
        /**
         * Arguments are:
         * 
         *     perfcountdemo #codesnippet
         * 
         * There is a schedule of configuration that is followed.
         */
    
        void (*fn)(void) = FN_TABLE[strtoull(argv[1], NULL, 0)];
        static const uint64_t ZERO_CNT[7] = {0,0,0,0,0,0,0};
        static const uint64_t ZERO_CFG[7] = {0,0,0,0,0,0,0};
    
        uint64_t cnt[7]                   = {0,0,0,0,0,0,0};
        uint64_t cfg[7]                   = {2,2,2,0,0,0,0};
    
        /* Warmup */
        for(i=0;i<10;i++){
            fn();
        }
    
        /* Run master loop */
        for(i=0;i<NUMCOUNTS;i+=4){
            /* Configure counters */
            const char* sched0 = i+0 < NUMCOUNTS ? SCHEDULE[i+0] : "";
            const char* sched1 = i+1 < NUMCOUNTS ? SCHEDULE[i+1] : "";
            const char* sched2 = i+2 < NUMCOUNTS ? SCHEDULE[i+2] : "";
            const char* sched3 = i+3 < NUMCOUNTS ? SCHEDULE[i+3] : "";
            cfg[3] = pfcParseConfig(sched0);
            cfg[4] = pfcParseConfig(sched1);
            cfg[5] = pfcParseConfig(sched2);
            cfg[6] = pfcParseConfig(sched3);
    
            pfcWrConfigCnts(0, 7, cfg);
            pfcWrCountsCnts(0, 7, ZERO_CNT);
            pfcRdCountsCnts(0, 7, cnt);
            /* ^ Should report 0s, and launch the counters. */
            /************** Hot section **************/
            fn();
            /************ End Hot section ************/
            pfcRdCountsCnts(0, 7, cnt);
            pfcWrConfigCnts(0, 7, ZERO_CFG);
            /* ^ Should clear the counter config and disable them. */
    
            /**
             * Print the lovely results
             */
    
            printf("Instructions Issued                : %20llun", cnt[0]);
            printf("Unhalted core cycles               : %20llun", cnt[1]);
            printf("Unhalted reference cycles          : %20llun", cnt[2]);
            printf("%-35s: %20llun", sched0, cnt[3]);
            printf("%-35s: %20llun", sched1, cnt[4]);
            printf("%-35s: %20llun", sched2, cnt[5]);
            printf("%-35s: %20llun", sched3, cnt[6]);
        }
    
        /**
         * Close up shop
         */
    
        pfcFini();
    }
    

    On my machine, I got the following results:

    Haswell Core i7-4700MQ

    > ./perfcountdemo 0
    Instructions Issued                :          17000001807
    Unhalted core cycles               :           5305920785
    Unhalted reference cycles          :           4245764952
    uops_issued.any                    :          16000811079
    uops_issued.any<1                  :           1311417889
    uops_issued.any>=1                 :           4000292290
    uops_issued.any>=2                 :           4000229358
    Instructions Issued                :          17000001806
    Unhalted core cycles               :           5303822082
    Unhalted reference cycles          :           4243345896
    uops_issued.any>=3                 :           4000156998
    uops_issued.any>=4                 :           4000110067
    uops_issued.any>=5                 :                    0
    uops_issued.any>=6                 :                    0
    Instructions Issued                :          17000001811
    Unhalted core cycles               :           5314227923
    Unhalted reference cycles          :           4252020624
    uops_executed_port.port_0          :           5016261477
    uops_executed_port.port_1          :           5036728509
    uops_executed_port.port_2          :                 5282
    uops_executed_port.port_3          :                12481
    Instructions Issued                :          17000001816
    Unhalted core cycles               :           5329351248
    Unhalted reference cycles          :           4265809728
    uops_executed_port.port_4          :                 7087
    uops_executed_port.port_5          :           4946019835
    uops_executed_port.port_6          :           1000228324
    uops_executed_port.port_7          :                 1372
    Instructions Issued                :          17000001816
    Unhalted core cycles               :           5325153463
    Unhalted reference cycles          :           4261060248
    resource_stalls.any                :           1322734589
    resource_stalls.rs                 :            844250210
    resource_stalls.sb                 :                    0
    resource_stalls.rob                :                    0
    Instructions Issued                :          17000001814
    Unhalted core cycles               :           5327823817
    Unhalted reference cycles          :           4262914728
    uops_retired.all                   :          16000445793
    uops_retired.all<1                 :            687284798
    uops_retired.all>=1                :           4646263984
    uops_retired.all>=2                :           4452324050
    Instructions Issued                :          17000001809
    Unhalted core cycles               :           5311736558
    Unhalted reference cycles          :           4250015688
    uops_retired.all>=3                :           3545695253
    uops_retired.all>=4                :           3341664653
    uops_retired.all>=5                :                 1016
    uops_retired.all>=6                :                    1
    Instructions Issued                :          17000001871
    Unhalted core cycles               :           5477215269
    Unhalted reference cycles          :           4383891984
    inst_retired.any_p                 :          17000001871
    inst_retired.any_p<1               :            891904306
    inst_retired.any_p>=1              :           4593972062
    inst_retired.any_p>=2              :           4441024510
    Instructions Issued                :          17000001835
    Unhalted core cycles               :           5377202052
    Unhalted reference cycles          :           4302895152
    inst_retired.any_p>=3              :           3555852364
    inst_retired.any_p>=4              :           3369559466
    inst_retired.any_p>=5              :            999980244
    inst_retired.any_p>=6              :                    0
    Instructions Issued                :          17000001826
    Unhalted core cycles               :           5349373678
    Unhalted reference cycles          :           4280991912
    idq_uops_not_delivered.core        :              1580573
    idq_uops_not_delivered.core<1      :           5354931839
    idq_uops_not_delivered.core>=1     :               471248
    idq_uops_not_delivered.core>=2     :               418625
    Instructions Issued                :          17000001808
    Unhalted core cycles               :           5309687640
    Unhalted reference cycles          :           4248083976
    idq_uops_not_delivered.core>=3     :               280800
    idq_uops_not_delivered.core>=4     :               247923
    rs_events.empty                    :                    0
    idq.empty                          :               649944
    Instructions Issued                :          17000001838
    Unhalted core cycles               :           5392229041
    Unhalted reference cycles          :           4315704216
    idq.mite_all_uops                  :              2496139
    idq.mite_all_uops<1                :           5397877484
    idq.mite_all_uops>=1               :               971582
    idq.mite_all_uops>=2               :               595973
    Instructions Issued                :          17000001822
    Unhalted core cycles               :           5347205506
    Unhalted reference cycles          :           4278845208
    idq.mite_all_uops>=3               :               394011
    idq.mite_all_uops>=4               :               335205
    move_elimination.int_not_eliminated:                    0
    move_elimination.simd_not_eliminated:                    0
    Instructions Issued                :          17000001812
    Unhalted core cycles               :           5320621549
    Unhalted reference cycles          :           4257095280
    lsd.uops                           :          15999287982
    lsd.uops<1                         :           1326629729
    lsd.uops>=1                        :           3999821996
    lsd.uops>=2                        :           3999821996
    Instructions Issued                :          17000001813
    Unhalted core cycles               :           5320533147
    Unhalted reference cycles          :           4257105096
    lsd.uops>=3                        :           3999823498
    lsd.uops>=4                        :           3999823498
    ild_stall.lcp                      :                    0
    ild_stall.iq_full                  :                 3468
    Instructions Issued                :          17000001813
    Unhalted core cycles               :           5323278281
    Unhalted reference cycles          :           4258969200
    br_inst_exec.all_branches          :           1000016626
    br_inst_exec.0x81                  :           1000016616
    br_inst_exec.0x82                  :                    0
    icache.misses                      :                  294
    Instructions Issued                :          17000001812
    Unhalted core cycles               :           5315098728
    Unhalted reference cycles          :           4253082504
    br_misp_exec.all_branches          :                    5
    br_misp_exec.0x81                  :                    2
    br_misp_exec.0x82                  :                    0
    fp_assist.any                      :                    0
    Instructions Issued                :          17000001819
    Unhalted core cycles               :           5338484610
    Unhalted reference cycles          :           4271432976
    cpu_clk_unhalted.core_clk          :           5338494250
    cpu_clk_unhalted.ref_xclk          :            177976806
    baclears.any                       :                    1
                                       :                    0
    

    We may see that on Haswell, everything is well-oiled. I'll make a few notes from the above stats:

  • Instructions issued is incredibly consistent for me. It's always around 17000001800, which is a good sign: it means we can make a very good estimate of our overhead. The same goes for the other fixed-function counters. The fact that they all match reasonably well means that the tests in batches of 4 are apples-to-apples comparisons.
  • With a core:reference cycle ratio of around 5305920785/4245764952, we get an average frequency scaling of ~1.25; this jibes well with my observation that my core clocked up from 2.4 GHz to 3.0 GHz. cpu_clk_unhalted.core_clk/(10.0*cpu_clk_unhalted.ref_xclk) gives just under 3 GHz too.
  • The ratio of instructions issued to core cycles gives the IPC, 17000001807/5305920785 ~ 3.20, which is also about right: 2 FMA+1 VPADDD every clock cycle for 4 clock cycles, and 2 extra loop control instructions every 5th clock cycle that go in parallel.
  • uops_issued.any: The number of instructions issued is ~17B, but the number of uops issued is ~16B. That's because the two loop-control instructions are fusing together; good sign. Moreover, in around 1.3B clock cycles out of 5.3B (25% of the time) no uops were issued, while for nearly all of the rest of the time (4B clock cycles) 4 uops were issued at a time.
  • uops_executed_port.port_[0-7]: Port saturation. We're in good health. Of the 16B post-fusion uops, Ports 0, 1 and 5 ate 5B uops each over 5.3B cycles (which means they were distributed optimally: float, float, int respectively), Port 6 ate 1B (the fused dec-branch op), and Ports 2, 3, 4 and 7 ate negligible amounts by comparison.
  • resource_stalls: 1.3B of them occurred, 2/3 of which were due to the reservation station (RS) and the other third to unknown causes.
  • From the cumulative distribution we built with our comparisons on uops_retired.all and inst_retired.all, we know we are retiring 4 uops 60% of the time, 0 uops 13% of the time, and 2 uops the rest of the time, with negligible amounts otherwise (the differencing behind this is sketched in the short snippet after this list).
  • (Numerous *idq* counts): The IDQ only rarely holds us up.
  • lsd: The Loop Stream Detector is working; nearly 16B fused uops were supplied to the frontend from it.
  • ild: Instruction length decoding is not the bottleneck, and not a single length-changing prefix is encountered.
  • br_inst_exec/br_misp_exec: Branch misprediction is a negligible problem.
  • icache.misses: Negligible.
  • fp_assist: Negligible. Denormals not encountered. (I believe that without DAZ denormals-are-zero flushing, they'd require an assist, which should register here.)
  • So on Intel Haswell it's smooth sailing. If you could run my suite on your machines, that would be great.
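
    The per-cycle retirement distribution quoted above falls out of the cumulative >=k counters by simple differencing; a short sketch with (rounded) numbers from the Haswell run:

    #include <stdio.h>

    /* "Exactly k uops retired in a cycle" = (cycles with >=k) - (cycles with >=k+1), */
    /* divided by total core cycles. Values rounded from the uops_retired.all batch.  */
    int main(void)
    {
        double cycles = 5.3e9;                                  /* core cycles     */
        double ge[]   = {4.65e9, 4.45e9, 3.55e9, 3.34e9, 0.0};  /* >=1 ... >=5     */
        printf("exactly 0 uops: %4.1f%% of cycles\n", 100.0 * (cycles - ge[0]) / cycles);
        for (int k = 1; k <= 4; k++)
            printf("exactly %d uops: %4.1f%% of cycles\n",
                   k, 100.0 * (ge[k - 1] - ge[k]) / cycles);
        return 0;
    }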

    Instructions for Reproduction

  • Rule #1: Inspect all my code before doing anything with it. Never blindly trust strangers on the Internet.
  • Grab perfcountdemo.c, libperfcount.c and libperfcount.h, put them in the same directory and compile them together.
  • Grab perfcount.c and Makefile, put them in the same directory, and make the kernel module.
  • Reboot your machine with the GRUB boot flags nmi_watchdog=0 modprobe.blacklist=iTCO_wdt,iTCO_vendor_support. The NMI watchdog will tamper with the unhalted-core-cycle counter otherwise.
  • insmod the module with insmod perfcount.ko. dmesg | tail -n 10 should say it successfully loaded and report 3 Ff counters and 4 Gp counters, or else give a reason for failing to do so.
  • Run my application, preferably while the rest of the system is not under load. Try also changing the core to which you restrict your affinity by changing the argument to pfcPinThread() in perfcountdemo.c.
  • Edit the results in here.

  • Update: the previous version contained 6 VPADDD instructions (vs 5 in the question), and the extra VPADDD caused an imbalance on Broadwell. After it was fixed, Haswell, Broadwell and Skylake issue almost the same number of uops to ports 0, 1 and 5.

    There is no port contamination, but uops are scheduled suboptimally, with the majority of uops going to Port 5 on Broadwell, making it the bottleneck before Ports 0 and 1 are saturated.

    To demonstrate what is going on, I suggest (ab)using the demo on PeachPy.IO:

  • Open www.peachpy.io in Google Chrome (it won't work in other browsers).

  • Replace the default code (which implements the SDOT function) with the code below, which is literally your example ported to PeachPy syntax:

    n = Argument(size_t)
    x = Argument(ptr(const_float_))
    incx = Argument(size_t)
    y = Argument(ptr(const_float_))
    incy = Argument(size_t)
    
    with Function("sdot", (n, x, incx, y, incy)) as function:
        reg_n = GeneralPurposeRegister64()
        LOAD.ARGUMENT(reg_n, n)
    
        VZEROALL()
    
        with Loop() as loop:
            for i in range(15):
                ymm_i = YMMRegister(i)
                if i < 10:
                    VFMADD231PS(ymm_i, ymm_i, ymm_i)
                else:
                    VPADDD(ymm_i, ymm_i, ymm_i)
            DEC(reg_n)
            JNZ(loop.begin)
    
        RETURN()
    
  • I have a number of machines on different microarchitectures as a backend for PeachPy.io. Choose Intel Haswell, Intel Broadwell, or Intel Skylake and press "Quick Run". The system will compile your code, upload it to the server, and visualize the performance counters collected during execution.

  • Here is the uops distribution over execution ports on Intel Haswell:

  • (plot: port pressure on Intel Haswell)

  • And here is the same plot from Intel Broadwell:
  • (plot: port pressure on Intel Broadwell)

  • Apparently, whatever the flaw in the uop scheduler was, it has been fixed in Intel Skylake, because the port pressure on that machine is the same as on Haswell.