I've got no explanation for you, since I'm on Haswell, but I do have code to share that might help you or someone else with Broadwell or Skylake hardware isolate the problem. If you could please run it on your machine and share the results, we could gain insight into what's happening on your machine.
Recent Intel Core i7 processors have 7 performance monitor counters (PMCs), 3 fixed-function and 4 general-purpose, that may be used to profile code. The fixed-function PMCs are:
Instructions retired
Unhalted core cycles (Clock ticks including the effects of TurboBoost)
Unhalted reference cycles (Fixed-frequency clock ticks)
The ratio of core:reference clock cycles determines the relative speedup or slowdown from dynamic frequency scaling.
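For reference, once the counters are enabled and user-mode RDPMC is permitted (something a helper module such as the perfcount LKM described below can arrange), the fixed-function counters can be read directly with the RDPMC instruction. A minimal sketch, not part of the perfcount suite (the FIXED_CTR macro and the dummy workload are illustrative only):
/* Minimal sketch: read the three fixed-function PMCs around a region of
 * interest via RDPMC. Assumes the counters are already enabled and that
 * user-mode RDPMC is permitted (CR4.PCE), e.g. by a helper LKM. */
#include <stdio.h>
#include <x86intrin.h>              /* __rdpmc() */
/* RDPMC selects fixed-function counter i when bit 30 of its argument is set:
 * 0 = instructions retired, 1 = unhalted core cycles, 2 = unhalted reference
 * cycles. */
#define FIXED_CTR(i) ((1 << 30) | (i))
int main(void){
    unsigned long long ins0  = __rdpmc(FIXED_CTR(0));
    unsigned long long core0 = __rdpmc(FIXED_CTR(1));
    unsigned long long ref0  = __rdpmc(FIXED_CTR(2));
    volatile double acc = 0.0;                 /* dummy region of interest */
    for(long i = 0; i < 100000000; i++) acc += 1.0;
    unsigned long long ins1  = __rdpmc(FIXED_CTR(0));
    unsigned long long core1 = __rdpmc(FIXED_CTR(1));
    unsigned long long ref1  = __rdpmc(FIXED_CTR(2));
    printf("instructions retired : %llu\n", ins1 - ins0);
    printf("core cycles          : %llu\n", core1 - core0);
    printf("core:reference ratio : %.3f (frequency scaling)\n",
           (double)(core1 - core0) / (double)(ref1 - ref0));
    return 0;
}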
Although software exists (see the comments below) that accesses these counters, I didn't know about it at the time, and I still find it insufficiently fine-grained.
I therefore wrote myself a Linux kernel module, perfcount, over the past few days to grant me access to the Intel performance counter monitors, along with a userspace testbench and library that wrap your FMA code in calls to my LKM. Instructions for reproducing my setup follow below.
My testbench source code is below. It warms up, then runs your code several times, testing it over a long list of metrics. I changed your loop count to 1 billion. Because only 4 general-purpose PMCs can be programmed at once, I do the measurements 4 at a time.
perfcountdemo.c
/* Includes */
#include "libperfcount.h"
#include <stdint.h>   /* uint64_t (the header names were stripped by the original formatting) */
#include <stdio.h>
#include <stdlib.h>   /* exit(), strtoull() */
/* Function prototypes */
void code1(void);
void code2(void);
void code3(void);
void code4(void);
void code5(void);
/* Global variables */
void ((*FN_TABLE[])(void)) = {
code1,
code2,
code3,
code4,
code5
};
/**
* Code snippets to bench
*/
void code1(void){
asm volatile(
".intel_syntax noprefix\n\t"
"vzeroall\n\t"
"mov rcx, 1000000000\n\t"
"LstartLabel1:\n\t"
"vfmadd231ps %%ymm0, %%ymm0, %%ymm0\n\t"
"vfmadd231ps ymm1, ymm1, ymm1\n\t"
"vfmadd231ps ymm2, ymm2, ymm2\n\t"
"vfmadd231ps ymm3, ymm3, ymm3\n\t"
"vfmadd231ps ymm4, ymm4, ymm4\n\t"
"vfmadd231ps ymm5, ymm5, ymm5\n\t"
"vfmadd231ps ymm6, ymm6, ymm6\n\t"
"vfmadd231ps ymm7, ymm7, ymm7\n\t"
"vfmadd231ps ymm8, ymm8, ymm8\n\t"
"vfmadd231ps ymm9, ymm9, ymm9\n\t"
"vpaddd ymm10, ymm10, ymm10\n\t"
"vpaddd ymm11, ymm11, ymm11\n\t"
"vpaddd ymm12, ymm12, ymm12\n\t"
"vpaddd ymm13, ymm13, ymm13\n\t"
"vpaddd ymm14, ymm14, ymm14\n\t"
"dec rcx\n\t"
"jnz LstartLabel1\n\t"
".att_syntax noprefix\n\t"
: /* No outputs we care about */
: /* No inputs we care about */
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
"rcx",
"memory"
);
}
void code2(void){
}
void code3(void){
}
void code4(void){
}
void code5(void){
}
/* Test Schedule */
const char* const SCHEDULE[] = {
/* Batch */
"uops_issued.any",
"uops_issued.any<1",
"uops_issued.any>=1",
"uops_issued.any>=2",
/* Batch */
"uops_issued.any>=3",
"uops_issued.any>=4",
"uops_issued.any>=5",
"uops_issued.any>=6",
/* Batch */
"uops_executed_port.port_0",
"uops_executed_port.port_1",
"uops_executed_port.port_2",
"uops_executed_port.port_3",
/* Batch */
"uops_executed_port.port_4",
"uops_executed_port.port_5",
"uops_executed_port.port_6",
"uops_executed_port.port_7",
/* Batch */
"resource_stalls.any",
"resource_stalls.rs",
"resource_stalls.sb",
"resource_stalls.rob",
/* Batch */
"uops_retired.all",
"uops_retired.all<1",
"uops_retired.all>=1",
"uops_retired.all>=2",
/* Batch */
"uops_retired.all>=3",
"uops_retired.all>=4",
"uops_retired.all>=5",
"uops_retired.all>=6",
/* Batch */
"inst_retired.any_p",
"inst_retired.any_p<1",
"inst_retired.any_p>=1",
"inst_retired.any_p>=2",
/* Batch */
"inst_retired.any_p>=3",
"inst_retired.any_p>=4",
"inst_retired.any_p>=5",
"inst_retired.any_p>=6",
/* Batch */
"idq_uops_not_delivered.core",
"idq_uops_not_delivered.core<1",
"idq_uops_not_delivered.core>=1",
"idq_uops_not_delivered.core>=2",
/* Batch */
"idq_uops_not_delivered.core>=3",
"idq_uops_not_delivered.core>=4",
"rs_events.empty",
"idq.empty",
/* Batch */
"idq.mite_all_uops",
"idq.mite_all_uops<1",
"idq.mite_all_uops>=1",
"idq.mite_all_uops>=2",
/* Batch */
"idq.mite_all_uops>=3",
"idq.mite_all_uops>=4",
"move_elimination.int_not_eliminated",
"move_elimination.simd_not_eliminated",
/* Batch */
"lsd.uops",
"lsd.uops<1",
"lsd.uops>=1",
"lsd.uops>=2",
/* Batch */
"lsd.uops>=3",
"lsd.uops>=4",
"ild_stall.lcp",
"ild_stall.iq_full",
/* Batch */
"br_inst_exec.all_branches",
"br_inst_exec.0x81",
"br_inst_exec.0x82",
"icache.misses",
/* Batch */
"br_misp_exec.all_branches",
"br_misp_exec.0x81",
"br_misp_exec.0x82",
"fp_assist.any",
/* Batch */
"cpu_clk_unhalted.core_clk",
"cpu_clk_unhalted.ref_xclk",
"baclears.any"
};
const int NUMCOUNTS = sizeof(SCHEDULE)/sizeof(*SCHEDULE);
/**
* Main
*/
int main(int argc, char* argv[]){
int i;
/**
* Initialize
*/
pfcInit();
if(argc <= 1){
pfcDumpEvents();
exit(1);
}
pfcPinThread(3);
/**
* Arguments are:
*
* perfcountdemo #codesnippet
*
* There is a schedule of configuration that is followed.
*/
void (*fn)(void) = FN_TABLE[strtoull(argv[1], NULL, 0)];
static const uint64_t ZERO_CNT[7] = {0,0,0,0,0,0,0};
static const uint64_t ZERO_CFG[7] = {0,0,0,0,0,0,0};
uint64_t cnt[7] = {0,0,0,0,0,0,0};
uint64_t cfg[7] = {2,2,2,0,0,0,0};
/* Warmup */
for(i=0;i<10;i++){
fn();
}
/* Run master loop: program the general-purpose PMCs in batches of four
   counters from SCHEDULE, run the selected snippet, then read and print
   the counts (loop body truncated in the original listing). */
for(i=0;i<NUMCOUNTS;i+=4){
/* ... program the next four counters from SCHEDULE into cfg[3..6], zero
   cnt[], call fn(), read the counters back and print one report line per
   counter ... */
}
return 0;
}
On my machine, I got the following results:
Haswell Core i7-4700MQ
> ./perfcountdemo 0
Instructions Issued : 17000001807
Unhalted core cycles : 5305920785
Unhalted reference cycles : 4245764952
uops_issued.any : 16000811079
uops_issued.any<1 : 1311417889
uops_issued.any>=1 : 4000292290
uops_issued.any>=2 : 4000229358
Instructions Issued : 17000001806
Unhalted core cycles : 5303822082
Unhalted reference cycles : 4243345896
uops_issued.any>=3 : 4000156998
uops_issued.any>=4 : 4000110067
uops_issued.any>=5 : 0
uops_issued.any>=6 : 0
Instructions Issued : 17000001811
Unhalted core cycles : 5314227923
Unhalted reference cycles : 4252020624
uops_executed_port.port_0 : 5016261477
uops_executed_port.port_1 : 5036728509
uops_executed_port.port_2 : 5282
uops_executed_port.port_3 : 12481
Instructions Issued : 17000001816
Unhalted core cycles : 5329351248
Unhalted reference cycles : 4265809728
uops_executed_port.port_4 : 7087
uops_executed_port.port_5 : 4946019835
uops_executed_port.port_6 : 1000228324
uops_executed_port.port_7 : 1372
Instructions Issued : 17000001816
Unhalted core cycles : 5325153463
Unhalted reference cycles : 4261060248
resource_stalls.any : 1322734589
resource_stalls.rs : 844250210
resource_stalls.sb : 0
resource_stalls.rob : 0
Instructions Issued : 17000001814
Unhalted core cycles : 5327823817
Unhalted reference cycles : 4262914728
uops_retired.all : 16000445793
uops_retired.all<1 : 687284798
uops_retired.all>=1 : 4646263984
uops_retired.all>=2 : 4452324050
Instructions Issued : 17000001809
Unhalted core cycles : 5311736558
Unhalted reference cycles : 4250015688
uops_retired.all>=3 : 3545695253
uops_retired.all>=4 : 3341664653
uops_retired.all>=5 : 1016
uops_retired.all>=6 : 1
Instructions Issued : 17000001871
Unhalted core cycles : 5477215269
Unhalted reference cycles : 4383891984
inst_retired.any_p : 17000001871
inst_retired.any_p<1 : 891904306
inst_retired.any_p>=1 : 4593972062
inst_retired.any_p>=2 : 4441024510
Instructions Issued : 17000001835
Unhalted core cycles : 5377202052
Unhalted reference cycles : 4302895152
inst_retired.any_p>=3 : 3555852364
inst_retired.any_p>=4 : 3369559466
inst_retired.any_p>=5 : 999980244
inst_retired.any_p>=6 : 0
Instructions Issued : 17000001826
Unhalted core cycles : 5349373678
Unhalted reference cycles : 4280991912
idq_uops_not_delivered.core : 1580573
idq_uops_not_delivered.core<1 : 5354931839
idq_uops_not_delivered.core>=1 : 471248
idq_uops_not_delivered.core>=2 : 418625
Instructions Issued : 17000001808
Unhalted core cycles : 5309687640
Unhalted reference cycles : 4248083976
idq_uops_not_delivered.core>=3 : 280800
idq_uops_not_delivered.core>=4 : 247923
rs_events.empty : 0
idq.empty : 649944
Instructions Issued : 17000001838
Unhalted core cycles : 5392229041
Unhalted reference cycles : 4315704216
idq.mite_all_uops : 2496139
idq.mite_all_uops<1 : 5397877484
idq.mite_all_uops>=1 : 971582
idq.mite_all_uops>=2 : 595973
Instructions Issued : 17000001822
Unhalted core cycles : 5347205506
Unhalted reference cycles : 4278845208
idq.mite_all_uops>=3 : 394011
idq.mite_all_uops>=4 : 335205
move_elimination.int_not_eliminated: 0
move_elimination.simd_not_eliminated: 0
Instructions Issued : 17000001812
Unhalted core cycles : 5320621549
Unhalted reference cycles : 4257095280
lsd.uops : 15999287982
lsd.uops<1 : 1326629729
lsd.uops>=1 : 3999821996
lsd.uops>=2 : 3999821996
Instructions Issued : 17000001813
Unhalted core cycles : 5320533147
Unhalted reference cycles : 4257105096
lsd.uops>=3 : 3999823498
lsd.uops>=4 : 3999823498
ild_stall.lcp : 0
ild_stall.iq_full : 3468
Instructions Issued : 17000001813
Unhalted core cycles : 5323278281
Unhalted reference cycles : 4258969200
br_inst_exec.all_branches : 1000016626
br_inst_exec.0x81 : 1000016616
br_inst_exec.0x82 : 0
icache.misses : 294
Instructions Issued : 17000001812
Unhalted core cycles : 5315098728
Unhalted reference cycles : 4253082504
br_misp_exec.all_branches : 5
br_misp_exec.0x81 : 2
br_misp_exec.0x82 : 0
fp_assist.any : 0
Instructions Issued : 17000001819
Unhalted core cycles : 5338484610
Unhalted reference cycles : 4271432976
cpu_clk_unhalted.core_clk : 5338494250
cpu_clk_unhalted.ref_xclk : 177976806
baclears.any : 1
: 0
We can see that on Haswell, everything is in good health. I will make a few notes from the stats above:
Instructions issued is extremely consistent for me. It is always around 17000001800, which is a good sign: it means we can make a very good estimate of our overhead. The same goes for the other fixed-function counters. The fact that they all match reasonably well means that the tests in batches of 4 are apples-to-apples comparisons.
With a core:reference cycle ratio of about 5305920785/4245764952, we get an average frequency scaling of ~1.25; this agrees with my observation that my core clocked up from 2.4 GHz to 3.0 GHz. cpu_clk_unhalted.core_clk/(10.0*cpu_clk_unhalted.ref_xclk) also gives a frequency of just under 3 GHz. (A worked check of this arithmetic and of the IPC below appears after these notes.)
The ratio of instructions issued to core cycles gives the IPC, 17000001807/5305920785 ~ 3.20, which is also about right: 2 FMA+1 VPADDD every clock cycle for 4 clock cycles, and 2 extra loop control instructions every 5th clock cycle that go in parallel.
uops_issued.any
: The number of instructions issued is ~17B, but the number of uops issued is ~16B. That's because the two instructions for loop control are fusing together; Good sign. Moreover, around 1.3B clock cycles out of 5.3B (25% of the time), no uops were issued, while the near-totality of the rest of the time (4B clock cycles), 4 uops issued at a time.
uops_executed_port.port_[0-7]
: Port saturation. We're in good health. Of the 16B post-fusion uops, Ports 0, 1 and 5 ate 5B uops each over 5.3B cycles (Which means they were distributed optimally: Float, float, int respectively), Port 6 ate 1B (the fused dec-branch op), and ports 2, 3, 4 and 7 ate negligible amounts by comparison.
resource_stalls
: 1.3B of them occurred, 2/3 of which were due to the reservation station (RS) and the other third to unknown causes.
From the cumulative distribution we built with our comparisons on uops_retired.all
and inst_retired.any_p
, we know we are retiring 4 uops 60% of the time, 0 uops 13% of the time and 2 uops the rest of the time, with negligible amounts otherwise.
(Numerous *idq*
counts): The IDQ only rarely holds us up.
lsd
: The Loop Stream Detector is working; Nearly 16B fused uops were supplied to the frontend from it.
ild
: Instruction-length decoding is not the bottleneck, and not a single length-changing prefix is encountered.
br_inst_exec/br_misp_exec
: Branch mispredictions are a negligible problem.
icache.misses
: Negligible.
fp_assist
: Negligible. No denormals are encountered. (I believe that if there were any, then without DAZ denormals-are-zero flushing they would require an assist, which should register here.)
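To make the frequency-scaling and IPC arithmetic in the notes above concrete, here is a tiny standalone check (not part of the testbench; it just retypes the first batch of measurements into a few lines of C):
/* Worked check of the frequency-scaling and IPC figures quoted above,
 * using the first batch of measurements from the Haswell run. */
#include <stdio.h>
int main(void){
    const double instructions = 17000001807.0; /* instructions issued       */
    const double core_cycles  = 5305920785.0;  /* unhalted core cycles      */
    const double ref_cycles   = 4245764952.0;  /* unhalted reference cycles */
    /* Average dynamic-frequency scaling: core clock vs. reference clock.   */
    printf("scaling = %.3f (~ 3.0 GHz / 2.4 GHz)\n", core_cycles / ref_cycles);
    /* Instructions issued per unhalted core cycle.                         */
    printf("IPC     = %.2f\n", instructions / core_cycles);
    return 0;
}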
So on Intel Haswell, it's smooth sailing. If you could run my suite on your machines, that would be great.
Instructions for Reproduction
Rule #1: Inspect all of my code before using it. Never blindly trust strangers on the Internet.
Grab perfcountdemo.c, libperfcount.c and libperfcount.h, put them in the same directory, and compile them together.
Grab perfcount.c and the Makefile, put them in the same directory, and make the kernel module.
Reboot your machine with the GRUB boot flags nmi_watchdog=0 modprobe.blacklist=iTCO_wdt,iTCO_vendor_support. Otherwise the NMI watchdog will tamper with the unhalted-core-cycle counter.
Load the module with insmod perfcount.ko. dmesg | tail -n 10 should report that it loaded successfully and that there are 3 Ff (fixed-function) and 4 Gp (general-purpose) counters, or else give a reason for failing to do so.
Run my application, preferably while the rest of the system is not under load. In perfcountdemo.c you can change the core to which affinity is restricted by changing the argument to pfcPinThread().
Edit your results in here.
@Zboson I've updated the answer with results for more relevant counters on my machine, and cleaned up the software to the point where I believe it may be more widely useful, e.g. for folks like you.
2> Marat Dukhan:
UPDATE: The previous version contained 6 VPADDD instructions (versus the 5 in the question), and the extra VPADDD caused an imbalance on Broadwell. After the fix, Haswell, Broadwell and Skylake issue nearly the same number of uops to ports 0, 1 and 5.
There is no port contamination, but the uops are scheduled suboptimally: on Broadwell most of them go to Port 5, which becomes the bottleneck before Ports 0 and 1 are saturated.
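A rough back-of-the-envelope estimate of why that hurts, under the assumption (mine, not something PeachPy reports) that FMA uops can issue to ports 0/1 and VPADDD uops to ports 0/1/5: with the 6-VPADDD body, piling every VPADDD onto port 5 raises the per-iteration bottleneck from the ideal 16/3 ~ 5.33 uops on the busiest port to 6.
/* Back-of-the-envelope port pressure for the 6-VPADDD loop body.
 * Assumption: FMA can issue to ports 0/1, VPADDD to ports 0/1/5. */
#include <stdio.h>
static double max3(double a, double b, double c){
    double m = a > b ? a : b;
    return m > c ? m : c;
}
int main(void){
    const double fma = 10.0, vpaddd = 6.0;
    /* Ideal: spread the 16 port-0/1/5-bound uops evenly over three ports. */
    printf("ideal bottleneck  : %.2f uops/port/iteration\n",
           (fma + vpaddd) / 3.0);
    /* Broadwell-like schedule: every VPADDD lands on port 5.              */
    printf("all VPADDDs on p5 : %.2f uops/port/iteration\n",
           max3(fma / 2.0, fma / 2.0, vpaddd));
    return 0;
}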
To demonstrate what is going on, I suggest (ab)using the demo on PeachPy.IO:
Open www.peachpy.io in Google Chrome (it won't work in other browsers).
Replace the default code (which implements the SDOT function) with the code below, which is literally your example ported to PeachPy syntax:
n = Argument(size_t)
x = Argument(ptr(const_float_))
incx = Argument(size_t)
y = Argument(ptr(const_float_))
incy = Argument(size_t)
with Function("sdot", (n, x, incx, y, incy)) as function:
    reg_n = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_n, n)

    VZEROALL()

    with Loop() as loop:
        for i in range(15):
            ymm_i = YMMRegister(i)
            if i < 10:
                VFMADD231PS(ymm_i, ymm_i, ymm_i)
            else:
                VPADDD(ymm_i, ymm_i, ymm_i)
        DEC(reg_n)
        JNZ(loop.begin)

    RETURN()
I have a number of machines with different microarchitectures as backends for PeachPy.io. Choose Intel Haswell, Intel Broadwell or Intel Skylake and press "Quick Run". The system will compile your code, upload it to the server, and visualize the performance counters collected during execution.
Here is the distribution of uops over execution ports on Intel Haswell:
And here is the plot for Intel Broadwell:
Apparently, whatever the flaw in the uop scheduler was, it has been fixed in Intel Skylake, because the port pressure on that machine is the same as on Haswell.
@IwillnotexistIdonotexist @PeterCordes You can get the raw event counts. Compile the source locally with PeachPy as `python -m peachpy.x86_64 -mabi=sysv -mimage-format=elf -mcpu=haswell experiment.py -o experiment.o`, then upload it with `wget` as `wget --header="Content-Type: application/octet-stream" --post-file=experiment.o "http://www.peachpy.io/run/broadwell?kernel=sdot&n=1000&incx=1&incy=1&offx=0&offy=0" -q -O -`. Replace `broadwell` with `haswell` or `skylake` if needed.
@MaratDukhan Aha! *You do indeed have 6 VPADDDs*. Try replacing `for i in range(16):` with `for i in range(15):`. Interestingly, HSW can load-balance that extra VPADDD while BRW doesn't.
@IwillnotexistIdonotexist Indeed! I hadn't noticed that the original code doesn't use all the registers.
@MaratDukhan I still think your post is very valuable; you've clearly shown that Haswell can juggle `5 1/3` VPADDDs onto `p5`, `2/3` VPADDDs onto `p1`, `4 2/3` FMAs onto `p1` and `5 1/3` FMAs onto `p0`, while BRW seemingly can't. This is worth asking as a question of its own.
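A quick consistency check of those fractions, restating the comment's numbers in a few lines of C (assuming the 10-FMA + 6-VPADDD loop body): they add up to 10 FMAs and 6 VPADDDs, with every port at 5 1/3 uops per iteration.
/* Sanity check of the per-port distribution quoted in the comment above. */
#include <stdio.h>
int main(void){
    const double p0_fma    = 5.0 + 1.0/3.0;   /* 5 1/3 FMAs on p0    */
    const double p1_fma    = 4.0 + 2.0/3.0;   /* 4 2/3 FMAs on p1    */
    const double p1_vpaddd = 2.0/3.0;         /* 2/3 VPADDDs on p1   */
    const double p5_vpaddd = 5.0 + 1.0/3.0;   /* 5 1/3 VPADDDs on p5 */
    printf("FMAs    : %.2f (expect 10)\n", p0_fma + p1_fma);
    printf("VPADDDs : %.2f (expect 6)\n",  p1_vpaddd + p5_vpaddd);
    printf("p0=%.2f  p1=%.2f  p5=%.2f uops/iteration\n",
           p0_fma, p1_fma + p1_vpaddd, p5_vpaddd);
    return 0;
}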