我分别用 SSE 4.2 和 AVX 2 对两个向量的点积进行了矢量化,代码如下所示。该代码使用 GCC 4.8.4 和 -O2 优化标志编译。正如预期的那样,两者的性能都有所提高(并且 AVX 2 比 SSE 4.2 更快),但是当我用 PAPI 分析代码时,我发现缓存未命中的总数(主要是 L1 和 L2)增加了很多:
没有矢量化:
PAPI_L1_TCM: 784,112,091 PAPI_L2_TCM: 195,315,365 PAPI_L3_TCM: 79,362
使用SSE 4.2:
PAPI_L1_TCM: 1,024,234,171 PAPI_L2_TCM: 311,541,918 PAPI_L3_TCM: 68,842
使用AVX 2:
PAPI_L1_TCM: 2,719,959,741 PAPI_L2_TCM: 1,459,375,105 PAPI_L3_TCM: 108,140
是我的代码有问题,还是这种行为是正常的?
AVX 2代码:
double vec_dotProduct(const vec& vecs, const unsigned int& start_a, const unsigned int& start_b, const int& n) { double dot = 0; register int i = 0; const int loopBound = n-3; __m256d vsum, vecPi, vecCi, vecQCi; vsum = _mm256_set1_pd(0); double * const pA = vecs.x+start_a ; double * const pB = vecs.x+start_b ; for( ; i
(此处之后的代码在文本提取时被截断)
SSE 4.2代码:
double vec_dotProduct(const vec& vecs, const unsigned int& start_a, const unsigned int& start_b, const int& n) { double dot = 0; register int i = 0; const int loopBound = n-1; __m128d vsum, vecPi, vecCi, vecQCi; vsum = _mm_set1_pd(0); double * const pA = vecs.x+start_a ; double * const pB = vecs.x+start_b ; for( ; i
(此处之后的代码在文本提取时被截断)
非矢量化代码:
/**
 * Scalar (non-vectorized) dot product of two n-element runs of vecs.x.
 *
 * Multiplies vecs.x[start_a + k] by vecs.x[start_b + k] for k = 0 .. n-1 and
 * adds the products, in index order, into a double accumulator.
 *
 * @param vecs     object whose member x is indexed for both operands
 * @param start_a  index in vecs.x of the first operand's first element
 * @param start_b  index in vecs.x of the second operand's first element
 * @param n        number of element pairs; the loop does not run when n <= 0
 * @return the accumulated sum (0 when n <= 0)
 */
double dotProduct(const vec& vecs, const unsigned int& start_a, const unsigned int& start_b, const int& n)
{
    double sum = 0;

    // Walk both operands with unsigned running indices, matching the
    // unsigned arithmetic of start + offset.
    unsigned int idx_a = start_a;
    unsigned int idx_b = start_b;

    for (int remaining = n; remaining > 0; --remaining) {
        sum += vecs.x[idx_a] * vecs.x[idx_b];
        ++idx_a;
        ++idx_b;
    }

    return sum;
}
编辑:非矢量化代码的汇编:
0x000000000040f9e0 <+0>: mov (%rcx),%r8d
0x000000000040f9e3 <+3>: test %r8d,%r8d
0x000000000040f9e6 <+6>: jle 0x40fa1d
0x000000000040f9e8 <+8>: mov (%rsi),%eax
0x000000000040f9ea <+10>: mov (%rdi),%rcx
0x000000000040f9ed <+13>: mov (%rdx),%edi
0x000000000040f9ef <+15>: vxorpd %xmm0,%xmm0,%xmm0
0x000000000040f9f3 <+19>: add %eax,%r8d
0x000000000040f9f6 <+22>: sub %eax,%edi
0x000000000040f9f8 <+24>: nopl 0x0(%rax,%rax,1)
0x000000000040fa00 <+32>: mov %eax,%esi
0x000000000040fa02 <+34>: lea (%rdi,%rax,1),%edx
0x000000000040fa05 <+37>: add $0x1,%eax
0x000000000040fa08 <+40>: vmovsd (%rcx,%rsi,8),%xmm1
0x000000000040fa0d <+45>: cmp %r8d,%eax
0x000000000040fa10 <+48>: vmulsd (%rcx,%rdx,8),%xmm1,%xmm1
0x000000000040fa15 <+53>: vaddsd %xmm1,%xmm0,%xmm0
0x000000000040fa19 <+57>: jne 0x40fa00
0x000000000040fa1b <+59>: repz retq
0x000000000040fa1d <+61>: vxorpd %xmm0,%xmm0,%xmm0
0x000000000040fa21 <+65>: retq
编辑2:下面是矢量化与非矢量化代码在较大 N 下的 L1 缓存未命中数对比(x 轴为 N,y 轴为 L1 缓存未命中数)。基本上,对于更大的 N,矢量化版本的缓存未命中数仍然多于非矢量化版本。