首页 > 解决方案 > 编译器是否只完全展开外循环?

问题描述

我尝试编译下面这段代码,并使用针对循环的编译指示(pragma)告诉编译器将计数循环展开多少次。

#include <vector>
// Minimal reproducer for the question: writes v[i*512 + j] = i*j over a
// 16 x 512 nested loop, with a GCC loop-specific unroll pragma applied to the
// outer loop only. The listings that follow are the compiler output for this
// exact source, so the code is kept verbatim.
int main() {
  std::vector<int> v(8192);  // 16 * 512 elements, indexed below as i*512 + j
#pragma GCC unroll 8 // factor under test; the second experiment uses 16
  for (int i = 0; i < 16; i++) {      // outer loop: the one the pragma targets
    for (int j = 0; j < 512; j++) {   // inner loop: no pragma attached
      v[i*512+j] = i*j;
    }
  }
  return 0;
}

当我在外层 for 循环之前放置 #pragma GCC unroll 8 时,编译器不会展开该循环。

# Excerpt of GCC -O2 output (GAS, Intel syntax) for the `unroll 8` case.
# The outer loop is still a real loop (.L3) around a 4-lane SSE2 inner loop (.L4).
# rbp, rsi, rdx, ecx, xmm5 and xmm6 are initialized before this excerpt.
# NOTE(review): presumably xmm6 = {0,1,2,3} (initial j values) and
# xmm5 = {4,4,4,4} (per-iteration step) -- confirm in the full listing.
.L3:                                    # outer loop body, executed once per value of ecx
        movd    xmm7, ecx
        mov     rax, rsi                # rax = store pointer for this row
        movdqa  xmm2, xmm6              # restart the running vector from xmm6
        pshufd  xmm3, xmm7, 0           # xmm3 = ecx broadcast to all four dword lanes
        movdqa  xmm4, xmm3
        psrlq   xmm4, 32                # xmm4 = broadcast with lanes 1/3 moved down to lanes 0/2
.L4:                                    # inner loop: 4 dword products (16 bytes) per iteration
        movdqa  xmm0, xmm2              # xmm0 = current running vector
        movdqa  xmm1, xmm3
        paddd   xmm2, xmm5              # advance the running vector by the step in xmm5
        add     rax, 16
        pmuludq xmm1, xmm0              # 64-bit products of lanes 0 and 2
        psrlq   xmm0, 32                # move lanes 1/3 down so pmuludq can see them
        pmuludq xmm0, xmm4              # 64-bit products of lanes 1 and 3
        pshufd  xmm1, xmm1, 8           # pack low 32 bits of both products into dwords 0,1
        pshufd  xmm0, xmm0, 8
        punpckldq       xmm1, xmm0      # interleave -> products for lanes 0,1,2,3 in order
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx                # rdx = end of the current row
        jne     .L4
        add     ecx, 1
        add     rsi, 2048               # next row start: 2048 bytes further on
        lea     rdx, [rax+2048]         # next row end (rax == current row end here)
        cmp     ecx, 16                 # 16 outer iterations in total
        jne     .L3
        mov     rdi, rbp
        mov     esi, 16384
        call    _ZdlPvm                 # operator delete(void*, unsigned long) on the buffer at rbp
        # NOTE(review): the size passed is 16384, but the loops above write
        # 16 * 2048 = 32768 bytes (8192 ints in the C++ snippet) -- confirm
        # against the Godbolt link that listing and source match.
        xor     eax, eax                # return 0
        pop     rbp
        ret

但是当我在外层 for 循环之前放置 #pragma GCC unroll 16 时,编译器会成功展开外层循环。

# Excerpt of GCC -O2 output (GAS, Intel syntax) for the `unroll 16` case.
# The outer loop is gone: there is one zero-fill followed by 15 separate
# vector loops (.L3 .. .L17), each covering 2048 bytes and each multiplying
# the running vector by a different constant (1 .. 15) via shifts/adds/subs.
# rbp (buffer base) and rbx are set before this excerpt.
# NOTE(review): .LC0 / .LC1 are not shown; presumably .LC0 = {0,1,2,3} and
# .LC1 = {4,4,4,4} -- confirm in the full listing.
.L2:                                    # zero-fill bytes [rbp, rbp+2048) and load constants
        lea     rdi, [rbp+8]
        mov     rcx, rbp
        movdqa  xmm2, XMMWORD PTR .LC0[rip]    # xmm2 = initial running vector, reused by every loop
        xor     eax, eax                # rax = 0: the value rep stosq stores
        and     rdi, -8                 # rdi = first 8-byte-aligned address after rbp
        movdqa  xmm0, XMMWORD PTR .LC1[rip]    # xmm0 = per-iteration step for the running vector
        mov     QWORD PTR [rbp+0], 0    # first qword written directly (may be unaligned)
        lea     rdx, [rbp+4096]         # end pointer for .L3
        sub     rcx, rdi
        movdqa  xmm1, xmm2
        mov     QWORD PTR [rbp+2040], 0 # last qword of the 2048-byte range written directly
        add     ecx, 2048
        shr     ecx, 3                  # ecx = (rbp - rdi + 2048) / 8 qwords
        rep stosq
        lea     rax, [rbp+2048]         # store pointer for .L3
.L3:                                    # bytes [2048, 4096): store the running vector as-is (x1)
        movdqa  xmm3, xmm1
        add     rax, 16
        paddd   xmm1, xmm0
        movups  XMMWORD PTR [rax-16], xmm3
        cmp     rax, rdx
        jne     .L3
        lea     rdx, [rbp+6144]
        movdqa  xmm3, xmm2
.L4:                                    # bytes [4096, 6144): x << 1 (x2)
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L4
        lea     rdx, [rbp+8192]
        movdqa  xmm3, xmm2
.L5:                                    # bytes [6144, 8192): (x << 1) + x (x3)
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L5
        mov     rax, rdx
        movdqa  xmm3, xmm2
        lea     rdx, [rbp+10240]
.L6:                                    # bytes [8192, 10240): x << 2 (x4)
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 2
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L6
        mov     rdx, rax                # roles swap: rdx becomes the store pointer,
        movdqa  xmm3, xmm2
        lea     rax, [rbp+12288]        # rax the end pointer, for .L7 only
.L7:                                    # bytes [10240, 12288): (x << 2) + x (x5)
        movdqa  xmm4, xmm3
        add     rdx, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 2
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rdx-16], xmm1
        cmp     rax, rdx
        jne     .L7
        lea     rdx, [rbp+14336]
        movdqa  xmm3, xmm2
.L8:                                    # bytes [12288, 14336): ((x << 1) + x) << 1 (x6)
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L8
        movdqa  xmm3, xmm2
.L9:                                    # starts at byte 14336: (x << 3) - x (x7)
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        psubd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rbx                # NOTE(review): rbx is set outside this excerpt; presumably rbp+16384 -- confirm
        jne     .L9
        lea     rdx, [rbp+18432]
        movdqa  xmm3, xmm2
.L10:                                   # ends at byte 18432: x << 3 (x8)
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L10
        lea     rdx, [rbp+20480]
        movdqa  xmm3, xmm2
.L11:                                   # bytes [18432, 20480): (x << 3) + x (x9)
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L11
        lea     rax, [rbp+22528]        # roles swap again: rdx is the store pointer in .L12
        movdqa  xmm3, xmm2
.L12:                                   # bytes [20480, 22528): ((x << 2) + x) << 1 (x10)
        movdqa  xmm4, xmm3
        add     rdx, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 2
        paddd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rdx-16], xmm1
        cmp     rax, rdx
        jne     .L12
        lea     rdx, [rbp+24576]
        movdqa  xmm4, xmm2
.L13:                                   # bytes [22528, 24576): (((x << 1) + x) << 2) - x (x11)
        movdqa  xmm3, xmm4
        add     rax, 16
        paddd   xmm4, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 1
        paddd   xmm1, xmm3
        pslld   xmm1, 2
        psubd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L13
        lea     rdx, [rbp+26624]
        movdqa  xmm3, xmm2
.L14:                                   # bytes [24576, 26624): ((x << 1) + x) << 2 (x12)
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        pslld   xmm1, 2
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L14
        lea     rdx, [rbp+28672]
        movdqa  xmm4, xmm2
.L15:                                   # bytes [26624, 28672): (((x << 1) + x) << 2) + x (x13)
        movdqa  xmm3, xmm4
        add     rax, 16
        paddd   xmm4, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 1
        paddd   xmm1, xmm3
        pslld   xmm1, 2
        paddd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L15
        lea     rdx, [rbp+30720]
        movdqa  xmm3, xmm2
.L16:                                   # bytes [28672, 30720): ((x << 3) - x) << 1 (x14)
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        psubd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L16
        mov     rax, rdx
        lea     rdx, [rbp+32768]
.L17:                                   # bytes [30720, 32768): (x << 4) - x (x15); consumes xmm2 itself
        movdqa  xmm3, xmm2
        add     rax, 16
        paddd   xmm2, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 4
        psubd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L17
        mov     rdi, rbp
        mov     esi, 16384
        call    _ZdlPvm                 # operator delete(void*, unsigned long) on the buffer at rbp
        # NOTE(review): the size passed is 16384, yet the loops above store up
        # to rbp+32768 -- confirm against the Godbolt link that listing and
        # C++ source match.
        add     rsp, 8
        xor     eax, eax                # return 0
        pop     rbx
        pop     rbp
        ret

那么编译器是否只完全展开外循环?

GCC 版本:g++ (Compiler-Explorer-Build-gcc-b8ef019ab938471f7f877a1eee3a6374fd8a6ae9-binutils-2.36.1) 12.0.0 20211029 (实验性)

选项:-O2

Godbolt:https://godbolt.org/z/zq7TWesY9

标签: assembly gcc x86-64 compiler-optimization loop-unrolling

解决方案


使用 -O2 -funroll-loops 似乎可以解决问题(https://godbolt.org/z/PT6T1691W),显然需要启用该选项,编译指示才能告诉 GCC 展开多少次。(更新:或者至少使它产生一些效果。见评论,这似乎还不是一个完整的答案。)

(-funroll-loops 默认情况下不启用,除非您使用 -fprofile-use,即先用 -fprofile-generate 编译、并用具有代表性的输入运行程序之后。很久以前它在 -O3 时默认启用,但代码膨胀带来的 I-cache 压力通常会让不够热的循环变得更糟。这会导致这样的代码:真正耗时的循环只有几条 SIMD 指令,而完全展开的标量序言/尾声的指令数量却是它的 10 倍,在向量更宽时尤其如此。即使使用 AVX-512,GCC 通常也只是用标量代码处理奇数个(零散的)元素,而不是创建掩码。:/)


完全展开循环是 GCC 即使在 -O2 下也会做的事情,至少对于非常小的行程计数是这样。(例如,对一个 int 数组最多执行 3 次 p[i] += 1;,使用 -O2 -fno-tree-vectorize)。 https://godbolt.org/z/P5rvjYj1b

在 -O2 下,完全展开更大的循环或行程计数更高的循环(此时静态代码大小可能会因此而增加)似乎默认情况下没有启用。(GCC 在其调优选项/参数中称之为剥离循环(peeling),即从循环中剥离所有迭代,这样循环本身就消失了。-fpeel-loops 在 -O3 下是开启的,但在 -O2 下不是。自 GCC 11 起,-fverbose-asm 不再以 asm 注释的形式打印已启用的优化选项列表。)

顺便说一句,现在在 GCC 主干中,-O2 下似乎默认启用了自动矢量化。以前它只在 -O3 下启用,所以这很有趣。


推荐阅读