首页 > 解决方案 > GCC -O0 使用内部函数生成奇怪的 AVX 额外存储/重新加载指令

问题描述

我创建了一个简单的矢量化 C 函数来对数组中的每个元素进行平方。代码如下:

#include <immintrin.h>
#include <stdlib.h>

/*
 * Squares every element of arr and writes the results to outarr.
 *
 *   arr    - input array of len doubles, read with _mm256_load_pd
 *   len    - number of elements in arr and outarr
 *   outarr - output array of len doubles, written with _mm256_stream_pd
 *
 * Elements are processed four at a time with 256-bit AVX intrinsics; a
 * scalar loop then handles the remaining (len & 3) elements.
 *
 * NOTE(review): the aligned load and the streaming store presumably require
 * &arr[i] and &outarr[i] to be 32-byte aligned (main() allocates both buffers
 * with aligned_alloc(32, ...)) -- confirm against the intrinsics reference.
 *
 * NOTE(review): assuming uint is an unsigned type (the -O0 listing compares
 * with jae), `len - 4` wraps to a huge value when len < 4, so the vector loop
 * would run far past both buffers. Callers must pass len >= 4. The code is
 * intentionally left unchanged so it keeps matching the disassembly listings
 * that follow it on this page.
 */
void square(const double* arr, uint len, double* outarr) {
    __m256d v;  /* four packed doubles loaded from arr */

    /* Vector part: square four doubles per iteration. */
    for (uint i = 0; i <= len - 4; i += 4) {
        v = _mm256_load_pd(&arr[i]);
        _mm256_stream_pd(&outarr[i], _mm256_mul_pd(v, v));
    }

    /* Scalar tail: starts at len rounded down to a multiple of 4. */
    for (uint i = len-(len&3u); i < len; i++) {
        outarr[i] = arr[i]*arr[i];
    }
}

/*
 * Test driver: allocates two 32-byte-aligned buffers of 100 doubles, fills
 * the input with known values, and squares it with square().
 *
 * Returns 0 on success, EXIT_FAILURE if either allocation fails.
 */
int main(void) {
    const unsigned int count = 100u;

    /* aligned_alloc needs a size that is a multiple of the alignment:
       100 * sizeof(double) = 800 = 25 * 32. */
    double *inp = aligned_alloc(32, count * sizeof *inp);
    double *out = aligned_alloc(32, count * sizeof *out);
    int rc = EXIT_FAILURE;

    if (inp != NULL && out != NULL) {
        /* Give the input defined contents; the original read it uninitialized. */
        for (unsigned int i = 0; i < count; i++) {
            inp[i] = (double)i;
        }
        square(inp, count, out);
        rc = 0;
    }

    /* free(NULL) is a no-op, so both calls are safe on the failure path. */
    free(out);
    free(inp);
    return rc;
}

当我编译这段代码时:

gcc main.c -mavx -o main

我得到了 square 函数的以下反汇编:

   0x0000000000400546 <+0>:     lea    0x8(%rsp),%r10
   0x000000000040054b <+5>:     and    $0xffffffffffffffe0,%rsp
   0x000000000040054f <+9>:     pushq  -0x8(%r10)
   0x0000000000400553 <+13>:    push   %rbp
   0x0000000000400554 <+14>:    mov    %rsp,%rbp
   0x0000000000400557 <+17>:    push   %r10
   0x0000000000400559 <+19>:    sub    $0x50,%rsp
   0x000000000040055d <+23>:    mov    %rdi,-0xb8(%rbp)
   0x0000000000400564 <+30>:    mov    %esi,-0xbc(%rbp)
   0x000000000040056a <+36>:    mov    %rdx,-0xc8(%rbp)
   0x0000000000400571 <+43>:    movl   $0x0,-0xa8(%rbp)
   0x000000000040057b <+53>:    jmpq   0x400615 <square+207>
   0x0000000000400580 <+58>:    mov    -0xa8(%rbp),%eax
   0x0000000000400586 <+64>:    cltq   
   0x0000000000400588 <+66>:    lea    0x0(,%rax,8),%rdx
   0x0000000000400590 <+74>:    mov    -0xb8(%rbp),%rax
   0x0000000000400597 <+81>:    add    %rdx,%rax
   0x000000000040059a <+84>:    mov    %rax,-0xa0(%rbp)
   0x00000000004005a1 <+91>:    mov    -0xa0(%rbp),%rax
   0x00000000004005a8 <+98>:    vmovapd (%rax),%ymm0
   0x00000000004005ac <+102>:   vmovapd %ymm0,-0x90(%rbp)
   0x00000000004005b4 <+110>:   vmovapd -0x90(%rbp),%ymm0
   0x00000000004005bc <+118>:   vmovapd %ymm0,-0x70(%rbp)
   0x00000000004005c1 <+123>:   vmovapd -0x90(%rbp),%ymm0
   0x00000000004005c9 <+131>:   vmovapd %ymm0,-0x30(%rbp)
   0x00000000004005ce <+136>:   vmovapd -0x70(%rbp),%ymm0
   0x00000000004005d3 <+141>:   vmulpd -0x30(%rbp),%ymm0,%ymm0
   0x00000000004005d8 <+146>:   mov    -0xa8(%rbp),%eax
   0x00000000004005de <+152>:   cltq   
   0x00000000004005e0 <+154>:   lea    0x0(,%rax,8),%rdx
   0x00000000004005e8 <+162>:   mov    -0xc8(%rbp),%rax
   0x00000000004005ef <+169>:   add    %rdx,%rax
   0x00000000004005f2 <+172>:   mov    %rax,-0x98(%rbp)
   0x00000000004005f9 <+179>:   vmovapd %ymm0,-0x50(%rbp)
   0x00000000004005fe <+184>:   mov    -0x98(%rbp),%rax
   0x0000000000400605 <+191>:   vmovapd -0x50(%rbp),%ymm0
   0x000000000040060a <+196>:   vmovntpd %ymm0,(%rax)
   0x000000000040060e <+200>:   addl   $0x4,-0xa8(%rbp)
   0x0000000000400615 <+207>:   mov    -0xbc(%rbp),%eax
   0x000000000040061b <+213>:   lea    -0x4(%rax),%edx
   0x000000000040061e <+216>:   mov    -0xa8(%rbp),%eax
   0x0000000000400624 <+222>:   cmp    %eax,%edx
   0x0000000000400626 <+224>:   jae    0x400580 <square+58>
   0x000000000040062c <+230>:   mov    -0xbc(%rbp),%eax
   0x0000000000400632 <+236>:   and    $0xfffffffc,%eax
   0x0000000000400635 <+239>:   mov    %eax,-0xa4(%rbp)
   0x000000000040063b <+245>:   jmp    0x40069c <square+342>
   0x000000000040063d <+247>:   mov    -0xa4(%rbp),%eax
   0x0000000000400643 <+253>:   lea    0x0(,%rax,8),%rdx
   0x000000000040064b <+261>:   mov    -0xc8(%rbp),%rax
   0x0000000000400652 <+268>:   add    %rdx,%rax

然后,当我编译代码时:

gcc main.c -mavx -O3 -o main

我得到了 square 函数的以下反汇编:

   0x00000000004005c0 <+0>:     lea    -0x4(%rsi),%r9d
   0x00000000004005c4 <+4>:     mov    %rdi,%r8
   0x00000000004005c7 <+7>:     mov    %rdx,%rcx
   0x00000000004005ca <+10>:    xor    %eax,%eax
   0x00000000004005cc <+12>:    nopl   0x0(%rax)
   0x00000000004005d0 <+16>:    vmovapd (%r8),%ymm0
   0x00000000004005d5 <+21>:    add    $0x4,%eax
   0x00000000004005d8 <+24>:    add    $0x20,%r8
   0x00000000004005dc <+28>:    add    $0x20,%rcx
   0x00000000004005e0 <+32>:    vmulpd %ymm0,%ymm0,%ymm0
   0x00000000004005e4 <+36>:    vmovntpd %ymm0,-0x20(%rcx)
   0x00000000004005e9 <+41>:    cmp    %eax,%r9d
   0x00000000004005ec <+44>:    jae    0x4005d0 <square+16>
   0x00000000004005ee <+46>:    mov    %esi,%eax
   0x00000000004005f0 <+48>:    and    $0xfffffffc,%eax
   0x00000000004005f3 <+51>:    cmp    %eax,%esi
   0x00000000004005f5 <+53>:    jbe    0x400617 <square+87>
   0x00000000004005f7 <+55>:    nopw   0x0(%rax,%rax,1)
   0x0000000000400600 <+64>:    mov    %eax,%ecx
   0x0000000000400602 <+66>:    add    $0x1,%eax
   0x0000000000400605 <+69>:    vmovsd (%rdi,%rcx,8),%xmm0
   0x000000000040060a <+74>:    cmp    %eax,%esi
   0x000000000040060c <+76>:    vmulsd %xmm0,%xmm0,%xmm0
   0x0000000000400610 <+80>:    vmovsd %xmm0,(%rdx,%rcx,8)
   0x0000000000400615 <+85>:    jne    0x400600 <square+64>
   0x0000000000400617 <+87>:    vzeroupper 
   0x000000000040061a <+90>:    retq

我的 GCC 版本是:

gcc (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

我只是想知道是否有人可以解释 GCC 在第一段反汇编(未优化的版本)中做了什么。其中似乎有一堆没有特定用途的 vmovapd 指令。例如,<+110> 处的指令似乎是多余的,因为此时 %ymm0 的内容与 -0x90(%rbp) 处的内容相同?我可以理解 -O3 代码的作用,但我对未优化的代码感到困惑。谢谢。

标签: c, gcc, assembly, avx

解决方案


推荐阅读