gcc - x86_64 内联汇编；将 64 位寄存器直接复制到 64 位内存位置

问题描述

我正在运行下面的代码并遇到两个问题：

1）当我把 movl（从寄存器中复制值）改为 movq 时，gcc 报错：Error: operand size mismatch for movq。在普通汇编中，我看到可以通过添加 qword 前缀之类的方式做到这一点，但 gcc 同样不接受这种写法。

/*
 * Question exhibit 1: execute CPUID with RAX = 0 and return the four result
 * registers through the pointer arguments.
 *
 * _rax, _rbx, _rcx, _rdx: receive the values copied out of EAX, EBX, ECX, EDX.
 * Returns the value stored in *_rax.
 */
uint64_t cpuid_0(uint64_t* _rax, uint64_t* _rbx, uint64_t* _rcx, uint64_t* _rdx){

    /* The asm operands are these 32-bit ints, so %0..%3 expand to 32-bit
     * register names; presumably this is the source of the reported
     * "operand size mismatch" when movl is replaced by movq. */
    int a, b, c, d;
    *_rax = 0x0;

    __asm__
    __volatile__
    (
        "movq $0,  %%rax\n"
        "cpuid\n"
        "movl %%eax, %0\n"
        "movl %%ebx, %1\n"
        "movl %%ecx, %2\n"
        "movl %%edx, %3\n"
        /* NOTE(review): no clobber list is given although the template
         * overwrites RAX, RBX, RCX and RDX. */
        : "=r" (a), "=r" (b), "=r" (c), "=r" (d)
        /* NOTE(review): `a` has no initializer, so this matching input
         * passes an indeterminate value into operand 0. */
        : "0" (a)
    );
    /* int -> uint64_t conversion: negative values are sign-extended. */
    *_rax=a;*_rbx=b;*_rcx=c;*_rdx=d;
    return *_rax;
}

2）我想消除额外的复制操作，所以我在约束规范中修改了我的代码：

/*
 * Question exhibit 2: attempt to store the CPUID results straight into the
 * caller's memory by using "m" constraints instead of register temporaries.
 * This is the variant for which the compiler diagnostics quoted below it
 * are reported.
 *
 * _rax, _rbx, _rcx, _rdx: intended destinations for EAX, EBX, ECX, EDX.
 * Returns the value stored in *_rax.
 */
uint64_t cpuid_0(uint64_t* _rax, uint64_t* _rbx, uint64_t* _rcx, uint64_t* _rdx){

    /* NOTE(review): a, b, c, d are no longer asm operands in this version,
     * so nothing ever assigns them. */
    int a, b, c, d;
    *_rax = 0x0;

    __asm__
    __volatile__
    (
         "movq $0,  %%rax\n"
         "cpuid\n"
         /* movl stores 32 bits into each 64-bit destination. */
         "movl %%eax, %0\n"
         "movl %%ebx, %1\n"
         "movl %%ecx, %2\n"
         "movl %%edx, %3\n"
        /* NOTE(review): the last operand is `_rdx` (the pointer variable
         * itself), not `*_rdx` like the other three. */
        : "+m" (*_rax), "=m" (*_rbx), "=m" (*_rcx), "=m" (_rdx)
        /* Matching constraint "0" tied to the memory operand 0 above. */
        : "0" (*_rax)
    );
    /* NOTE(review): overwrites whatever the asm stored with the
     * never-assigned locals a, b, c, d. */
    *_rax=a;*_rbx=b;*_rcx=c;*_rdx=d;
    return *_rax;
}

这给了我很多错误，如下所示：

warning: matching constraint does not allow a register
error: inconsistent operand constraints in an ‘asm’

另外，我认为__volatile__可以在这个小代码中删除。

标签： gcc assembly x86-64 inline-assembly

解决方案

问题出在输入 "0" (*_rax) 上……看起来 "0" 这个匹配约束不能与 "=m" 内存约束一起使用，也不能与 "+m" 一起使用。（我不知道为什么。）

把你的第二个函数改成下面这样，就可以编译并正常工作：

/*
 * Version 1: execute CPUID with EAX = 0 and store the four result registers
 * directly into caller-supplied memory from inside the asm template.
 *
 * _eax, _ebx, _ecx, _edx: receive the values of EAX, EBX, ECX and EDX.
 * Returns the value stored in *_eax.
 */
uint32_t cpuid_0(uint32_t* _eax, uint32_t* _ebx, uint32_t* _ecx, uint32_t* _edx)
{
  __asm__
  (
    "mov $0,  %%eax\n"
    "cpuid\n"
    "mov %%eax, %0\n"
    "mov %%ebx, %1\n"
    "mov %%ecx, %2\n"
    "mov %%edx, %3\n"
    /* All four results are written to memory by the template itself. */
    : "=m" (*_eax), "=m" (*_ebx), "=m" (*_ecx), "=m" (*_edx)
    : //"0" (*_eax) -- not required and throws errors !!
    /* The template overwrites these registers without naming them as
     * operands, so they have to be declared here. */
    : "%rax", "%rbx", "%rcx", "%rdx"  // ESSENTIAL "clobbers"
  ) ;
  return *_eax ;
}

其中：

为保持一致，所有类型都改用 uint32_t。
去掉了多余的 int a, b, c, d;
省略了 "0" 输入，它本来就没有被用到。
把 (*_eax) 声明为简单的 "=m" 输出。
把 "%rax"、"%rbx"、"%rcx"、"%rdx" 全部列入 clobber（破坏）列表。
去掉了多余的 volatile。

其中把寄存器列入 clobber 列表这一条是必不可少的，因为没有它，编译器就不知道这些寄存器会被修改。

以上编译为：

   push   %rbx                 # compiler (now) knows %rbx is "clobbered"
   mov    %rdx,%r8             # likewise %rdx
   mov    %rcx,%r9             # ditto %rcx

     mov    $0x0,%eax          # the __asm__(....
     cpuid  
     mov    %eax,(%rdi)
     mov    %ebx,(%rsi)
     mov    %ecx,(%r8)
     mov    %edx,(%r9)         # ....) ;

   mov    (%rdi),%eax
   pop    %rbx
   retq

注意：如果没有 clobber 列表，则会编译为：

   mov    $0x0,%eax
   cpuid  
   mov    %eax,(%rdi)
   mov    %ebx,(%rsi)
   mov    %ecx,(%rdx)
   mov    %edx,(%rcx)
   mov    (%rdi),%eax
   retq

这样更短，但遗憾的是无法正常工作！（cpuid 会覆盖 %rcx 和 %rdx，而上面的代码在 cpuid 之后仍把它们当作指针来使用。）

您还可以（版本 2）：

/* The four register values produced by one CPUID invocation. */
struct cpuid
{
  uint32_t  eax ;
  uint32_t  ebx ;
  uint32_t  ecx ;
  uint32_t  edx ;
};

/*
 * Version 2: execute CPUID with EAX = 0 and fill *cid with the results.
 *
 * EBX, ECX and EDX are stored to memory by the asm template itself; EAX is
 * handed back to the compiler as a register output and stored from C.
 *
 * cid: receives the EAX, EBX, ECX and EDX values.
 * Returns the value stored in cid->eax.
 */
uint32_t cpuid_0(struct cpuid* cid)
{
  uint32_t eax ;

  __asm__
  (
    "mov $0,  %%eax\n"
    "cpuid\n"
    "mov %%ebx, %1\n"
    "mov %%ecx, %2\n"
    "mov %%edx, %3\n"
    /* "=&a" (early-clobber): the first instruction overwrites EAX before
     * the stores through %1..%3 happen.  Without '&' the compiler would be
     * free to use RAX as part of one of those memory addresses (e.g. after
     * inlining), and the stores would then go through a destroyed pointer. */
    : "=&a" (eax), "=m" (cid->ebx), "=m" (cid->ecx), "=m" (cid->edx)
    /* CPUID overwrites these; they are not operands, so declare them. */
    :: "%ebx", "%ecx", "%edx"
  ) ;

  return cid->eax = eax ;
}

编译成稍微短一点的东西：

   push   %rbx
   mov    $0x0,%eax
   cpuid  
   mov    %ebx,0x4(%rdi)
   mov    %ecx,0x8(%rdi)
   mov    %edx,0xc(%rdi)
   pop    %rbx
   mov    %eax,(%rdi)
   retq

或者你可以做一些更像你的第一个版本（版本 3）的东西：

/*
 * Version 3: execute CPUID with EAX = 0, letting the register constraints
 * bind C variables directly to EAX/EBX/ECX/EDX so the template contains
 * nothing but the cpuid instruction.
 *
 * cid: receives the EAX, EBX, ECX and EDX values.
 * Returns the value stored in cid->eax.
 */
uint32_t cpuid_0(struct cpuid* cid)
{
  uint32_t reg_a = 0 ;   /* in: value placed in EAX; out: EAX afterwards */
  uint32_t reg_b ;
  uint32_t reg_c ;
  uint32_t reg_d ;

  /* "+a" makes EAX both an input and an output of the statement. */
  __asm__
  (
    " cpuid\n"
    : "+a" (reg_a), "=b" (reg_b), "=c" (reg_c), "=d" (reg_d)
  ) ;

  cid->edx = reg_d ;
  cid->ecx = reg_c ;
  cid->ebx = reg_b ;
  cid->eax = reg_a ;

  return cid->eax ;
}

编译为：

   push   %rbx
   xor    %eax,%eax
   cpuid  
   mov    %ebx,0x4(%rdi)
   mov    %edx,0xc(%rdi)
   pop    %rbx
   mov    %ecx,0x8(%rdi)
   mov    %eax,(%rdi)
   retq

这个版本使用 "+a"、"=b" 等约束来告诉编译器把特定的寄存器分配给各个变量。这样可以把手写汇编的量减到最少，这通常是件好事。[请注意，编译器知道 xor %eax,%eax 比 mov $0,%eax 更好（也更短），并且认为提前执行 pop %rbx 有一些好处。]

更好的写法——来自 @Peter Cordes 的评论（版本 4）：

/*
 * Version 4: zero EAX, execute CPUID, and let the compiler store the four
 * result registers straight into the fields of *cid.
 *
 * cid: receives the EAX, EBX, ECX and EDX values.
 * Returns cid->eax.
 */
uint32_t cpuid_1(struct cpuid* cid)
{
  __asm__
  (
    "xor %%eax, %%eax\n"
    "cpuid\n"
    /* Each struct field is bound to its register; the stores to memory are
     * generated by the compiler after the template, not written by hand. */
    : "=a" (cid->eax), "=b" (cid->ebx), "=c" (cid->ecx), "=d" (cid->edx)
  ) ;

  return cid->eax ;
}

编译器会发现 cid->eax 的值已经在 %eax 中，因此编译为：

   push   %rbx
   xor    %eax,%eax
   cpuid  
   mov    %ebx,0x4(%rdi)
   mov    %eax,(%rdi)
   pop    %rbx
   mov    %ecx,0x8(%rdi)
   mov    %edx,0xc(%rdi)
   retq

与第 3 版相同，只是指令顺序略有不同。

FWIW：__asm__() 的定义为：

asm asm-qualifiers ( AssemblerTemplate : OutputOperands [ : InputOperands [ : Clobbers ] ] )

理解内联汇编的关键在于明白，编译器：

不知道 AssemblerTemplate 部分是什么意思。

它确实会展开 %xx 占位符，但除此之外什么都不理解。
确实了解OutputOperands，InputOperands（如果有的话）和Clobbers（如果有的话）......

...这些告诉编译器汇编器需要什么作为参数，以及如何扩展各种%xx.

...但是这些也告诉编译器AssemblerTemplate 做了什么，用编译器理解的术语。

所以，编译器理解的是一种“数据流”。它理解汇编器接受多个输入，返回多个输出，并且（可能）作为副作用“破坏”一些寄存器和/或内存量。有了这些信息，编译器就可以将“黑盒”汇编程序序列与围绕它生成的代码集成在一起。除其他外，编译器将：

为输出和输入操作数分配寄存器

并将输入安排在所需的寄存器中（根据需要）。

注意：编译器把这段汇编视为单个操作，即所有输入都在产生任何输出之前被消耗。如果某个输入在 __asm__() 之后不再被使用，编译器就可能把同一个寄存器同时分配给输入和输出。因此才需要所谓的“早期破坏”（early-clobber，即约束中的 & 修饰符）。
在周围的代码中移动“黑匣子”，维护汇编器对其输入源的依赖性以及随后的代码对汇编器输出的依赖性。
如果似乎没有任何东西依赖于它的输出，则完全丢弃“黑匣子”！

gcc - x86_64 内联汇编；将 64 位寄存器直接复制到 64 位内存位置

问题描述

解决方案

推荐阅读