首页 > 解决方案 > 在不更改代码的情况下帮助 clang 做得更好

问题描述

我正在使用 clang 7/8 和 c11 进行各种测试。

我实现了一些排序函数,并开始对其中的一部分进行优化。

排序网络函数本身已经是非常简单的代码片段,但我发现某些重写方式可以在 x64 平台(不是 arm)上带来更好的性能。

https://godbolt.org/z/uD7_DM

如果比较这两段汇编代码,唯一显著的区别是 push 指令的数量以及开头几条 mov 指令的排列方式:

network_sort_08_a:
    push    rbp
    push    r14
    push    rbx
    mov     esi, dword ptr [rdi]
    mov     eax, dword ptr [rdi + 4]
    mov     edx, dword ptr [rdi + 8]
    mov     r9d, dword ptr [rdi + 12]
    mov     ebx, dword ptr [rdi + 16]
    cmp     esi, eax
    mov     ecx, eax
    cmovle  ecx, esi

而不是:

network_sort_8_b:
    push    rbp
    push    r15
    push    r14
    push    rbx
    mov     eax, dword ptr [rdi]
    mov     edx, dword ptr [rdi + 4]
    cmp     eax, edx
    mov     ecx, edx
    cmovle  ecx, eax
    cmovge  edx, eax
    mov     eax, dword ptr [rdi + 8]

我想在不重写函数的情况下输出第一个版本。

有什么想法可以实现吗?

此致

/* Returns the smaller of the two ints a and b (b when they compare equal). */
__attribute__((always_inline))
inline int min(const int a, const int b)
{
  if (a < b) {
    return a;
  }
  return b;
}

/* Returns the larger of the two ints a and b (b when they compare equal). */
__attribute__((always_inline))
inline int max(const int a, const int b)
{
  if (a > b) {
    return a;
  }
  return b;
}

/*
 * Compare-exchange step: after the call, keys[key_index0] holds the smaller
 * and keys[key_index1] the larger of the two values previously stored there.
 */
__attribute__((always_inline))
inline void network_swap(int* keys, const int key_index0, const int key_index1)
{
  int* const low_slot = &keys[key_index0];
  int* const high_slot = &keys[key_index1];

  // Load both operands before either store so the pair is read consistently.
  const int first = *low_slot;
  const int second = *high_slot;

  *low_slot = min(first, second);
  *high_slot = max(first, second);
}

/*
 * Applies a fixed 19-comparator network to keys[0..7] in place.
 *
 * The first layer, comparators (0,1) (2,3) (4,5) (6,7), is done by hand:
 * all eight inputs are read up front, then the ordered pairs are stored
 * back. The remaining 15 comparators go through network_swap in the order
 * listed in the table below.
 */
void network_sort_08_a(int* keys)
{
  // Comparators that follow the first layer, in execution order.
  static const int remaining[][2] = {
    {0, 2}, {1, 3}, {4, 6}, {5, 7},
    {1, 2}, {5, 6}, {0, 4}, {3, 7},
    {1, 5}, {1, 4}, {2, 6}, {3, 6},
    {2, 4}, {3, 5}, {3, 4},
  };
  const int remaining_count = (int)(sizeof remaining / sizeof remaining[0]);

  // Serialize the initial reads: snapshot every input before any store.
  int snapshot[8];
  for (int i = 0; i < 8; ++i) {
    snapshot[i] = keys[i];
  }

  // First layer: order each adjacent pair using the snapshot values.
  for (int i = 0; i < 8; i += 2) {
    keys[i] = min(snapshot[i], snapshot[i + 1]);
    keys[i + 1] = max(snapshot[i], snapshot[i + 1]);
  }

  for (int step = 0; step < remaining_count; ++step) {
    network_swap(keys, remaining[step][0], remaining[step][1]);
  }
}

/*
 * Applies a fixed 19-comparator network to keys[0..7] in place, issuing
 * every comparator through network_swap in the order given by the table.
 */
void network_sort_8_b(int* restrict keys)
{
  // Comparator index pairs, in execution order.
  static const int comparators[][2] = {
    {0, 1}, {2, 3}, {0, 2}, {1, 3}, {1, 2},
    {4, 5}, {6, 7}, {4, 6}, {5, 7}, {5, 6},
    {0, 4}, {1, 5}, {1, 4}, {2, 6}, {3, 7},
    {3, 6}, {2, 4}, {3, 5}, {3, 4},
  };
  const int comparator_count =
      (int)(sizeof comparators / sizeof comparators[0]);

  for (int step = 0; step < comparator_count; ++step) {
    network_swap(keys, comparators[step][0], comparators[step][1]);
  }
}

标签: c optimization clang c11

解决方案


推荐阅读