首页 > 解决方案 > 让 MSVC 接近 clang 的性能

问题描述

我想让 MSVC 为下面这段代码生成性能更好的可执行文件。我的简单基准测试显示,clang 生成的可执行文件的运行时间约为 MSVC 版本的 10%;GCC 介于两者之间,通常约为 MSVC 版本的 25%。有没有办法引导 MSVC 生成更好的汇编代码?我在 Compiler Explorer 上做过一些尝试,但结果并没有明显差别。作为背景说明,这段代码是 Philox4x32 随机数生成器的核心部分。

#include <inttypes.h>
#include <time.h>
#include <stdio.h>

/* Pair of 32-bit unsigned words; passed by value to round4x32 as the key. */
struct array2x32 {
  uint32_t v[2];
};
/*
 * Four 32-bit unsigned words; used both as the counter input and as the
 * return value of round4x32.
 */
struct array4x32 {
  uint32_t v[4];
};


struct array4x32 round4x32(struct array4x32 ctr, struct array2x32 key) {
  uint32_t hi0, hi1, lo0, lo1;
  uint64_t product;
  product = 0x00000000D2511F53ULL * (uint64_t)ctr.v[0];
  lo0 = (uint32_t)product;
  hi0 = ((uint32_t)(product>>32)) ^ ctr.v[3] ^ key.v[1];
  product = 0x00000000CD9E8D57ULL * (uint64_t)ctr.v[2];
  lo1 = (uint32_t)product;
  hi1 = ((uint32_t)(product>>32)) ^ ctr.v[1] ^ key.v[0];
  struct array4x32 out = {{hi1, lo1, hi0, lo0}};
  return out;
}

/* Total number of 32-bit outputs to generate; each round yields four. */
#define N 1000000000

/*
 * Benchmark driver for round4x32.
 *
 * Runs N / 4 rounds, bumping ctr.v[0] before each one, and folds every
 * output word into `sum` so the work has an observable result. It then
 * prints, one item per line: the elapsed time reported by clock() in
 * seconds, the accumulated sum in hex, the number of words produced, and
 * the resulting throughput.
 *
 * Returns 0.
 */
int main(void) {
  struct array4x32 ctr = {{0, 0, 0, 0}};
  struct array2x32 key = {{0, 0xDEADBEAF}};
  struct array4x32 out;
  uint64_t count = 0, sum = 0;
  int i, j;

  /* The timed loop is deliberately left exactly as written: it is the code
   * whose generated machine code is being compared across compilers. */
  clock_t begin = clock();
  for (i = 0; i < N / 4UL; i++) {
    ctr.v[0]++;
    out = round4x32(ctr, key);
    for (j = 0; j < 4; j++) {
      sum += out.v[j];
      count++;
    }
  }
  clock_t end = clock();

  double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

  /* Terminate the line so the time does not run into the "0x..." sum. */
  printf("%0.10f\n", time_spent);
  printf("0x%" PRIx64 "\ncount: %" PRIu64 "\n", sum, count);

  if (time_spent > 0.0) {
    /* NOTE(review): the "/ 1000000 * 1000000" is presumably meant to round
     * to whole millions, but in floating point it is close to a no-op;
     * kept unchanged so the reported figure stays comparable. */
    printf("%" PRIu64 " randoms per second\n",
           (uint64_t)((N / time_spent) / 1000000 * 1000000));
  } else {
    /* A zero (or negative) elapsed time would make the quotient infinite
     * or negative, and converting that to uint64_t is undefined. */
    printf("elapsed time too small to compute randoms per second\n");
  }
  return 0;
}

标签: c, performance, visual-c++, random

解决方案


推荐阅读