c++ - C++ 中 uint_fast32_t 解析为 uint64_t,但几乎所有操作都比 uint32_t (x86_64) 慢。为什么它解析为 uint64_t?
问题描述
我跑了一个基准测试:uint_fast32_t 占 8 个字节,但几乎所有操作都比 4 字节的 uint32_t 慢。既然如此,为什么 uint_fast32_t 不保持为 4 个字节?
操作系统信息:5.3.0-62-generic #56~18.04.1-Ubuntu SMP Wed Jun 24 16:17:03 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
处理器信息:
cat /sys/devices/cpu/caps/pmu_name
skylake
model name : Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz
我用于测试的基准:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstdint>
// Number of elements each benchmark loop walks over.
#define TEST_SIZE (1u << 26)

// Operators under test. Each one folds Y into the accumulator X.
// Arguments and expansions are fully parenthesized so that passing an
// expression (e.g. MOD(x, a + b)) applies the operator to the whole
// expression instead of being split by operator precedence.
#define ADD(X, Y) ((X) += (Y))
#define SUB(X, Y) ((X) -= (Y))
#define MULT(X, Y) ((X) *= (Y))
#define DIV(X, Y) ((X) /= (Y))
#define MOD(X, Y) ((X) = (X) % (Y))
#define AND(X, Y) ((X) &= (Y))
#define OR(X, Y) ((X) |= (Y))
#define XOR(X, Y) ((X) ^= (Y))

// If you compile this, make sure to pass -DOP=<operation macro name here>,
// e.g. -DOP=ADD.

// Empty asm statement with a "memory" clobber. No trailing semicolon here:
// the call site supplies it, so the macro behaves like a single statement.
#define bench_flush_all_pending() __asm__ __volatile__("" : : : "memory")

// Empty asm statement that takes X as an input operand (register or memory)
// and clobbers memory, so the compiler has to produce the value of X.
#define bench_do_not_optimize_out(X) \
    __asm__ __volatile__("" : : "r,m"(X) : "memory")
/*
 * Read the x86 time-stamp counter with the RDTSC instruction.
 *
 * RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX; the two
 * halves are combined into one 64-bit value.
 *
 * The function is deliberately NOT marked __attribute__((const)): every call
 * returns a different value, and a const function may have repeated calls
 * merged or removed by the compiler, which would break start/end timing.
 */
uint64_t inline __attribute__((always_inline))
get_cycles() {
    uint32_t hi, lo;
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return (((uint64_t)lo) | (((uint64_t)hi) << 32));
}
/*
 * Compile-time ratio between the size of uint_fast32_t and the size of
 * uint32_t, i.e. how many uint32_t slots fit in one uint_fast32_t element.
 */
constexpr uint32_t size_ratio() {
    return (uint32_t)(sizeof(uint_fast32_t) / sizeof(uint32_t));
}
/*
 * Benchmark driver.
 *
 * argv[1] selects the variant by its first character:
 *   'A' - apply OP to a uint_fast32_t accumulator over the uint_fast32_t buffer
 *   'B' - apply OP to a uint32_t accumulator, reading the same buffer through
 *         a uint32_t pointer with a stride of size_ratio() elements
 * Any other value runs no timed loop and reports 0 cycles.
 *
 * With -DDO_NOTHING the timed region is empty, giving the baseline cost of
 * the two get_cycles() calls. The result, (end - start) / TEST_SIZE, is
 * written to stderr.
 *
 * Returns 0 on success, 1 when the mode argument is missing or the buffer
 * cannot be allocated.
 */
int
main(int argc, char ** argv) {
    if (argc < 2) {
        // argv[0] already carries any "./" prefix, so print it unmodified.
        fprintf(stderr, "Usage: %s <A or B>\n", argc > 0 ? argv[0] : "test_ops");
        // Without this return the code below would dereference argv[1] == NULL.
        return 1;
    }

    uint_fast32_t * valsf32 =
        (uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
    if (valsf32 == NULL) {
        fprintf(stderr, "Failed to allocate benchmark buffer\n");
        return 1;
    }
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        valsf32[i] = rand();
    }

    // NOTE(review): the uint_fast32_t buffer is read through a uint32_t
    // pointer below; when the two types differ this relies on type punning
    // via pointer cast - confirm this is acceptable or use a separate
    // uint32_t buffer.
    uint32_t * vals32 = (uint32_t *)valsf32;

    uint_fast32_t sinkf32 = rand();
    uint32_t      sink32  = rand();

    // Zero-initialized so the report below never reads indeterminate values
    // when the mode is neither 'A' nor 'B'.
    uint64_t start = 0, end = 0;

    const char mode = argv[1][0];
    if (mode == 'A') {
        start = get_cycles();
#ifndef DO_NOTHING
        for (uint32_t i = 0; i < TEST_SIZE; ++i) {
            OP(sinkf32, valsf32[i]);
        }
        bench_do_not_optimize_out(sinkf32);
        bench_flush_all_pending();
#endif
        end = get_cycles();
    }
    else if (mode == 'B') {
        start = get_cycles();
#ifndef DO_NOTHING
        for (uint32_t i = 0; i < TEST_SIZE; ++i) {
            // Stride by size_ratio() so both variants touch the same number
            // of bytes of the buffer.
            OP(sink32, vals32[size_ratio() * i]);
        }
        bench_do_not_optimize_out(sink32);
        bench_flush_all_pending();
#endif
        end = get_cycles();
    }
    else {
        bench_do_not_optimize_out(sinkf32);
        bench_do_not_optimize_out(sink32);
    }

    fprintf(stderr,
            "\t%s (%zu Bytes): %.2E \"Cycles\" Per operator\n",
            mode == 'A' ? "Fast u32" : "Norm u32",
            mode == 'A' ? sizeof(uint_fast32_t) : sizeof(uint32_t),
            ((double)(end - start)) / TEST_SIZE);

    free(valsf32);
    return 0;
}
我编译了这个:
g++ -O3 test_operations.cc -o test_ops -DOP=<Operation macro name here>
即测试加法:
g++ -O3 test_operations.cc -o test_ops -DOP=ADD
要获得基线,只需包括-DDO_NOTHING
我希望有人可以解释将 uint_fast32_t 设置为 8 个字节的优势,或者指出我在这个基准测试中遗漏了什么,以便我能够得到一个真正公平的比较。
这同样适用于-fno-unroll-loops
这是一个示例运行:
Running: test_ADD Fast u32
Fast u32 (8 Bytes): 8.52E-01 "Cycles" Per operator
Running: test_ADD Norm u32
Norm u32 (4 Bytes): 7.92E-01 "Cycles" Per operator
Running: test_SUB Fast u32
Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_SUB Norm u32
Norm u32 (4 Bytes): 7.94E-01 "Cycles" Per operator
Running: test_MULT Fast u32
Fast u32 (8 Bytes): 2.54E+00 "Cycles" Per operator
Running: test_MULT Norm u32
Norm u32 (4 Bytes): 1.26E+00 "Cycles" Per operator
Running: test_DIV Fast u32
Fast u32 (8 Bytes): 1.48E+01 "Cycles" Per operator
Running: test_DIV Norm u32
Norm u32 (4 Bytes): 1.09E+01 "Cycles" Per operator
Running: test_MOD Fast u32
Fast u32 (8 Bytes): 1.52E+01 "Cycles" Per operator
Running: test_MOD Norm u32
Norm u32 (4 Bytes): 1.20E+01 "Cycles" Per operator
Running: test_AND Fast u32
Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_AND Norm u32
Norm u32 (4 Bytes): 7.98E-01 "Cycles" Per operator
Running: test_OR Fast u32
Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_OR Norm u32
Norm u32 (4 Bytes): 8.00E-01 "Cycles" Per operator
Running: test_XOR Fast u32
Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_XOR Norm u32
Norm u32 (4 Bytes): 7.95E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
Fast u32 (8 Bytes): 2.09E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
Norm u32 (4 Bytes): 2.09E-07 "Cycles" Per operator
更新初始化函数:
uint_fast32_t * valsf32 =
(uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
valsf32[i] = (uint32_t)rand(); // rand max is already an int by why not
valsf32[i] += (valsf32[i] == 0);
assert(valsf32[i]);
}
uint32_t * vals32 = (uint32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
vals32[size_ratio() * i] = (uint32_t)valsf32[i];
vals32[size_ratio() * i] += (vals32[size_ratio() * i] == 0);
assert(vals32[size_ratio() * i]);
assert(vals32[size_ratio() * i] == valsf32[i]);
}
结果:
Running: test_ADD Fast u32
Fast u32 (8 Bytes): 8.27E-01 "Cycles" Per operator
Running: test_ADD Norm u32
Norm u32 (4 Bytes): 7.83E-01 "Cycles" Per operator
Running: test_SUB Fast u32
Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_SUB Norm u32
Norm u32 (4 Bytes): 7.72E-01 "Cycles" Per operator
Running: test_MULT Fast u32
Fast u32 (8 Bytes): 2.55E+00 "Cycles" Per operator
Running: test_MULT Norm u32
Norm u32 (4 Bytes): 1.28E+00 "Cycles" Per operator
Running: test_DIV Fast u32
Fast u32 (8 Bytes): 1.49E+01 "Cycles" Per operator
Running: test_DIV Norm u32
Norm u32 (4 Bytes): 1.10E+01 "Cycles" Per operator
Running: test_MOD Fast u32
Fast u32 (8 Bytes): 1.53E+01 "Cycles" Per operator
Running: test_MOD Norm u32
Norm u32 (4 Bytes): 1.25E+01 "Cycles" Per operator
Running: test_AND Fast u32
Fast u32 (8 Bytes): 8.35E-01 "Cycles" Per operator
Running: test_AND Norm u32
Norm u32 (4 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_OR Fast u32
Fast u32 (8 Bytes): 8.31E-01 "Cycles" Per operator
Running: test_OR Norm u32
Norm u32 (4 Bytes): 7.76E-01 "Cycles" Per operator
Running: test_XOR Fast u32
Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_XOR Norm u32
Norm u32 (4 Bytes): 7.82E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
Fast u32 (8 Bytes): 1.79E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
Norm u32 (4 Bytes): 1.79E-07 "Cycles" Per operator
为了测试内存带宽,添加了一个 TOUCH 运算符:
#define TOUCH(X, Y) X = Y
触摸结果:
Fast u32 (8 Bytes): 1.05E+00 "Cycles" Per operator
Norm u32 (4 Bytes): 1.04E+00 "Cycles" Per operator
添加:
#define THREE_WAY ((A * vals8[i] + B) * vals8[i] + C)
#define TW(X, Y) X ^= THREE_WAY
// A, B, C, vals8, and i are all defined at the point where OP is expanded
...
uint8_t * vals8 = (uint8_t *)calloc(TEST_SIZE, 1);
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
vals8[i] = rand();
}
...
if (argv[1][0] == 'A') {
uint_fast32_t A = rand();
uint_fast32_t B = rand();
uint_fast32_t C = rand();
...
else if (argv[1][0] == 'B') {
uint32_t A = rand();
uint32_t B = rand();
uint32_t C = rand();
OP=TW 的结果
Fast u32 (8 Bytes): 2.01E+00 "Cycles" Per operator
Norm u32 (4 Bytes): 9.48E-01 "Cycles" Per operator
编辑 6:添加 CPU 信息。
解决方案
推荐阅读
- ffmpeg - FFmpeg - 添加音频后帧延迟加倍
- scala - 从scala中的列表中删除有向图的重复循环
- python - 防止“内核似乎已经死亡”
- dotnetnuke - DNN DAL 2 跨多个存储库的事务
- charts - echarts:条形图显示右边对齐的值
- android - OnLongPress 不适用于我们的触摸事件
- r - 可以编织成 pdf,但不能使用 knitr::kable 运行块
- ocaml - 为什么需要关闭另一个进程的管道加上 set_close_on_exec 才能真正关闭?
- python - 使用递归的嵌套列表的最大值和最小值
- jquery - 即使没有结果,UI 自动完成显示 URL