arrays - 为什么在 C 语言循环中,访问数组元素比访问变量慢很多?
问题描述
以下两个测试循环代码之间的唯一区别是i += shift
和i += shifts[str[i]]
。它们具有相同的字符集大小、相同的每个班次的步幅和相同的比较次数。分支预测和缓存命中率也几乎相同。让我难以理解的是为什么 Test1 比 Test2 快几倍。Test2使用一个小数组(长度为8)进行移位,这样就几乎不会受到缓存的影响。
访问数组比访问变量慢得多吗?我只知道访问数组会比只访问变量慢一点(如果忽略缓存命中,甚至可以忽略),但肯定不会有太大不同。如果不是这个原因,是什么原因造成的?
测试环境:系统:Mac OS,CPU:core i9,编译器:clang 11,编译选项:-O3和-O0
C代码:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#define n 134217728
#define Alpha 8
int c1 = 0, c2 = 0;
// Fixed shift assigned during execution.
int shift;
// Create a table for Test2 shift.
int shifts[Alpha];
void func1(char *str, char ch);
void func2(char *str, char ch);
int main(int argc, const char * argv[]) {
srandom((unsigned)time(NULL));
printf("shift = "); scanf("%d", &shift);
// Elements of the shifts are all fixed values(shift).
for (int i = 0; i < Alpha; ++i) shifts[i] = shift;
// Create a random string with length 134217728 and alphabet 0 ~ 7.
char *str = malloc(n);
for (int i = 0; i < n; ++i) str[i] = random()%Alpha;
// Create a character(not assigned, random 0 ~ 7 in the test).
char ch;
// Test1 ============================================================
double duration1 = 0;
for (int i = 0; i < 128; ++i) {
ch = random()%Alpha; // random char
clock_t begin = clock(); // time1 begin ------------------
func1(str, ch);
clock_t end = clock(); // time1 end --------------------
duration1 += ((double)(end - begin)) / CLOCKS_PER_SEC;
}
printf("1-st: %lf s, and count1: %d\n", (duration1/128.0), c1/128);
// Test2 ============================================================
double duration2 = 0;
for (int i = 0; i < 128; ++i) {
ch = random()%Alpha; // random char
clock_t begin = clock(); // time2 begin ------------------
func2(str, ch);
clock_t end = clock(); // time2 end --------------------
duration2 += ((double)(end - begin)) / CLOCKS_PER_SEC;
}
printf("2-st: %lf s, and count2: %d\n", (duration2/128.0), c2/128);
return 0;
}
void func1(char *str, char ch){
for (int i = 0; i < n; i += shift)
if (str[i] == ch) c1++;
}
void func2(char *str, char ch){
for (int i = 0; i < n; i += shifts[str[i]])
if (str[i] == ch) c2++;
}
以下代码的输出是 128 个测试周期的平均值。短班 (16)、中班 (64) 和长班 (1024) 测试的代码分别使用 -O3 和 -O0 选项编译。我想确保它shift
是一个运行时变量(它不能被编译器优化为常量,然后执行额外的特殊优化),所以我在执行期间得到它的值。
- -O3 编译选项
shift = 16 1-st: 0.015870 s, and count1: 1048746 2-st: 0.036174 s, and count2: 1048650 shift = 64 1-st: 0.009782 s, and count1: 262195 2-st: 0.016102 s, and count2: 262160 shift = 1024 1-st: 0.001339 s, and count1: 16375 2-st: 0.009361 s, and count2: 16402
- -O0 编译选项
shift = 16 1-st: 0.026950 s, and count1: 1048651 2-st: 0.045186 s, and count2: 1048541 shift = 64 1-st: 0.010594 s, and count1: 262129 2-st: 0.018006 s, and count2: 262140 shift = 1024 1-st: 0.001665 s, and count1: 16359 2-st: 0.007348 s, and count2: 16381
以下是这两个函数的汇编代码:
func1:
mov ecx, DWORD PTR shift[rip]
mov rax, rdi
lea rdx, 134217728[rdi]
cmp ecx, 1
jne .L13
.L7:
cmp sil, BYTE PTR [rax]
jne .L6
add DWORD PTR c1[rip], 1
.L6:
add rax, 1
cmp rdx, rax
jne .L7
ret
.L13:
movsx rdx, ecx
xor eax, eax
.L4:
cmp sil, BYTE PTR [rdi]
jne .L3
add DWORD PTR c1[rip], 1
.L3:
add eax, ecx
add rdi, rdx
cmp eax, 134217727
jle .L4
ret
func2:
xor edx, edx
lea r8, shifts[rip]
.L18:
movsx rax, edx
add rax, rdi
movsx rcx, BYTE PTR [rax]
cmp sil, cl
je .L19
add edx, DWORD PTR [r8+rcx*4]
cmp edx, 134217727
jle .L18
ret
.L19:
add DWORD PTR c2[rip], 1
movsx rax, BYTE PTR [rax]
add edx, DWORD PTR [r8+rax*4]
cmp edx, 134217727
jle .L18
ret
这是我定义Alpha
为2
(即大小为shifts
is 2
)后的结果,-O3:
shift = 16
1-st: 0.033265 s, and count1: 4194253
2-st: 0.051381 s, and count2: 4194253
shift = 64
1-st: 0.012764 s, and count1: 1048683
2-st: 0.019852 s, and count2: 1048576
shift = 1024
1-st: 0.001424 s, and count1: 65544
2-st: 0.007971 s, and count2: 65537
性能差异还是那么大,可见主要原因一定不是寄存器和缓存性能差异造成的。(ps:可能原因和CPU流水线有关,请硬件专家给我详细的解释,非常感谢。)