首页 > 解决方案 > 为什么在 C 语言循环中,访问数组元素比访问变量慢很多?

问题描述

以下两个测试循环代码之间的唯一区别是i += shifti += shifts[str[i]]。它们具有相同的字符集大小、相同的每个班次的步幅和相同的比较次数。分支预测和缓存命中率也几乎相同。让我难以理解的是为什么 Test1 比 Test2 快几倍。Test2使用一个小数组(长度为8)进行移位,这样就几乎不会受到缓存的影响。

访问数组比访问变量慢得多吗?我只知道访问数组会比只访问变量慢一点(如果忽略缓存命中,甚至可以忽略),但肯定不会有太大不同。如果不是这个原因,是什么原因造成的?

测试环境:系统:Mac OS,CPU:core i9,编译器:clang 11,编译选项:-O3和-O0

C代码:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>

#define n     134217728
#define Alpha 8
int c1 = 0, c2 = 0;
// Fixed shift assigned during execution.
int shift;
// Create a table for Test2 shift.
int shifts[Alpha];

void func1(char *str, char ch);
void func2(char *str, char ch);

int main(int argc, const char * argv[]) {
    srandom((unsigned)time(NULL));
    printf("shift = "); scanf("%d", &shift);
    
    // Elements of the shifts are all fixed values(shift).
    for (int i = 0; i < Alpha; ++i) shifts[i] = shift;
    
    // Create a random string with length 134217728 and alphabet 0 ~ 7.
    char *str = malloc(n);
    for (int i = 0; i < n; ++i) str[i] = random()%Alpha;

    // Create a character(not assigned, random 0 ~ 7 in the test).
    char ch;

    // Test1 ============================================================
    double duration1 = 0;
    for (int i = 0; i < 128; ++i) {
        ch = random()%Alpha; // random char
        clock_t begin = clock();        // time1 begin ------------------
        func1(str, ch);
        clock_t end  = clock();         // time1 end --------------------
        duration1 += ((double)(end - begin)) / CLOCKS_PER_SEC;
    }
    printf("1-st: %lf s, and count1: %d\n", (duration1/128.0), c1/128);

    // Test2 ============================================================
    double duration2 = 0;
    for (int i = 0; i < 128; ++i) {
        ch = random()%Alpha; // random char
        clock_t begin = clock();        // time2 begin ------------------
        func2(str, ch);
        clock_t end  = clock();         // time2 end --------------------
        duration2 += ((double)(end - begin)) / CLOCKS_PER_SEC;
    }
    printf("2-st: %lf s, and count2: %d\n", (duration2/128.0), c2/128);

    return 0;
}

void func1(char *str, char ch){
    for (int i = 0; i < n; i += shift)
        if (str[i] == ch) c1++;
}

void func2(char *str, char ch){
    for (int i = 0; i < n; i += shifts[str[i]])
        if (str[i] == ch) c2++;
}

以下代码的输出是 128 个测试周期的平均值。短班 (16)、中班 (64) 和长班 (1024) 测试的代码分别使用 -O3 和 -O0 选项编译。我想确保它shift是一个运行时变量(它不能被编译器优化为常量,然后执行额外的特殊优化),所以我在执行期间得到它的值。

  1. -O3 编译选项
    shift = 16
    1-st: 0.015870 s, and count1: 1048746
    2-st: 0.036174 s, and count2: 1048650
    
    shift = 64
    1-st: 0.009782 s, and count1: 262195
    2-st: 0.016102 s, and count2: 262160
    
    shift = 1024
    1-st: 0.001339 s, and count1: 16375
    2-st: 0.009361 s, and count2: 16402
    
  2. -O0 编译选项
    shift = 16
    1-st: 0.026950 s, and count1: 1048651
    2-st: 0.045186 s, and count2: 1048541
    
    shift = 64
    1-st: 0.010594 s, and count1: 262129
    2-st: 0.018006 s, and count2: 262140
    
    shift = 1024
    1-st: 0.001665 s, and count1: 16359
    2-st: 0.007348 s, and count2: 16381
    

以下是这两个函数的汇编代码:

func1:
        mov     ecx, DWORD PTR shift[rip]
        mov     rax, rdi
        lea     rdx, 134217728[rdi]
        cmp     ecx, 1
        jne     .L13
.L7:
        cmp     sil, BYTE PTR [rax]
        jne     .L6
        add     DWORD PTR c1[rip], 1
.L6:
        add     rax, 1
        cmp     rdx, rax
        jne     .L7
        ret
.L13:
        movsx   rdx, ecx
        xor     eax, eax
.L4:
        cmp     sil, BYTE PTR [rdi]
        jne     .L3
        add     DWORD PTR c1[rip], 1
.L3:
        add     eax, ecx
        add     rdi, rdx
        cmp     eax, 134217727
        jle     .L4
        ret
func2:
        xor     edx, edx
        lea     r8, shifts[rip]
.L18:
        movsx   rax, edx
        add     rax, rdi
        movsx   rcx, BYTE PTR [rax]
        cmp     sil, cl
        je      .L19
        add     edx, DWORD PTR [r8+rcx*4]
        cmp     edx, 134217727
        jle     .L18
        ret
.L19:
        add     DWORD PTR c2[rip], 1
        movsx   rax, BYTE PTR [rax]
        add     edx, DWORD PTR [r8+rax*4]
        cmp     edx, 134217727
        jle     .L18
        ret

这是我定义Alpha2(即大小为shiftsis 2)后的结果,-O3:

shift = 16
1-st: 0.033265 s, and count1: 4194253
2-st: 0.051381 s, and count2: 4194253

shift = 64
1-st: 0.012764 s, and count1: 1048683
2-st: 0.019852 s, and count2: 1048576

shift = 1024
1-st: 0.001424 s, and count1: 65544
2-st: 0.007971 s, and count2: 65537

性能差异还是那么大,可见主要原因一定不是寄存器和缓存性能差异造成的。(ps:可能原因和CPU流水线有关,请硬件专家给我详细的解释,非常感谢。)

标签: arrayscperformanceloopsx86-64

解决方案


推荐阅读