首页 > 解决方案 > x86-64 在一次读取 8 个字符的循环中对齐数据?

问题描述

strlen:
   xor r8,r8

.Lalignlong:
    test rdi, 0xf
    je .LfindNull
    prefetch [rdi + 8]
    cmp  Byte PTR [rdi], 0
    je  .LansNoAdd
    inc r8
    inc rdi
    jmp .Lalignlong

# do while is faster than while because of less  jumps (Agner)
.LfindNull:
    mov  r9, 0xFEFEFEFEFEFEFEFF
    mov  r10, 0x8080808080808080 # citation: Bit Twiddling Hacks Sean Eron Anderson
    prefetch [rdi + 192]
    mov rcx, [rdi]
    lea    rax, [rcx + r9]
    not     rcx
    and     rcx, rax
    and     rcx, r10
    jne .Lanswer
    nop # no idea why this makes it 2 cycles faster. findloop changes from 4a -> 4b
.Lfindloop:
    prefetch [rdi + 420]
    mov rcx, [rdi + 8]
    add rdi, 8
    add r8, 8
    lea     rax, [rcx + r9]
    not     rcx
    and     rcx, rax
    and     rcx, r10
    je .Lfindloop
.Lanswer:
    bsf     rcx, rcx
    shr     rcx, 3
    lea rax, [rcx + r8]
    ret
.LansNoAdd:
    mov rax, r8
    ret

这应该是 x86 64 位汇编代码,用于计算一个 char 字符串的长度,并将字符串的地址传递给 RDI。

我不明白第一.Lalignlong部分;那会做数据对齐吗?

如果是,它应该如何工作?尤其是这条线test rdi, 0xf让我非常困惑。

标签: assemblyx86memory-alignmentmicro-optimization

解决方案


推荐阅读