首页 > 解决方案 > Get bus error after replacing stp with str in glibc's aarch64 memcpy.S

问题描述

For some reason, I need to replace the stp instructions in glibc's memcpy with str. Here is what I did:

modified   sysdeps/aarch64/memcpy.S
@@ -102,11 +102,19 @@ ENTRY (MEMCPY)
    tbz tmp1, 5, 1f
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [srcend, -32]
-   stp B_l, B_h, [dstin, 16]
-   stp C_l, C_h, [dstend, -32]
+   //stp   B_l, B_h, [dstin, 16]
+   str B_l, [dstin, 16]
+   str B_h, [dstin, 24]
+   //stp   C_l, C_h, [dstend, -32]
+   str C_l, [dstend, -32]
+   str C_h, [dstend, -24]
 1:
-   stp A_l, A_h, [dstin]
-   stp D_l, D_h, [dstend, -16]
+   //stp   A_l, A_h, [dstin]
+   str A_l, [dstin]
+   str A_h, [dstin, 8]
+   //stp   D_l, D_h, [dstend, -16]
+   str D_l, [dstend, -16]
+   str D_h, [dstend, -8]
    ret
 
    .p2align 4
@@ -150,12 +158,24 @@ L(copy96):
    ldp D_l, D_h, [src, 48]
    ldp E_l, E_h, [srcend, -32]
    ldp F_l, F_h, [srcend, -16]
-   stp A_l, A_h, [dstin]
-   stp B_l, B_h, [dstin, 16]
-   stp C_l, C_h, [dstin, 32]
-   stp D_l, D_h, [dstin, 48]
-   stp E_l, E_h, [dstend, -32]
-   stp F_l, F_h, [dstend, -16]
+   //stp   A_l, A_h, [dstin]
+   str A_l, [dstin]
+   str A_h, [dstin, 8]
+   //stp   B_l, B_h, [dstin, 16]
+   str B_l, [dstin, 16]
+   str B_h, [dstin, 24]
+   //stp   C_l, C_h, [dstin, 32]
+   str C_l, [dstin, 32]
+   str C_h, [dstin, 40]
+   //stp   D_l, D_h, [dstin, 48]
+   str D_l, [dstin, 48]
+   str D_h, [dstin, 56]
+   //stp   E_l, E_h, [dstend, -32]
+   str E_l, [dstend, -32]
+   str E_h, [dstend, -24]
+   //stp   F_l, F_h, [dstend, -16]
+   str F_l, [dstend, -16]
+   str F_h, [dstend, -8]
    ret
 
    /* Align DST to 16 byte alignment so that we don't cross cache line
@@ -171,20 +191,31 @@ L(copy_long):
    sub src, src, tmp1
    add count, count, tmp1  /* Count is now 16 too large.  */
    ldp A_l, A_h, [src, 16]
-   stp D_l, D_h, [dstin]
+   //stp   D_l, D_h, [dstin]
+   str D_l, [dstin]
+   str D_h, [dstin, 8]
    ldp B_l, B_h, [src, 32]
    ldp C_l, C_h, [src, 48]
    ldp D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  /* Test and readjust count.  */
    b.ls    L(last64)
 L(loop64):
-   stp A_l, A_h, [dst, 16]
+   //stp   A_l, A_h, [dst, 16]
+   str A_l, [dst, 16]
+   str A_h, [dst, 24]
    ldp A_l, A_h, [src, 16]
-   stp B_l, B_h, [dst, 32]
+   //stp   B_l, B_h, [dst, 32]
+   str B_l, [dst, 32]
+   str B_h, [dst, 40]
    ldp B_l, B_h, [src, 32]
-   stp C_l, C_h, [dst, 48]
+   //stp   C_l, C_h, [dst, 48]
+   str C_l, [dst, 48]
+   str C_h, [dst, 56]
    ldp C_l, C_h, [src, 48]
-   stp D_l, D_h, [dst, 64]!
+   //stp   D_l, D_h, [dst, 64]!
+   str D_l, [dst, 64]
+   str D_h, [dst, 72]
+   add dst, dst, 64
    ldp D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    L(loop64)
@@ -194,17 +225,33 @@ L(loop64):
       there is just 1 byte left.  */
 L(last64):
    ldp E_l, E_h, [srcend, -64]
-   stp A_l, A_h, [dst, 16]
+   //stp   A_l, A_h, [dst, 16]
+   str A_l, [dst, 16]
+   str A_h, [dst, 24]
    ldp A_l, A_h, [srcend, -48]
-   stp B_l, B_h, [dst, 32]
+   //stp   B_l, B_h, [dst, 32]
+   str B_l, [dst, 32]
+   str B_h, [dst, 40]
    ldp B_l, B_h, [srcend, -32]
-   stp C_l, C_h, [dst, 48]
+   //stp   C_l, C_h, [dst, 48]
+   str C_l, [dst, 48]
+   str C_h, [dst, 56]
    ldp C_l, C_h, [srcend, -16]
-   stp D_l, D_h, [dst, 64]
-   stp E_l, E_h, [dstend, -64]
-   stp A_l, A_h, [dstend, -48]
-   stp B_l, B_h, [dstend, -32]
-   stp C_l, C_h, [dstend, -16]
+   //stp   D_l, D_h, [dst, 64]
+   str D_l, [dst, 64]
+   str D_h, [dst, 72]
+   //stp   E_l, E_h, [dstend, -64]
+   str E_l, [dstend, -64]
+   str E_h, [dstend, -56]
+   //stp   A_l, A_h, [dstend, -48]
+   str A_l, [dstend, -48]
+   str A_h, [dstend, -40]
+   //stp   B_l, B_h, [dstend, -32]
+   str B_l, [dstend, -32]
+   str B_h, [dstend, -24]
+   //stp   C_l, C_h, [dstend, -16]
+   str C_l, [dstend, -16]
+   str C_h, [dstend, -8]
    ret
 
    .p2align 4
@@ -224,7 +271,9 @@ L(move_long):
    sub srcend, srcend, tmp1
    sub count, count, tmp1
    ldp A_l, A_h, [srcend, -16]
-   stp D_l, D_h, [dstend, -16]
+   //stp   D_l, D_h, [dstend, -16]
+   str D_l, [dstend, -16]
+   str D_h, [dstend, -8]
    ldp B_l, B_h, [srcend, -32]
    ldp C_l, C_h, [srcend, -48]
    ldp D_l, D_h, [srcend, -64]!
@@ -234,13 +283,22 @@ L(move_long):
 
    nop
 1:
-   stp A_l, A_h, [dstend, -16]
+   //stp   A_l, A_h, [dstend, -16]
+   str A_l, [dstend, -16]
+   str A_h, [dstend, -8]
    ldp A_l, A_h, [srcend, -16]
-   stp B_l, B_h, [dstend, -32]
+   //stp   B_l, B_h, [dstend, -32]
+   str B_l, [dstend, -32]
+   str B_h, [dstend, -24]
    ldp B_l, B_h, [srcend, -32]
-   stp C_l, C_h, [dstend, -48]
+   //stp   C_l, C_h, [dstend, -48]
+   str C_l, [dstend, -48]
+   str C_h, [dstend, -40]
    ldp C_l, C_h, [srcend, -48]
-   stp D_l, D_h, [dstend, -64]!
+   //stp   D_l, D_h, [dstend, -64]!
+   str D_l, [dstend, -64]
+   str D_h, [dstend, -56]
+   sub dstend, dstend, 64
    ldp D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b
@@ -250,17 +308,33 @@ L(move_long):
       there is just 1 byte left.  */
 2:
    ldp G_l, G_h, [src, 48]
-   stp A_l, A_h, [dstend, -16]
+   //stp   A_l, A_h, [dstend, -16]
+   str A_l, [dstend, -16]
+   str A_h, [dstend, -8]
    ldp A_l, A_h, [src, 32]
-   stp B_l, B_h, [dstend, -32]
+   //stp   B_l, B_h, [dstend, -32]
+   str B_l, [dstend, -32]
+   str B_h, [dstend, -24]
    ldp B_l, B_h, [src, 16]
-   stp C_l, C_h, [dstend, -48]
+   //stp   C_l, C_h, [dstend, -48]
+   str C_l, [dstend, -48]
+   str C_h, [dstend, -40]
    ldp C_l, C_h, [src]
-   stp D_l, D_h, [dstend, -64]
-   stp G_l, G_h, [dstin, 48]
-   stp A_l, A_h, [dstin, 32]
-   stp B_l, B_h, [dstin, 16]
-   stp C_l, C_h, [dstin]
+   //stp   D_l, D_h, [dstend, -64]
+   str D_l, [dstend, -64]
+   str D_h, [dstend, -56]
+   //stp   G_l, G_h, [dstin, 48]
+   str G_l, [dstin, 48]
+   str G_h, [dstin, 56]
+   //stp   A_l, A_h, [dstin, 32]
+   str A_l, [dstin, 32]
+   str A_h, [dstin, 40]
+   //stp   B_l, B_h, [dstin, 16]
+   str B_l, [dstin, 16]
+   str B_h, [dstin, 24]
+   //stp   C_l, C_h, [dstin]
+   str C_l, [dstin]
+   str C_h, [dstin, 8]
 3: ret
 
 END (MEMCPY)

(If you want to view the diff side by side, please visit https://www.diffchecker.com/qAgmBLFu)

glibc compiled successfully and the system rebooted without problems. But when I run glmark2, it crashes with a bus error:

Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1".
Core was generated by `./build/src/glmark2'.
Program terminated with signal SIGBUS, Bus error.
#0  __memcpy_generic () at ../sysdeps/aarch64/multiarch/../memcpy.S:195

warning: Source file is more recent than executable.
195     str D_l, [dstin]
[Current thread is 1 (Thread 0x7f80e77a70 (LWP 9281))]
(gdb) info registers 
x0             0x7f782368ac        547476433068
x1             0x11744640          292832832
x2             0x3de0              15840
x3             0x7f782368a0        547476433056
x4             0x11748420          292848672
x5             0x7f7823a680        547476448896
x6             0x3da941653f800000  4443154410490560512
x7             0x3f8000003f066666  4575657222465807974
x8             0xbf666666          3211159142
x9             0xbf8000003e8ccccd  -4647714814396937011
x10            0x3e99999abd1f1347  4510805391665206087
x11            0xbf800000          3212836864
x12            0x3f8000003f0ccccc  4575657222466227404
x13            0x3f0666663da94165  4541429863556923749
x14            0xc                 12
x15            0x525521dd864a      90525593863754
x16            0x4d42e8            5063400
x17            0x7f808cc2c0        547617555136
x18            0xbf                191
x19            0x116911f0          292098544
x20            0xc                 12
x21            0x7f54001930        546870139184
x22            0x7fe9aad628        549381133864
--Type <RET> for more, q to quit, c to continue without paging--
x23            0x7f7820c000        547476258816
x24            0x4d4000            5062656
x25            0x11719da0          292658592
x26            0x0                 0
x27            0x0                 0
x28            0x0                 0
x29            0x7fe9aad540        549381133632
x30            0x476f40            4681536
sp             0x7fe9aad540        0x7fe9aad540
pc             0x7f808cc3f8        0x7f808cc3f8 <__memcpy_generic+296>
cpsr           0x20001000          [ EL=0 C ]
fpsr           0x11                17
fpcr           0x0                 0
(gdb) 



(gdb) bt
#0  __memcpy_generic () at ../sysdeps/aarch64/multiarch/../memcpy.S:195
#1  0x0000000000476f40 in std::__copy_move<false, true, std::random_access_iterator_tag>::__copy_m<float> (__result=<optimized out>, __last=<optimized out>, 
    __first=<optimized out>) at /usr/include/c++/8/bits/stl_iterator.h:783
#2  std::__copy_move_a<false, float*, float*> (__result=<optimized out>, 
    __last=<optimized out>, __first=<optimized out>)
    at /usr/include/c++/8/bits/stl_algobase.h:386
#3  std::__copy_move_a2<false, float*, float*> (__result=<optimized out>, 
    __last=<optimized out>, __first=<optimized out>)
    at /usr/include/c++/8/bits/stl_algobase.h:422
#4  std::copy<float*, float*> (__result=<optimized out>, 
    __last=<optimized out>, __first=<optimized out>)
    at /usr/include/c++/8/bits/stl_algobase.h:455
#5  Mesh::update_single_vbo (this=0x7f54001930, 
    ranges=std::vector of length 10, capacity 16 = {...}, n=<optimized out>, 
    nfloats=<optimized out>) at ../src/mesh.cpp:469
#6  0x0000000000478308 in Mesh::update_vbo (this=0x7f54001930, 
    ranges=std::vector of length 10, capacity 16 = {...})
    at /usr/include/c++/8/bits/stl_vector.h:930
#7  0x000000000041e094 in WaveMesh::update (elapsed=0.060067000000003645, 
    this=0x7f54001930) at ../src/scene-buffer.cpp:163
#8  SceneBuffer::update (this=<optimized out>) at ../src/scene-buffer.cpp:434
#9  0x0000000000416494 in MainLoop::draw (this=0x11717500)

So, the bus error is triggered by the first str here:

    L(copy_long):
        and tmp1, dstin, 15
        bic dst, dstin, 15
        ldp D_l, D_h, [src]
        sub src, src, tmp1
        add count, count, tmp1  /* Count is now 16 too large.  */
        ldp A_l, A_h, [src, 16]
        //stp   D_l, D_h, [dstin]
        str D_l, [dstin] /* Oops, bus error! */
        str D_h, [dstin, 8]

Question:

  1. Why do I get a bus error here?
  2. How to fix it?

UPDATE

Here is the disassembly of memcpy:

(gdb) disassemble __memcpy_generic
Dump of assembler code for function __memcpy_generic:
   0x0000007f9272d2d0 <+0>: prfm    pldl1keep, [x1]
   0x0000007f9272d2d4 <+4>: add x4, x1, x2
   0x0000007f9272d2d8 <+8>: add x5, x0, x2
   0x0000007f9272d2dc <+12>:    cmp x2, #0x10
   0x0000007f9272d2e0 <+16>:    b.ls    0x7f9272d330 <__memcpy_generic+96>  // b.plast
   0x0000007f9272d2e4 <+20>:    cmp x2, #0x60
   0x0000007f9272d2e8 <+24>:    b.hi    0x7f9272d3e0 <__memcpy_generic+272>  // b.pmore
   0x0000007f9272d2ec <+28>:    sub x14, x2, #0x1
   0x0000007f9272d2f0 <+32>:    ldp x6, x7, [x1]
   0x0000007f9272d2f4 <+36>:    tbnz    w14, #6, 0x7f9272d390 <__memcpy_generic+192>
   0x0000007f9272d2f8 <+40>:    ldp x12, x13, [x4, #-16]
   0x0000007f9272d2fc <+44>:    tbz w14, #5, 0x7f9272d318 <__memcpy_generic+72>
   0x0000007f9272d300 <+48>:    ldp x8, x9, [x1, #16]
   0x0000007f9272d304 <+52>:    ldp x10, x11, [x4, #-32]
   0x0000007f9272d308 <+56>:    str x8, [x0, #16]
   0x0000007f9272d30c <+60>:    str x9, [x0, #24]
   0x0000007f9272d310 <+64>:    stur    x10, [x5, #-32]
   0x0000007f9272d314 <+68>:    stur    x11, [x5, #-24]
   0x0000007f9272d318 <+72>:    str x6, [x0]
   0x0000007f9272d31c <+76>:    str x7, [x0, #8]
   0x0000007f9272d320 <+80>:    stur    x12, [x5, #-16]
   0x0000007f9272d324 <+84>:    stur    x13, [x5, #-8]
   0x0000007f9272d328 <+88>:    ret
   0x0000007f9272d32c <+92>:    nop
   0x0000007f9272d330 <+96>:    cmp x2, #0x8
   0x0000007f9272d334 <+100>:   b.cc    0x7f9272d350 <__memcpy_generic+128>  // b.lo, b.ul, b.last
   0x0000007f9272d338 <+104>:   ldr x6, [x1]
   0x0000007f9272d33c <+108>:   ldur    x7, [x4, #-8]
   0x0000007f9272d340 <+112>:   str x6, [x0]
   0x0000007f9272d344 <+116>:   stur    x7, [x5, #-8]
   0x0000007f9272d348 <+120>:   ret
   0x0000007f9272d34c <+124>:   nop
   0x0000007f9272d350 <+128>:   tbz w2, #2, 0x7f9272d368 <__memcpy_generic+152>
   0x0000007f9272d354 <+132>:   ldr w6, [x1]
   0x0000007f9272d358 <+136>:   ldur    w7, [x4, #-4]
   0x0000007f9272d35c <+140>:   str w6, [x0]
   0x0000007f9272d360 <+144>:   stur    w7, [x5, #-4]
   0x0000007f9272d364 <+148>:   ret
   0x0000007f9272d368 <+152>:   cbz x2, 0x7f9272d388 <__memcpy_generic+184>
   0x0000007f9272d36c <+156>:   lsr x14, x2, #1
--Type <RET> for more, q to quit, c to continue without paging--
   0x0000007f9272d370 <+160>:   ldrb    w6, [x1]
   0x0000007f9272d374 <+164>:   ldurb   w7, [x4, #-1]
   0x0000007f9272d378 <+168>:   ldrb    w8, [x1, x14]
   0x0000007f9272d37c <+172>:   strb    w6, [x0]
   0x0000007f9272d380 <+176>:   strb    w8, [x0, x14]
   0x0000007f9272d384 <+180>:   sturb   w7, [x5, #-1]
   0x0000007f9272d388 <+184>:   ret
   0x0000007f9272d38c <+188>:   nop
   0x0000007f9272d390 <+192>:   ldp x8, x9, [x1, #16]
   0x0000007f9272d394 <+196>:   ldp x10, x11, [x1, #32]
   0x0000007f9272d398 <+200>:   ldp x12, x13, [x1, #48]
   0x0000007f9272d39c <+204>:   ldp x1, x2, [x4, #-32]
   0x0000007f9272d3a0 <+208>:   ldp x4, x3, [x4, #-16]
   0x0000007f9272d3a4 <+212>:   str x6, [x0]
   0x0000007f9272d3a8 <+216>:   str x7, [x0, #8]
   0x0000007f9272d3ac <+220>:   str x8, [x0, #16]
   0x0000007f9272d3b0 <+224>:   str x9, [x0, #24]
   0x0000007f9272d3b4 <+228>:   str x10, [x0, #32]
   0x0000007f9272d3b8 <+232>:   str x11, [x0, #40]
   0x0000007f9272d3bc <+236>:   str x12, [x0, #48]
   0x0000007f9272d3c0 <+240>:   str x13, [x0, #56]
   0x0000007f9272d3c4 <+244>:   stur    x1, [x5, #-32]
   0x0000007f9272d3c8 <+248>:   stur    x2, [x5, #-24]
   0x0000007f9272d3cc <+252>:   stur    x4, [x5, #-16]
   0x0000007f9272d3d0 <+256>:   stur    x3, [x5, #-8]
   0x0000007f9272d3d4 <+260>:   ret
   0x0000007f9272d3d8 <+264>:   nop
   0x0000007f9272d3dc <+268>:   nop
   0x0000007f9272d3e0 <+272>:   and x14, x0, #0xf
   0x0000007f9272d3e4 <+276>:   and x3, x0, #0xfffffffffffffff0
   0x0000007f9272d3e8 <+280>:   ldp x12, x13, [x1]
   0x0000007f9272d3ec <+284>:   sub x1, x1, x14
   0x0000007f9272d3f0 <+288>:   add x2, x2, x14
   0x0000007f9272d3f4 <+292>:   ldp x6, x7, [x1, #16]
=> 0x0000007f9272d3f8 <+296>:   str x12, [x0]
   0x0000007f9272d3fc <+300>:   str x13, [x0, #8]
   0x0000007f9272d400 <+304>:   ldp x8, x9, [x1, #32]
   0x0000007f9272d404 <+308>:   ldp x10, x11, [x1, #48]
   0x0000007f9272d408 <+312>:   ldp x12, x13, [x1, #64]!
   0x0000007f9272d40c <+316>:   subs    x2, x2, #0x90
   0x0000007f9272d410 <+320>:   b.ls    0x7f9272d450 <__memcpy_generic+384>  // b.plast
--Type <RET> for more, q to quit, c to continue without paging--
   0x0000007f9272d414 <+324>:   str x6, [x3, #16]
   0x0000007f9272d418 <+328>:   str x7, [x3, #24]
   0x0000007f9272d41c <+332>:   ldp x6, x7, [x1, #16]
   0x0000007f9272d420 <+336>:   str x8, [x3, #32]
   0x0000007f9272d424 <+340>:   str x9, [x3, #40]
   0x0000007f9272d428 <+344>:   ldp x8, x9, [x1, #32]
   0x0000007f9272d42c <+348>:   str x10, [x3, #48]
   0x0000007f9272d430 <+352>:   str x11, [x3, #56]
   0x0000007f9272d434 <+356>:   ldp x10, x11, [x1, #48]
   0x0000007f9272d438 <+360>:   str x12, [x3, #64]
   0x0000007f9272d43c <+364>:   str x13, [x3, #72]
   0x0000007f9272d440 <+368>:   add x3, x3, #0x40
   0x0000007f9272d444 <+372>:   ldp x12, x13, [x1, #64]!
   0x0000007f9272d448 <+376>:   subs    x2, x2, #0x40
   0x0000007f9272d44c <+380>:   b.hi    0x7f9272d414 <__memcpy_generic+324>  // b.pmore
   0x0000007f9272d450 <+384>:   ldp x1, x2, [x4, #-64]
   0x0000007f9272d454 <+388>:   str x6, [x3, #16]
   0x0000007f9272d458 <+392>:   str x7, [x3, #24]
   0x0000007f9272d45c <+396>:   ldp x6, x7, [x4, #-48]
   0x0000007f9272d460 <+400>:   str x8, [x3, #32]
   0x0000007f9272d464 <+404>:   str x9, [x3, #40]
   0x0000007f9272d468 <+408>:   ldp x8, x9, [x4, #-32]
   0x0000007f9272d46c <+412>:   str x10, [x3, #48]
   0x0000007f9272d470 <+416>:   str x11, [x3, #56]
   0x0000007f9272d474 <+420>:   ldp x10, x11, [x4, #-16]
   0x0000007f9272d478 <+424>:   str x12, [x3, #64]
   0x0000007f9272d47c <+428>:   str x13, [x3, #72]
   0x0000007f9272d480 <+432>:   stur    x1, [x5, #-64]
   0x0000007f9272d484 <+436>:   stur    x2, [x5, #-56]
   0x0000007f9272d488 <+440>:   stur    x6, [x5, #-48]
   0x0000007f9272d48c <+444>:   stur    x7, [x5, #-40]
   0x0000007f9272d490 <+448>:   stur    x8, [x5, #-32]
   0x0000007f9272d494 <+452>:   stur    x9, [x5, #-24]
   0x0000007f9272d498 <+456>:   stur    x10, [x5, #-16]
   0x0000007f9272d49c <+460>:   stur    x11, [x5, #-8]
   0x0000007f9272d4a0 <+464>:   ret
   0x0000007f9272d4a4 <+468>:   nop
   0x0000007f9272d4a8 <+472>:   nop
   0x0000007f9272d4ac <+476>:   nop
   0x0000007f9272d4b0 <+480>:   cbz x14, 0x7f9272d580 <__memcpy_generic+688>
   0x0000007f9272d4b4 <+484>:   add x4, x1, x2
--Type <RET> for more, q to quit, c to continue without paging--
   0x0000007f9272d4b8 <+488>:   add x5, x0, x2
   0x0000007f9272d4bc <+492>:   and x14, x5, #0xf
   0x0000007f9272d4c0 <+496>:   ldp x12, x13, [x4, #-16]
   0x0000007f9272d4c4 <+500>:   sub x4, x4, x14
   0x0000007f9272d4c8 <+504>:   sub x2, x2, x14
   0x0000007f9272d4cc <+508>:   ldp x6, x7, [x4, #-16]
   0x0000007f9272d4d0 <+512>:   stur    x12, [x5, #-16]
   0x0000007f9272d4d4 <+516>:   stur    x13, [x5, #-8]
   0x0000007f9272d4d8 <+520>:   ldp x8, x9, [x4, #-32]
   0x0000007f9272d4dc <+524>:   ldp x10, x11, [x4, #-48]
   0x0000007f9272d4e0 <+528>:   ldp x12, x13, [x4, #-64]!
   0x0000007f9272d4e4 <+532>:   sub x5, x5, x14
   0x0000007f9272d4e8 <+536>:   subs    x2, x2, #0x80
   0x0000007f9272d4ec <+540>:   b.ls    0x7f9272d530 <__memcpy_generic+608>  // b.plast
   0x0000007f9272d4f0 <+544>:   nop
   0x0000007f9272d4f4 <+548>:   stur    x6, [x5, #-16]
   0x0000007f9272d4f8 <+552>:   stur    x7, [x5, #-8]
   0x0000007f9272d4fc <+556>:   ldp x6, x7, [x4, #-16]
   0x0000007f9272d500 <+560>:   stur    x8, [x5, #-32]
   0x0000007f9272d504 <+564>:   stur    x9, [x5, #-24]
   0x0000007f9272d508 <+568>:   ldp x8, x9, [x4, #-32]
   0x0000007f9272d50c <+572>:   stur    x10, [x5, #-48]
   0x0000007f9272d510 <+576>:   stur    x11, [x5, #-40]
   0x0000007f9272d514 <+580>:   ldp x10, x11, [x4, #-48]
   0x0000007f9272d518 <+584>:   stur    x12, [x5, #-64]
   0x0000007f9272d51c <+588>:   stur    x13, [x5, #-56]
   0x0000007f9272d520 <+592>:   sub x5, x5, #0x40
   0x0000007f9272d524 <+596>:   ldp x12, x13, [x4, #-64]!
   0x0000007f9272d528 <+600>:   subs    x2, x2, #0x40
   0x0000007f9272d52c <+604>:   b.hi    0x7f9272d4f4 <__memcpy_generic+548>  // b.pmore
   0x0000007f9272d530 <+608>:   ldp x2, x3, [x1, #48]
   0x0000007f9272d534 <+612>:   stur    x6, [x5, #-16]
   0x0000007f9272d538 <+616>:   stur    x7, [x5, #-8]
   0x0000007f9272d53c <+620>:   ldp x6, x7, [x1, #32]
   0x0000007f9272d540 <+624>:   stur    x8, [x5, #-32]
   0x0000007f9272d544 <+628>:   stur    x9, [x5, #-24]
   0x0000007f9272d548 <+632>:   ldp x8, x9, [x1, #16]
   0x0000007f9272d54c <+636>:   stur    x10, [x5, #-48]
   0x0000007f9272d550 <+640>:   stur    x11, [x5, #-40]
   0x0000007f9272d554 <+644>:   ldp x10, x11, [x1]
   0x0000007f9272d558 <+648>:   stur    x12, [x5, #-64]
--Type <RET> for more, q to quit, c to continue without paging--
   0x0000007f9272d55c <+652>:   stur    x13, [x5, #-56]
   0x0000007f9272d560 <+656>:   str x2, [x0, #48]
   0x0000007f9272d564 <+660>:   str x3, [x0, #56]
   0x0000007f9272d568 <+664>:   str x6, [x0, #32]
   0x0000007f9272d56c <+668>:   str x7, [x0, #40]
   0x0000007f9272d570 <+672>:   str x8, [x0, #16]
   0x0000007f9272d574 <+676>:   str x9, [x0, #24]
   0x0000007f9272d578 <+680>:   str x10, [x0]
   0x0000007f9272d57c <+684>:   str x11, [x0, #8]
   0x0000007f9272d580 <+688>:   ret
End of assembler dump.

UPDATE2

The root cause should be that dst is not aligned to 8 bytes (the size of a 64-bit x register store), which triggered the bus error. dst in glmark2 is float *dest(dest_start + nfloats * iter->first);, where nfloats is 3, iter->first is 121 and dest_start is 0xf7ff070f000, so dest is dest_start + (nfloats * iter->first * sizeof(float)) = dest_start + 1452 bytes. 1452 is a multiple of 4 but not of 8, so dest is only 4-byte aligned; it will never be aligned to 8.

标签: assembly, glibc, memory-alignment, arm64, bus-error

解决方案


推荐阅读