* [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
@ 2019-05-16 15:53 Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 1/5] jump_label: Don't warn on __exit jump entries Andrew Murray
                   ` (5 more replies)
  0 siblings, 6 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-16 15:53 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ard.Biesheuvel
  Cc: Boqun Feng, linux-arm-kernel

When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
or toolchain doesn't support it, the existing code will fall back to ll/sc
atomics. It achieves this by branching from inline assembly to a function
that is built with special compile flags. Further, this results in the
clobbering of registers even when the fallback isn't used, increasing
register pressure.

Let's improve this by providing inline implementations of both LSE and
ll/sc atomics and using a static key to select between them. This allows
the compiler to generate better atomics code.
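
For illustration, the selection boils down to the following shape (taken
from patch 3 of this series, shown here only as a sketch):

#define __lse_ll_sc_body(op, ...)					\
({									\
	system_uses_lse_atomics() ?					\
		__lse_##op(__VA_ARGS__) :				\
		__ll_sc_##op(__VA_ARGS__);				\
})

/* e.g. the arch_atomic_add() wrapper generated by patch 3 */
static inline void arch_atomic_add(int i, atomic_t *v)
{
	__lse_ll_sc_body(atomic_add, i, v);
}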

Build and boot tested, along with atomic_64_test.

Following is the assembly of a function that has three consecutive
atomic_add calls when built with LSE and this patchset:

Dump of assembler code for function atomics_test:
   0xffff000010084338 <+0>:     stp     x29, x30, [sp, #-32]!
   0xffff00001008433c <+4>:     adrp    x0, 0xffff0000112dd000 <crypto_ft_tab+2368>
   0xffff000010084340 <+8>:     add     x1, x0, #0x6c8
   0xffff000010084344 <+12>:    mov     x29, sp
   0xffff000010084348 <+16>:    ldr     x2, [x1]
   0xffff00001008434c <+20>:    str     x2, [x29, #24]
   0xffff000010084350 <+24>:    mov     x2, #0x0                        // #0
   0xffff000010084354 <+28>:    b       0xffff000010084394 <atomics_test+92>
   0xffff000010084358 <+32>:    b       0xffff000010084394 <atomics_test+92>
   0xffff00001008435c <+36>:    mov     w1, #0x18                       // #24
   0xffff000010084360 <+40>:    add     x2, x29, #0x14
   0xffff000010084364 <+44>:    stadd   w1, [x2]
   0xffff000010084368 <+48>:    b       0xffff0000100843b0 <atomics_test+120>
   0xffff00001008436c <+52>:    b       0xffff0000100843b0 <atomics_test+120>
   0xffff000010084370 <+56>:    mov     w1, #0x18                       // #24
   0xffff000010084374 <+60>:    add     x2, x29, #0x14
   0xffff000010084378 <+64>:    stadd   w1, [x2]
   0xffff00001008437c <+68>:    b       0xffff0000100843cc <atomics_test+148>
   0xffff000010084380 <+72>:    b       0xffff0000100843cc <atomics_test+148>
   0xffff000010084384 <+76>:    mov     w1, #0x18                       // #24
   0xffff000010084388 <+80>:    add     x2, x29, #0x14
   0xffff00001008438c <+84>:    stadd   w1, [x2]
   0xffff000010084390 <+88>:    b       0xffff0000100843e4 <atomics_test+172>
   0xffff000010084394 <+92>:    add     x3, x29, #0x14
   0xffff000010084398 <+96>:    prfm    pstl1strm, [x3]
   0xffff00001008439c <+100>:   ldxr    w1, [x3]
   0xffff0000100843a0 <+104>:   add     w1, w1, #0x18
   0xffff0000100843a4 <+108>:   stxr    w2, w1, [x3]
   0xffff0000100843a8 <+112>:   cbnz    w2, 0xffff00001008439c <atomics_test+100>
   0xffff0000100843ac <+116>:   b       0xffff000010084368 <atomics_test+48>
   0xffff0000100843b0 <+120>:   add     x3, x29, #0x14
   0xffff0000100843b4 <+124>:   prfm    pstl1strm, [x3]
   0xffff0000100843b8 <+128>:   ldxr    w1, [x3]
   0xffff0000100843bc <+132>:   add     w1, w1, #0x18
   0xffff0000100843c0 <+136>:   stxr    w2, w1, [x3]
   0xffff0000100843c4 <+140>:   cbnz    w2, 0xffff0000100843b8 <atomics_test+128>
   0xffff0000100843c8 <+144>:   b       0xffff00001008437c <atomics_test+68>
   0xffff0000100843cc <+148>:   add     x3, x29, #0x14
   0xffff0000100843d0 <+152>:   prfm    pstl1strm, [x3]
   0xffff0000100843d4 <+156>:   ldxr    w1, [x3]
   0xffff0000100843d8 <+160>:   add     w1, w1, #0x18
   0xffff0000100843dc <+164>:   stxr    w2, w1, [x3]
   0xffff0000100843e0 <+168>:   cbnz    w2, 0xffff0000100843d4 <atomics_test+156>
   0xffff0000100843e4 <+172>:   add     x0, x0, #0x6c8
   0xffff0000100843e8 <+176>:   ldr     x1, [x29, #24]
   0xffff0000100843ec <+180>:   ldr     x0, [x0]
   0xffff0000100843f0 <+184>:   eor     x0, x1, x0
   0xffff0000100843f4 <+188>:   cbnz    x0, 0xffff000010084400 <atomics_test+200>
   0xffff0000100843f8 <+192>:   ldp     x29, x30, [sp], #32
   0xffff0000100843fc <+196>:   ret
   0xffff000010084400 <+200>:   bl      0xffff0000100db740 <__stack_chk_fail>
End of assembler dump.

The two branches before each group of atomics relate to the two static
keys, which both become NOPs when LSE is available. When LSE isn't
available, the branches are taken to run the slow-path LL/SC fallback
atomics.
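
For reference, the two branches correspond to the two static_branch_likely()
checks in system_uses_lse_atomics(), added by patch 3:

static inline bool system_uses_lse_atomics(void)
{
	return (IS_ENABLED(CONFIG_ARM64_LSE_ATOMICS) &&
		IS_ENABLED(CONFIG_AS_LSE) &&
		static_branch_likely(&arm64_const_caps_ready)) &&
		static_branch_likely(&cpu_hwcap_keys[ARM64_HAS_LSE_ATOMICS]);
}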

Where CONFIG_ARM64_LSE_ATOMICS isn't enabled, the same function looks as
follows:

Dump of assembler code for function atomics_test:
   0xffff000010084338 <+0>:     stp     x29, x30, [sp, #-32]!
   0xffff00001008433c <+4>:     adrp    x0, 0xffff00001126d000 <crypto_ft_tab+2368>
   0xffff000010084340 <+8>:     add     x0, x0, #0x6c8
   0xffff000010084344 <+12>:    mov     x29, sp
   0xffff000010084348 <+16>:    add     x3, x29, #0x14
   0xffff00001008434c <+20>:    ldr     x1, [x0]
   0xffff000010084350 <+24>:    str     x1, [x29, #24]
   0xffff000010084354 <+28>:    mov     x1, #0x0                        // #0
   0xffff000010084358 <+32>:    prfm    pstl1strm, [x3]
   0xffff00001008435c <+36>:    ldxr    w1, [x3]
   0xffff000010084360 <+40>:    add     w1, w1, #0x18
   0xffff000010084364 <+44>:    stxr    w2, w1, [x3]
   0xffff000010084368 <+48>:    cbnz    w2, 0xffff00001008435c <atomics_test+36>
   0xffff00001008436c <+52>:    prfm    pstl1strm, [x3]
   0xffff000010084370 <+56>:    ldxr    w1, [x3]
   0xffff000010084374 <+60>:    add     w1, w1, #0x18
   0xffff000010084378 <+64>:    stxr    w2, w1, [x3]
   0xffff00001008437c <+68>:    cbnz    w2, 0xffff000010084370 <atomics_test+56>
   0xffff000010084380 <+72>:    prfm    pstl1strm, [x3]
   0xffff000010084384 <+76>:    ldxr    w1, [x3]
   0xffff000010084388 <+80>:    add     w1, w1, #0x18
   0xffff00001008438c <+84>:    stxr    w2, w1, [x3]
   0xffff000010084390 <+88>:    cbnz    w2, 0xffff000010084384 <atomics_test+76>
   0xffff000010084394 <+92>:    ldr     x1, [x29, #24]
   0xffff000010084398 <+96>:    ldr     x0, [x0]
   0xffff00001008439c <+100>:   eor     x0, x1, x0
   0xffff0000100843a0 <+104>:   cbnz    x0, 0xffff0000100843ac <atomics_test+116>
   0xffff0000100843a4 <+108>:   ldp     x29, x30, [sp], #32
   0xffff0000100843a8 <+112>:   ret
   0xffff0000100843ac <+116>:   bl      0xffff0000100da4f0 <__stack_chk_fail>
End of assembler dump.

These changes add a small amount of bloat on defconfig according to
bloat-o-meter:

text:
  add/remove: 1/108 grow/shrink: 3448/20 up/down: 272768/-4320 (268448)
  Total: Before=12363112, After=12631560, chg +2.17%

data:
  add/remove: 0/95 grow/shrink: 2/0 up/down: 40/-3251 (-3211)
  Total: Before=4628123, After=4624912, chg -0.07%


Andrew Murray (5):
  jump_label: Don't warn on __exit jump entries
  arm64: Use correct ll/sc atomic constraints
  arm64: atomics: avoid out-of-line ll/sc atomics
  arm64: avoid using hard-coded registers for LSE atomics
  arm64: atomics: remove atomic_ll_sc compilation unit

 arch/arm64/include/asm/atomic.h       |  11 +-
 arch/arm64/include/asm/atomic_arch.h  | 154 ++++++++++
 arch/arm64/include/asm/atomic_ll_sc.h | 164 +++++------
 arch/arm64/include/asm/atomic_lse.h   | 395 +++++++++-----------------
 arch/arm64/include/asm/cmpxchg.h      |   2 +-
 arch/arm64/include/asm/lse.h          |  11 -
 arch/arm64/lib/Makefile               |  19 --
 arch/arm64/lib/atomic_ll_sc.c         |   3 -
 kernel/jump_label.c                   |  16 +-
 9 files changed, 375 insertions(+), 400 deletions(-)
 create mode 100644 arch/arm64/include/asm/atomic_arch.h
 delete mode 100644 arch/arm64/lib/atomic_ll_sc.c

-- 
2.21.0



* [PATCH v1 1/5] jump_label: Don't warn on __exit jump entries
  2019-05-16 15:53 [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Andrew Murray
@ 2019-05-16 15:53 ` Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 2/5] arm64: Use correct ll/sc atomic constraints Andrew Murray
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-16 15:53 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ard.Biesheuvel
  Cc: Boqun Feng, linux-arm-kernel

On architectures that discard .exit.* sections at runtime, a
warning is printed for each jump label that is used within an
in-kernel __exit annotated function:

can't patch jump_label at ehci_hcd_cleanup+0x8/0x3c
WARNING: CPU: 0 PID: 1 at kernel/jump_label.c:395 __jump_label_update+0x140/0x168

As these functions will never be executed (they are freed along with the
rest of initmem), we do not need to patch them and should not display any
warnings.

The warning is displayed because the test required to satisfy
jump_entry_is_init is based on init_section_contains (__init_begin to
__init_end), whereas the test in __jump_label_update is based on
init_kernel_text (_sinittext to _einittext, via kernel_text_address).
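
A rough sketch of the two tests involved (illustrative only - the helper
names below are made up for clarity, the real checks live in the jump_label
and extable code):

static bool entry_marked_init(unsigned long addr)
{
	/* what jump_entry_is_init() is derived from */
	return init_section_contains((void *)addr, 1);	/* __init_begin..__init_end */
}

static bool entry_looks_patchable(unsigned long addr)
{
	/* what __jump_label_update() actually tests */
	return kernel_text_address(addr);	/* init text via _sinittext.._einittext */
}

An __exit function can fail the second test without being caught by the
first, hence the spurious warning.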

In addition to fixing this, we also remove an out-of-date comment
and use a WARN instead of a WARN_ONCE.

Fixes: 19483677684b ("jump_label: Annotate entries that operate on __init code earlier")
Signed-off-by: Andrew Murray <andrew.murray@arm.com>
---
 kernel/jump_label.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bad96b476eb6..f2e36b4b06a7 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -380,16 +380,18 @@ static void __jump_label_update(struct static_key *key,
 				bool init)
 {
 	for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
-		/*
-		 * An entry->code of 0 indicates an entry which has been
-		 * disabled because it was in an init text area.
-		 */
 		if (init || !jump_entry_is_init(entry)) {
 			if (kernel_text_address(jump_entry_code(entry)))
 				arch_jump_label_transform(entry, jump_label_type(entry));
-			else
-				WARN_ONCE(1, "can't patch jump_label at %pS",
-					  (void *)jump_entry_code(entry));
+
+			/*
+			 * kernel_text_address will return 0 for .exit.text
+			 * symbols, we don't need to patch these so suppress
+			 * this warning.
+			 */
+			else if (!jump_entry_is_init(entry))
+				WARN(1, "can't patch jump_label at %pS",
+				     (void *)jump_entry_code(entry));
 		}
 	}
 }
-- 
2.21.0



* [PATCH v1 2/5] arm64: Use correct ll/sc atomic constraints
  2019-05-16 15:53 [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 1/5] jump_label: Don't warn on __exit jump entries Andrew Murray
@ 2019-05-16 15:53 ` Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 3/5] arm64: atomics: avoid out-of-line ll/sc atomics Andrew Murray
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-16 15:53 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ard.Biesheuvel
  Cc: Boqun Feng, linux-arm-kernel

For many of the ll/sc atomic operations we use the 'I' machine constraint
regardless of the instruction used - this may not be optimal.

Let's add an additional parameter to the ATOMIC_xx macros that allows the
caller to specify an appropriate machine constraint.

Let's also improve __CMPXCHG_CASE by replacing the 'K' constraint with a
caller-provided constraint. Please note that whilst we would like to use
the 'K' constraint on 32-bit operations, we choose not to provide any
constraint to avoid a GCC bug which results in a build error.

Earlier versions of GCC (no later than 8.1.0) appear to incorrectly handle
the 'K' constraint for the value 4294967295.
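
For context, the constraint letters map onto AArch64 immediate forms (per
GCC's machine constraints): 'I' is an arithmetic (add) immediate, 'J' a
negative arithmetic (sub) immediate, and 'K'/'L' are 32-bit/64-bit logical
immediates. A sketch of what a matching constraint buys us, mirroring the
existing add case (example_atomic_add is a made-up name for illustration):

static inline void example_atomic_add(int i, atomic_t *v)
{
	unsigned long tmp;
	int result;

	asm volatile(
	"	prfm	pstl1strm, %2\n"
	"1:	ldxr	%w0, %2\n"
	"	add	%w0, %w0, %w3\n"
	"	stxr	%w1, %w0, %2\n"
	"	cbnz	%w1, 1b"
	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
	: "Ir" (i));	/* 'I' lets the compiler emit an add immediate directly */
}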

Signed-off-by: Andrew Murray <andrew.murray@arm.com>
---
 arch/arm64/include/asm/atomic_ll_sc.h | 89 ++++++++++++++-------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index e321293e0c89..c2ce0c75fc0b 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -37,7 +37,7 @@
  * (the optimize attribute silently ignores these options).
  */
 
-#define ATOMIC_OP(op, asm_op)						\
+#define ATOMIC_OP(op, asm_op, constraint)				\
 __LL_SC_INLINE void							\
 __LL_SC_PREFIX(arch_atomic_##op(int i, atomic_t *v))			\
 {									\
@@ -51,11 +51,11 @@ __LL_SC_PREFIX(arch_atomic_##op(int i, atomic_t *v))			\
 "	stxr	%w1, %w0, %2\n"						\
 "	cbnz	%w1, 1b"						\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i));							\
+	: #constraint "r" (i));						\
 }									\
 __LL_SC_EXPORT(arch_atomic_##op);
 
-#define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op)		\
+#define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
 __LL_SC_INLINE int							\
 __LL_SC_PREFIX(arch_atomic_##op##_return##name(int i, atomic_t *v))	\
 {									\
@@ -70,14 +70,14 @@ __LL_SC_PREFIX(arch_atomic_##op##_return##name(int i, atomic_t *v))	\
 "	cbnz	%w1, 1b\n"						\
 "	" #mb								\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i)							\
+	: #constraint "r" (i)						\
 	: cl);								\
 									\
 	return result;							\
 }									\
 __LL_SC_EXPORT(arch_atomic_##op##_return##name);
 
-#define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op)		\
+#define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)	\
 __LL_SC_INLINE int							\
 __LL_SC_PREFIX(arch_atomic_fetch_##op##name(int i, atomic_t *v))	\
 {									\
@@ -92,7 +92,7 @@ __LL_SC_PREFIX(arch_atomic_fetch_##op##name(int i, atomic_t *v))	\
 "	cbnz	%w2, 1b\n"						\
 "	" #mb								\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
-	: "Ir" (i)							\
+	: #constraint "r" (i)						\
 	: cl);								\
 									\
 	return result;							\
@@ -110,8 +110,8 @@ __LL_SC_EXPORT(arch_atomic_fetch_##op##name);
 	ATOMIC_FETCH_OP (_acquire,        , a,  , "memory", __VA_ARGS__)\
 	ATOMIC_FETCH_OP (_release,        ,  , l, "memory", __VA_ARGS__)
 
-ATOMIC_OPS(add, add)
-ATOMIC_OPS(sub, sub)
+ATOMIC_OPS(add, add, I)
+ATOMIC_OPS(sub, sub, J)
 
 #undef ATOMIC_OPS
 #define ATOMIC_OPS(...)							\
@@ -121,17 +121,17 @@ ATOMIC_OPS(sub, sub)
 	ATOMIC_FETCH_OP (_acquire,        , a,  , "memory", __VA_ARGS__)\
 	ATOMIC_FETCH_OP (_release,        ,  , l, "memory", __VA_ARGS__)
 
-ATOMIC_OPS(and, and)
-ATOMIC_OPS(andnot, bic)
-ATOMIC_OPS(or, orr)
-ATOMIC_OPS(xor, eor)
+ATOMIC_OPS(and, and, K)
+ATOMIC_OPS(andnot, bic, )
+ATOMIC_OPS(or, orr, K)
+ATOMIC_OPS(xor, eor, K)
 
 #undef ATOMIC_OPS
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#define ATOMIC64_OP(op, asm_op)						\
+#define ATOMIC64_OP(op, asm_op, constraint)				\
 __LL_SC_INLINE void							\
 __LL_SC_PREFIX(arch_atomic64_##op(long i, atomic64_t *v))		\
 {									\
@@ -145,11 +145,11 @@ __LL_SC_PREFIX(arch_atomic64_##op(long i, atomic64_t *v))		\
 "	stxr	%w1, %0, %2\n"						\
 "	cbnz	%w1, 1b"						\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i));							\
+	: #constraint "r" (i));						\
 }									\
 __LL_SC_EXPORT(arch_atomic64_##op);
 
-#define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op)		\
+#define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
 __LL_SC_INLINE long							\
 __LL_SC_PREFIX(arch_atomic64_##op##_return##name(long i, atomic64_t *v))\
 {									\
@@ -164,14 +164,14 @@ __LL_SC_PREFIX(arch_atomic64_##op##_return##name(long i, atomic64_t *v))\
 "	cbnz	%w1, 1b\n"						\
 "	" #mb								\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
-	: "Ir" (i)							\
+	: #constraint "r" (i)						\
 	: cl);								\
 									\
 	return result;							\
 }									\
 __LL_SC_EXPORT(arch_atomic64_##op##_return##name);
 
-#define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op)		\
+#define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)\
 __LL_SC_INLINE long							\
 __LL_SC_PREFIX(arch_atomic64_fetch_##op##name(long i, atomic64_t *v))	\
 {									\
@@ -186,7 +186,7 @@ __LL_SC_PREFIX(arch_atomic64_fetch_##op##name(long i, atomic64_t *v))	\
 "	cbnz	%w2, 1b\n"						\
 "	" #mb								\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
-	: "Ir" (i)							\
+	: #constraint "r" (i)						\
 	: cl);								\
 									\
 	return result;							\
@@ -204,8 +204,8 @@ __LL_SC_EXPORT(arch_atomic64_fetch_##op##name);
 	ATOMIC64_FETCH_OP (_acquire,, a,  , "memory", __VA_ARGS__)	\
 	ATOMIC64_FETCH_OP (_release,,  , l, "memory", __VA_ARGS__)
 
-ATOMIC64_OPS(add, add)
-ATOMIC64_OPS(sub, sub)
+ATOMIC64_OPS(add, add, I)
+ATOMIC64_OPS(sub, sub, J)
 
 #undef ATOMIC64_OPS
 #define ATOMIC64_OPS(...)						\
@@ -215,10 +215,10 @@ ATOMIC64_OPS(sub, sub)
 	ATOMIC64_FETCH_OP (_acquire,, a,  , "memory", __VA_ARGS__)	\
 	ATOMIC64_FETCH_OP (_release,,  , l, "memory", __VA_ARGS__)
 
-ATOMIC64_OPS(and, and)
-ATOMIC64_OPS(andnot, bic)
-ATOMIC64_OPS(or, orr)
-ATOMIC64_OPS(xor, eor)
+ATOMIC64_OPS(and, and, K)
+ATOMIC64_OPS(andnot, bic, )
+ATOMIC64_OPS(or, orr, K)
+ATOMIC64_OPS(xor, eor, K)
 
 #undef ATOMIC64_OPS
 #undef ATOMIC64_FETCH_OP
@@ -248,7 +248,7 @@ __LL_SC_PREFIX(arch_atomic64_dec_if_positive(atomic64_t *v))
 }
 __LL_SC_EXPORT(arch_atomic64_dec_if_positive);
 
-#define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl)		\
+#define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl, constraint)	\
 __LL_SC_INLINE u##sz							\
 __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,		\
 					 unsigned long old,		\
@@ -276,29 +276,34 @@ __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,		\
 	"2:"								\
 	: [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),			\
 	  [v] "+Q" (*(u##sz *)ptr)					\
-	: [old] "Kr" (old), [new] "r" (new)				\
+	: [old] #constraint "r" (old), [new] "r" (new)			\
 	: cl);								\
 									\
 	return oldval;							\
 }									\
 __LL_SC_EXPORT(__cmpxchg_case_##name##sz);
 
-__CMPXCHG_CASE(w, b,     ,  8,        ,  ,  ,         )
-__CMPXCHG_CASE(w, h,     , 16,        ,  ,  ,         )
-__CMPXCHG_CASE(w,  ,     , 32,        ,  ,  ,         )
-__CMPXCHG_CASE( ,  ,     , 64,        ,  ,  ,         )
-__CMPXCHG_CASE(w, b, acq_,  8,        , a,  , "memory")
-__CMPXCHG_CASE(w, h, acq_, 16,        , a,  , "memory")
-__CMPXCHG_CASE(w,  , acq_, 32,        , a,  , "memory")
-__CMPXCHG_CASE( ,  , acq_, 64,        , a,  , "memory")
-__CMPXCHG_CASE(w, b, rel_,  8,        ,  , l, "memory")
-__CMPXCHG_CASE(w, h, rel_, 16,        ,  , l, "memory")
-__CMPXCHG_CASE(w,  , rel_, 32,        ,  , l, "memory")
-__CMPXCHG_CASE( ,  , rel_, 64,        ,  , l, "memory")
-__CMPXCHG_CASE(w, b,  mb_,  8, dmb ish,  , l, "memory")
-__CMPXCHG_CASE(w, h,  mb_, 16, dmb ish,  , l, "memory")
-__CMPXCHG_CASE(w,  ,  mb_, 32, dmb ish,  , l, "memory")
-__CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,  , l, "memory")
+/*
+ * Earlier versions of GCC (no later than 8.1.0) appear to incorrectly
+ * handle the 'K' constraint for the value 4294967295 - thus we use no
+ * constraint for 32 bit operations.
+ */
+__CMPXCHG_CASE(w, b,     ,  8,        ,  ,  ,         , )
+__CMPXCHG_CASE(w, h,     , 16,        ,  ,  ,         , )
+__CMPXCHG_CASE(w,  ,     , 32,        ,  ,  ,         , )
+__CMPXCHG_CASE( ,  ,     , 64,        ,  ,  ,         , L)
+__CMPXCHG_CASE(w, b, acq_,  8,        , a,  , "memory", )
+__CMPXCHG_CASE(w, h, acq_, 16,        , a,  , "memory", )
+__CMPXCHG_CASE(w,  , acq_, 32,        , a,  , "memory", )
+__CMPXCHG_CASE( ,  , acq_, 64,        , a,  , "memory", L)
+__CMPXCHG_CASE(w, b, rel_,  8,        ,  , l, "memory", )
+__CMPXCHG_CASE(w, h, rel_, 16,        ,  , l, "memory", )
+__CMPXCHG_CASE(w,  , rel_, 32,        ,  , l, "memory", )
+__CMPXCHG_CASE( ,  , rel_, 64,        ,  , l, "memory", L)
+__CMPXCHG_CASE(w, b,  mb_,  8, dmb ish,  , l, "memory", )
+__CMPXCHG_CASE(w, h,  mb_, 16, dmb ish,  , l, "memory", )
+__CMPXCHG_CASE(w,  ,  mb_, 32, dmb ish,  , l, "memory", )
+__CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,  , l, "memory", L)
 
 #undef __CMPXCHG_CASE
 
-- 
2.21.0



* [PATCH v1 3/5] arm64: atomics: avoid out-of-line ll/sc atomics
  2019-05-16 15:53 [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 1/5] jump_label: Don't warn on __exit jump entries Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 2/5] arm64: Use correct ll/sc atomic constraints Andrew Murray
@ 2019-05-16 15:53 ` Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 4/5] arm64: avoid using hard-coded registers for LSE atomics Andrew Murray
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-16 15:53 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ard.Biesheuvel
  Cc: Boqun Feng, linux-arm-kernel

When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
or toolchain doesn't support it, the existing code will fall back to ll/sc
atomics. It achieves this by branching from inline assembly to a function
that is built with special compile flags. Further, this results in the
clobbering of registers even when the fallback isn't used, increasing
register pressure.

Let's improve this by providing inline implementations of both LSE and
ll/sc atomics and using a static key to select between them. This allows
the compiler to generate better atomics code.
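
In other words, after this patch a call such as atomic_add() expands
(roughly) to:

static inline void arch_atomic_add(int i, atomic_t *v)
{
	if (system_uses_lse_atomics())
		__lse_atomic_add(i, v);		/* single LSE instruction (stadd) */
	else
		__ll_sc_atomic_add(i, v);	/* inline ldxr/stxr loop */
}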

Please note that as atomic_arch.h is included indirectly by kernel.h
(via bitops.h), we cannot depend on features provided later in the kernel.h
file. This prevents us from placing the system_uses_lse_atomics function
in cpu_feature.h due to its dependencies.

Signed-off-by: Andrew Murray <andrew.murray@arm.com>
---
 arch/arm64/include/asm/atomic.h       |  11 +-
 arch/arm64/include/asm/atomic_arch.h  | 154 +++++++++++
 arch/arm64/include/asm/atomic_ll_sc.h |  77 ++----
 arch/arm64/include/asm/atomic_lse.h   | 365 ++++++++------------------
 arch/arm64/include/asm/cmpxchg.h      |   2 +-
 arch/arm64/include/asm/lse.h          |  11 -
 6 files changed, 299 insertions(+), 321 deletions(-)
 create mode 100644 arch/arm64/include/asm/atomic_arch.h

diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 1f4e9ee641c9..8f9cecc5c475 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -28,16 +28,7 @@
 
 #ifdef __KERNEL__
 
-#define __ARM64_IN_ATOMIC_IMPL
-
-#if defined(CONFIG_ARM64_LSE_ATOMICS) && defined(CONFIG_AS_LSE)
-#include <asm/atomic_lse.h>
-#else
-#include <asm/atomic_ll_sc.h>
-#endif
-
-#undef __ARM64_IN_ATOMIC_IMPL
-
+#include <asm/atomic_arch.h>
 #include <asm/cmpxchg.h>
 
 #define ATOMIC_INIT(i)	{ (i) }
diff --git a/arch/arm64/include/asm/atomic_arch.h b/arch/arm64/include/asm/atomic_arch.h
new file mode 100644
index 000000000000..4955dcf3634c
--- /dev/null
+++ b/arch/arm64/include/asm/atomic_arch.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Selection between LSE and LL/SC atomics.
+ *
+ * Copyright (C) 2018 ARM Ltd.
+ * Author: Andrew Murray <andrew.murray@arm.com>
+ */
+
+#ifndef __ASM_ATOMIC_ARCH_H
+#define __ASM_ATOMIC_ARCH_H
+
+#include <asm/atomic_lse.h>
+#include <asm/atomic_ll_sc.h>
+
+#include <linux/jump_label.h>
+#include <asm/cpucaps.h>
+
+extern struct static_key_false cpu_hwcap_keys[ARM64_NCAPS];
+extern struct static_key_false arm64_const_caps_ready;
+
+static inline bool system_uses_lse_atomics(void)
+{
+	return (IS_ENABLED(CONFIG_ARM64_LSE_ATOMICS) &&
+		IS_ENABLED(CONFIG_AS_LSE) &&
+		static_branch_likely(&arm64_const_caps_ready)) &&
+		static_branch_likely(&cpu_hwcap_keys[ARM64_HAS_LSE_ATOMICS]);
+}
+
+#define __lse_ll_sc_body(op, ...)					\
+({									\
+	system_uses_lse_atomics() ?					\
+		__lse_##op(__VA_ARGS__) :				\
+		__ll_sc_##op(__VA_ARGS__);				\
+})
+
+#define ATOMIC_OP(op)							\
+static inline void arch_##op(int i, atomic_t *v)			\
+{									\
+	__lse_ll_sc_body(op, i, v);					\
+}
+
+ATOMIC_OP(atomic_andnot)
+ATOMIC_OP(atomic_or)
+ATOMIC_OP(atomic_xor)
+ATOMIC_OP(atomic_add)
+ATOMIC_OP(atomic_and)
+ATOMIC_OP(atomic_sub)
+
+
+#define ATOMIC_FETCH_OP(name, op)					\
+static inline int arch_##op##name(int i, atomic_t *v)			\
+{									\
+	return __lse_ll_sc_body(op, i, v);				\
+}
+
+#define ATOMIC_FETCH_OPS(op)						\
+	ATOMIC_FETCH_OP(_relaxed, op)					\
+	ATOMIC_FETCH_OP(_acquire, op)					\
+	ATOMIC_FETCH_OP(_release, op)					\
+	ATOMIC_FETCH_OP(        , op)
+
+ATOMIC_FETCH_OPS(atomic_fetch_andnot)
+ATOMIC_FETCH_OPS(atomic_fetch_or)
+ATOMIC_FETCH_OPS(atomic_fetch_xor)
+ATOMIC_FETCH_OPS(atomic_fetch_add)
+ATOMIC_FETCH_OPS(atomic_fetch_and)
+ATOMIC_FETCH_OPS(atomic_fetch_sub)
+ATOMIC_FETCH_OPS(atomic_add_return)
+ATOMIC_FETCH_OPS(atomic_sub_return)
+
+
+#define ATOMIC64_OP(op)							\
+static inline void arch_##op(long i, atomic64_t *v)			\
+{									\
+	__lse_ll_sc_body(op, i, v);					\
+}
+
+ATOMIC64_OP(atomic64_andnot)
+ATOMIC64_OP(atomic64_or)
+ATOMIC64_OP(atomic64_xor)
+ATOMIC64_OP(atomic64_add)
+ATOMIC64_OP(atomic64_and)
+ATOMIC64_OP(atomic64_sub)
+
+
+#define ATOMIC64_FETCH_OP(name, op)					\
+static inline long arch_##op##name(long i, atomic64_t *v)		\
+{									\
+	return __lse_ll_sc_body(op, i, v);				\
+}
+
+#define ATOMIC64_FETCH_OPS(op)						\
+	ATOMIC64_FETCH_OP(_relaxed, op)					\
+	ATOMIC64_FETCH_OP(_acquire, op)					\
+	ATOMIC64_FETCH_OP(_release, op)					\
+	ATOMIC64_FETCH_OP(        , op)
+
+ATOMIC64_FETCH_OPS(atomic64_fetch_andnot)
+ATOMIC64_FETCH_OPS(atomic64_fetch_or)
+ATOMIC64_FETCH_OPS(atomic64_fetch_xor)
+ATOMIC64_FETCH_OPS(atomic64_fetch_add)
+ATOMIC64_FETCH_OPS(atomic64_fetch_and)
+ATOMIC64_FETCH_OPS(atomic64_fetch_sub)
+ATOMIC64_FETCH_OPS(atomic64_add_return)
+ATOMIC64_FETCH_OPS(atomic64_sub_return)
+
+
+static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
+{
+	return __lse_ll_sc_body(atomic64_dec_if_positive, v);
+}
+
+#define __CMPXCHG_CASE(name, sz)			\
+static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,	\
+					      u##sz old,		\
+					      u##sz new)		\
+{									\
+	return __lse_ll_sc_body(_cmpxchg_case_##name##sz,		\
+				ptr, old, new);				\
+}
+
+__CMPXCHG_CASE(    ,  8)
+__CMPXCHG_CASE(    , 16)
+__CMPXCHG_CASE(    , 32)
+__CMPXCHG_CASE(    , 64)
+__CMPXCHG_CASE(acq_,  8)
+__CMPXCHG_CASE(acq_, 16)
+__CMPXCHG_CASE(acq_, 32)
+__CMPXCHG_CASE(acq_, 64)
+__CMPXCHG_CASE(rel_,  8)
+__CMPXCHG_CASE(rel_, 16)
+__CMPXCHG_CASE(rel_, 32)
+__CMPXCHG_CASE(rel_, 64)
+__CMPXCHG_CASE(mb_,  8)
+__CMPXCHG_CASE(mb_, 16)
+__CMPXCHG_CASE(mb_, 32)
+__CMPXCHG_CASE(mb_, 64)
+
+
+#define __CMPXCHG_DBL(name)						\
+static inline long __cmpxchg_double##name(unsigned long old1,		\
+					 unsigned long old2,		\
+					 unsigned long new1,		\
+					 unsigned long new2,		\
+					 volatile void *ptr)		\
+{									\
+	return __lse_ll_sc_body(_cmpxchg_double##name, 			\
+				old1, old2, new1, new2, ptr);		\
+}
+
+__CMPXCHG_DBL(   )
+__CMPXCHG_DBL(_mb)
+
+#endif	/* __ASM_ATOMIC_ARCH_H */
diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index c2ce0c75fc0b..e802ba2d6d49 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -21,25 +21,15 @@
 #ifndef __ASM_ATOMIC_LL_SC_H
 #define __ASM_ATOMIC_LL_SC_H
 
-#ifndef __ARM64_IN_ATOMIC_IMPL
-#error "please don't include this file directly"
-#endif
-
 /*
  * AArch64 UP and SMP safe atomic ops.  We use load exclusive and
  * store exclusive to ensure that these are atomic.  We may loop
  * to ensure that the update happens.
- *
- * NOTE: these functions do *not* follow the PCS and must explicitly
- * save any clobbered registers other than x0 (regardless of return
- * value).  This is achieved through -fcall-saved-* compiler flags for
- * this file, which unfortunately don't work on a per-function basis
- * (the optimize attribute silently ignores these options).
  */
 
 #define ATOMIC_OP(op, asm_op, constraint)				\
-__LL_SC_INLINE void							\
-__LL_SC_PREFIX(arch_atomic_##op(int i, atomic_t *v))			\
+static inline void							\
+__ll_sc_atomic_##op(int i, atomic_t *v)					\
 {									\
 	unsigned long tmp;						\
 	int result;							\
@@ -52,12 +42,11 @@ __LL_SC_PREFIX(arch_atomic_##op(int i, atomic_t *v))			\
 "	cbnz	%w1, 1b"						\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: #constraint "r" (i));						\
-}									\
-__LL_SC_EXPORT(arch_atomic_##op);
+}
 
 #define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
-__LL_SC_INLINE int							\
-__LL_SC_PREFIX(arch_atomic_##op##_return##name(int i, atomic_t *v))	\
+static inline int							\
+__ll_sc_atomic_##op##_return##name(int i, atomic_t *v)			\
 {									\
 	unsigned long tmp;						\
 	int result;							\
@@ -74,12 +63,11 @@ __LL_SC_PREFIX(arch_atomic_##op##_return##name(int i, atomic_t *v))	\
 	: cl);								\
 									\
 	return result;							\
-}									\
-__LL_SC_EXPORT(arch_atomic_##op##_return##name);
+}
 
-#define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)	\
-__LL_SC_INLINE int							\
-__LL_SC_PREFIX(arch_atomic_fetch_##op##name(int i, atomic_t *v))	\
+#define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint) \
+static inline int							\
+__ll_sc_atomic_fetch_##op##name(int i, atomic_t *v)			\
 {									\
 	unsigned long tmp;						\
 	int val, result;						\
@@ -96,8 +84,7 @@ __LL_SC_PREFIX(arch_atomic_fetch_##op##name(int i, atomic_t *v))	\
 	: cl);								\
 									\
 	return result;							\
-}									\
-__LL_SC_EXPORT(arch_atomic_fetch_##op##name);
+}
 
 #define ATOMIC_OPS(...)							\
 	ATOMIC_OP(__VA_ARGS__)						\
@@ -132,8 +119,8 @@ ATOMIC_OPS(xor, eor, K)
 #undef ATOMIC_OP
 
 #define ATOMIC64_OP(op, asm_op, constraint)				\
-__LL_SC_INLINE void							\
-__LL_SC_PREFIX(arch_atomic64_##op(long i, atomic64_t *v))		\
+static inline void							\
+__ll_sc_atomic64_##op(long i, atomic64_t *v)				\
 {									\
 	long result;							\
 	unsigned long tmp;						\
@@ -146,12 +133,11 @@ __LL_SC_PREFIX(arch_atomic64_##op(long i, atomic64_t *v))		\
 "	cbnz	%w1, 1b"						\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: #constraint "r" (i));						\
-}									\
-__LL_SC_EXPORT(arch_atomic64_##op);
+}
 
 #define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
-__LL_SC_INLINE long							\
-__LL_SC_PREFIX(arch_atomic64_##op##_return##name(long i, atomic64_t *v))\
+static inline long							\
+__ll_sc_atomic64_##op##_return##name(long i, atomic64_t *v)		\
 {									\
 	long result;							\
 	unsigned long tmp;						\
@@ -168,12 +154,11 @@ __LL_SC_PREFIX(arch_atomic64_##op##_return##name(long i, atomic64_t *v))\
 	: cl);								\
 									\
 	return result;							\
-}									\
-__LL_SC_EXPORT(arch_atomic64_##op##_return##name);
+}
 
 #define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)\
-__LL_SC_INLINE long							\
-__LL_SC_PREFIX(arch_atomic64_fetch_##op##name(long i, atomic64_t *v))	\
+static inline long							\
+__ll_sc_atomic64_fetch_##op##name(long i, atomic64_t *v)		\
 {									\
 	long result, val;						\
 	unsigned long tmp;						\
@@ -190,8 +175,7 @@ __LL_SC_PREFIX(arch_atomic64_fetch_##op##name(long i, atomic64_t *v))	\
 	: cl);								\
 									\
 	return result;							\
-}									\
-__LL_SC_EXPORT(arch_atomic64_fetch_##op##name);
+}
 
 #define ATOMIC64_OPS(...)						\
 	ATOMIC64_OP(__VA_ARGS__)					\
@@ -225,8 +209,8 @@ ATOMIC64_OPS(xor, eor, K)
 #undef ATOMIC64_OP_RETURN
 #undef ATOMIC64_OP
 
-__LL_SC_INLINE long
-__LL_SC_PREFIX(arch_atomic64_dec_if_positive(atomic64_t *v))
+static inline long
+__ll_sc_atomic64_dec_if_positive(atomic64_t *v)
 {
 	long result;
 	unsigned long tmp;
@@ -246,13 +230,12 @@ __LL_SC_PREFIX(arch_atomic64_dec_if_positive(atomic64_t *v))
 
 	return result;
 }
-__LL_SC_EXPORT(arch_atomic64_dec_if_positive);
 
 #define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl, constraint)	\
-__LL_SC_INLINE u##sz							\
-__LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,		\
+static inline u##sz							\
+__ll_sc__cmpxchg_case_##name##sz(volatile void *ptr,			\
 					 unsigned long old,		\
-					 u##sz new))			\
+					 u##sz new)			\
 {									\
 	unsigned long tmp;						\
 	u##sz oldval;							\
@@ -280,8 +263,7 @@ __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,		\
 	: cl);								\
 									\
 	return oldval;							\
-}									\
-__LL_SC_EXPORT(__cmpxchg_case_##name##sz);
+}
 
 /*
  * Earlier versions of GCC (no later than 8.1.0) appear to incorrectly
@@ -308,12 +290,12 @@ __CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,  , l, "memory", L)
 #undef __CMPXCHG_CASE
 
 #define __CMPXCHG_DBL(name, mb, rel, cl)				\
-__LL_SC_INLINE long							\
-__LL_SC_PREFIX(__cmpxchg_double##name(unsigned long old1,		\
+static inline long							\
+__ll_sc__cmpxchg_double##name(unsigned long old1,			\
 				      unsigned long old2,		\
 				      unsigned long new1,		\
 				      unsigned long new2,		\
-				      volatile void *ptr))		\
+				      volatile void *ptr)		\
 {									\
 	unsigned long tmp, ret;						\
 									\
@@ -333,8 +315,7 @@ __LL_SC_PREFIX(__cmpxchg_double##name(unsigned long old1,		\
 	: cl);								\
 									\
 	return ret;							\
-}									\
-__LL_SC_EXPORT(__cmpxchg_double##name);
+}
 
 __CMPXCHG_DBL(   ,        ,  ,         )
 __CMPXCHG_DBL(_mb, dmb ish, l, "memory")
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 9256a3921e4b..3b7fd01104bb 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -21,22 +21,13 @@
 #ifndef __ASM_ATOMIC_LSE_H
 #define __ASM_ATOMIC_LSE_H
 
-#ifndef __ARM64_IN_ATOMIC_IMPL
-#error "please don't include this file directly"
-#endif
-
-#define __LL_SC_ATOMIC(op)	__LL_SC_CALL(arch_atomic_##op)
 #define ATOMIC_OP(op, asm_op)						\
-static inline void arch_atomic_##op(int i, atomic_t *v)			\
+static inline void __lse_atomic_##op(int i, atomic_t *v)			\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(__LL_SC_ATOMIC(op),		\
-"	" #asm_op "	%w[i], %[v]\n")					\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS);						\
+	asm volatile(							\
+"	" #asm_op "	%w[i], %[v]\n"					\
+	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v));							\
 }
 
 ATOMIC_OP(andnot, stclr)
@@ -47,21 +38,15 @@ ATOMIC_OP(add, stadd)
 #undef ATOMIC_OP
 
 #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)			\
-static inline int arch_atomic_fetch_##op##name(int i, atomic_t *v)	\
+static inline int __lse_atomic_fetch_##op##name(int i, atomic_t *v)	\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC(fetch_##op##name),				\
-	/* LSE atomics */						\
-"	" #asm_op #mb "	%w[i], %w[i], %[v]")				\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	asm volatile(							\
+"	" #asm_op #mb "	%w[i], %w[i], %[v]"				\
+	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: cl);								\
 									\
-	return w0;							\
+	return i;							\
 }
 
 #define ATOMIC_FETCH_OPS(op, asm_op)					\
@@ -79,23 +64,16 @@ ATOMIC_FETCH_OPS(add, ldadd)
 #undef ATOMIC_FETCH_OPS
 
 #define ATOMIC_OP_ADD_RETURN(name, mb, cl...)				\
-static inline int arch_atomic_add_return##name(int i, atomic_t *v)	\
+static inline int __lse_atomic_add_return##name(int i, atomic_t *v)	\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC(add_return##name)				\
-	__nops(1),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	ldadd" #mb "	%w[i], w30, %[v]\n"			\
-	"	add	%w[i], %w[i], w30")				\
-	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	add	%w[i], %w[i], w30"				\
+	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: "x30", ##cl);							\
 									\
-	return w0;							\
+	return i;							\
 }
 
 ATOMIC_OP_ADD_RETURN(_relaxed,   )
@@ -105,41 +83,26 @@ ATOMIC_OP_ADD_RETURN(        , al, "memory")
 
 #undef ATOMIC_OP_ADD_RETURN
 
-static inline void arch_atomic_and(int i, atomic_t *v)
+static inline void __lse_atomic_and(int i, atomic_t *v)
 {
-	register int w0 asm ("w0") = i;
-	register atomic_t *x1 asm ("x1") = v;
-
-	asm volatile(ARM64_LSE_ATOMIC_INSN(
-	/* LL/SC */
-	__LL_SC_ATOMIC(and)
-	__nops(1),
-	/* LSE atomics */
+	asm volatile(
 	"	mvn	%w[i], %w[i]\n"
-	"	stclr	%w[i], %[v]")
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	stclr	%w[i], %[v]"
+	: [i] "+&r" (i), [v] "+Q" (v->counter)
+	: "r" (v));
 }
 
 #define ATOMIC_FETCH_OP_AND(name, mb, cl...)				\
-static inline int arch_atomic_fetch_and##name(int i, atomic_t *v)	\
+static inline int __lse_atomic_fetch_and##name(int i, atomic_t *v)	\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC(fetch_and##name)					\
-	__nops(1),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	mvn	%w[i], %w[i]\n"					\
-	"	ldclr" #mb "	%w[i], %w[i], %[v]")			\
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	ldclr" #mb "	%w[i], %w[i], %[v]"			\
+	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: cl);								\
 									\
-	return w0;							\
+	return i;							\
 }
 
 ATOMIC_FETCH_OP_AND(_relaxed,   )
@@ -149,42 +112,27 @@ ATOMIC_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC_FETCH_OP_AND
 
-static inline void arch_atomic_sub(int i, atomic_t *v)
+static inline void __lse_atomic_sub(int i, atomic_t *v)
 {
-	register int w0 asm ("w0") = i;
-	register atomic_t *x1 asm ("x1") = v;
-
-	asm volatile(ARM64_LSE_ATOMIC_INSN(
-	/* LL/SC */
-	__LL_SC_ATOMIC(sub)
-	__nops(1),
-	/* LSE atomics */
+	asm volatile(
 	"	neg	%w[i], %w[i]\n"
-	"	stadd	%w[i], %[v]")
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	stadd	%w[i], %[v]"
+	: [i] "+&r" (i), [v] "+Q" (v->counter)
+	: "r" (v));
 }
 
 #define ATOMIC_OP_SUB_RETURN(name, mb, cl...)				\
-static inline int arch_atomic_sub_return##name(int i, atomic_t *v)	\
+static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC(sub_return##name)				\
-	__nops(2),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	neg	%w[i], %w[i]\n"					\
 	"	ldadd" #mb "	%w[i], w30, %[v]\n"			\
-	"	add	%w[i], %w[i], w30")				\
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS , ##cl);					\
+	"	add	%w[i], %w[i], w30"				\
+	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: "x30", ##cl);							\
 									\
-	return w0;							\
+	return i;							\
 }
 
 ATOMIC_OP_SUB_RETURN(_relaxed,   )
@@ -195,23 +143,16 @@ ATOMIC_OP_SUB_RETURN(        , al, "memory")
 #undef ATOMIC_OP_SUB_RETURN
 
 #define ATOMIC_FETCH_OP_SUB(name, mb, cl...)				\
-static inline int arch_atomic_fetch_sub##name(int i, atomic_t *v)	\
+static inline int __lse_atomic_fetch_sub##name(int i, atomic_t *v)	\
 {									\
-	register int w0 asm ("w0") = i;					\
-	register atomic_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC(fetch_sub##name)					\
-	__nops(1),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], %w[i], %[v]")			\
-	: [i] "+&r" (w0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	ldadd" #mb "	%w[i], %w[i], %[v]"			\
+	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: cl);								\
 									\
-	return w0;							\
+	return i;							\
 }
 
 ATOMIC_FETCH_OP_SUB(_relaxed,   )
@@ -220,20 +161,14 @@ ATOMIC_FETCH_OP_SUB(_release,  l, "memory")
 ATOMIC_FETCH_OP_SUB(        , al, "memory")
 
 #undef ATOMIC_FETCH_OP_SUB
-#undef __LL_SC_ATOMIC
 
-#define __LL_SC_ATOMIC64(op)	__LL_SC_CALL(arch_atomic64_##op)
 #define ATOMIC64_OP(op, asm_op)						\
-static inline void arch_atomic64_##op(long i, atomic64_t *v)		\
+static inline void __lse_atomic64_##op(long i, atomic64_t *v)		\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(__LL_SC_ATOMIC64(op),	\
-"	" #asm_op "	%[i], %[v]\n")					\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS);						\
+	asm volatile(							\
+"	" #asm_op "	%[i], %[v]\n"					\
+	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v));							\
 }
 
 ATOMIC64_OP(andnot, stclr)
@@ -244,21 +179,15 @@ ATOMIC64_OP(add, stadd)
 #undef ATOMIC64_OP
 
 #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)			\
-static inline long arch_atomic64_fetch_##op##name(long i, atomic64_t *v)\
+static inline long __lse_atomic64_fetch_##op##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC64(fetch_##op##name),				\
-	/* LSE atomics */						\
-"	" #asm_op #mb "	%[i], %[i], %[v]")				\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	asm volatile(							\
+"	" #asm_op #mb "	%[i], %[i], %[v]"				\
+	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: cl);								\
 									\
-	return x0;							\
+	return i;							\
 }
 
 #define ATOMIC64_FETCH_OPS(op, asm_op)					\
@@ -276,23 +205,16 @@ ATOMIC64_FETCH_OPS(add, ldadd)
 #undef ATOMIC64_FETCH_OPS
 
 #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)				\
-static inline long arch_atomic64_add_return##name(long i, atomic64_t *v)\
+static inline long __lse_atomic64_add_return##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC64(add_return##name)				\
-	__nops(1),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	ldadd" #mb "	%[i], x30, %[v]\n"			\
-	"	add	%[i], %[i], x30")				\
-	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	add	%[i], %[i], x30"				\
+	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: "x30", ##cl);							\
 									\
-	return x0;							\
+	return i;							\
 }
 
 ATOMIC64_OP_ADD_RETURN(_relaxed,   )
@@ -302,41 +224,26 @@ ATOMIC64_OP_ADD_RETURN(        , al, "memory")
 
 #undef ATOMIC64_OP_ADD_RETURN
 
-static inline void arch_atomic64_and(long i, atomic64_t *v)
+static inline void __lse_atomic64_and(long i, atomic64_t *v)
 {
-	register long x0 asm ("x0") = i;
-	register atomic64_t *x1 asm ("x1") = v;
-
-	asm volatile(ARM64_LSE_ATOMIC_INSN(
-	/* LL/SC */
-	__LL_SC_ATOMIC64(and)
-	__nops(1),
-	/* LSE atomics */
+	asm volatile(
 	"	mvn	%[i], %[i]\n"
-	"	stclr	%[i], %[v]")
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	stclr	%[i], %[v]"
+	: [i] "+&r" (i), [v] "+Q" (v->counter)
+	: "r" (v));
 }
 
 #define ATOMIC64_FETCH_OP_AND(name, mb, cl...)				\
-static inline long arch_atomic64_fetch_and##name(long i, atomic64_t *v)	\
+static inline long __lse_atomic64_fetch_and##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC64(fetch_and##name)				\
-	__nops(1),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	mvn	%[i], %[i]\n"					\
-	"	ldclr" #mb "	%[i], %[i], %[v]")			\
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	ldclr" #mb "	%[i], %[i], %[v]"			\
+	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: cl);								\
 									\
-	return x0;							\
+	return i;							\
 }
 
 ATOMIC64_FETCH_OP_AND(_relaxed,   )
@@ -346,42 +253,27 @@ ATOMIC64_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC64_FETCH_OP_AND
 
-static inline void arch_atomic64_sub(long i, atomic64_t *v)
+static inline void __lse_atomic64_sub(long i, atomic64_t *v)
 {
-	register long x0 asm ("x0") = i;
-	register atomic64_t *x1 asm ("x1") = v;
-
-	asm volatile(ARM64_LSE_ATOMIC_INSN(
-	/* LL/SC */
-	__LL_SC_ATOMIC64(sub)
-	__nops(1),
-	/* LSE atomics */
+	asm volatile(
 	"	neg	%[i], %[i]\n"
-	"	stadd	%[i], %[v]")
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)
-	: "r" (x1)
-	: __LL_SC_CLOBBERS);
+	"	stadd	%[i], %[v]"
+	: [i] "+&r" (i), [v] "+Q" (v->counter)
+	: "r" (v));
 }
 
 #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)				\
-static inline long arch_atomic64_sub_return##name(long i, atomic64_t *v)\
+static inline long __lse_atomic64_sub_return##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC64(sub_return##name)				\
-	__nops(2),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	neg	%[i], %[i]\n"					\
 	"	ldadd" #mb "	%[i], x30, %[v]\n"			\
-	"	add	%[i], %[i], x30")				\
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	add	%[i], %[i], x30"				\
+	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: "x30", ##cl);							\
 									\
-	return x0;							\
+	return i;							\
 }
 
 ATOMIC64_OP_SUB_RETURN(_relaxed,   )
@@ -392,23 +284,16 @@ ATOMIC64_OP_SUB_RETURN(        , al, "memory")
 #undef ATOMIC64_OP_SUB_RETURN
 
 #define ATOMIC64_FETCH_OP_SUB(name, mb, cl...)				\
-static inline long arch_atomic64_fetch_sub##name(long i, atomic64_t *v)	\
+static inline long __lse_atomic64_fetch_sub##name(long i, atomic64_t *v)	\
 {									\
-	register long x0 asm ("x0") = i;				\
-	register atomic64_t *x1 asm ("x1") = v;				\
-									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_ATOMIC64(fetch_sub##name)				\
-	__nops(1),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], %[i], %[v]")			\
-	: [i] "+&r" (x0), [v] "+Q" (v->counter)				\
-	: "r" (x1)							\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	"	ldadd" #mb "	%[i], %[i], %[v]"			\
+	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	: "r" (v)							\
+	: cl);								\
 									\
-	return x0;							\
+	return i;							\
 }
 
 ATOMIC64_FETCH_OP_SUB(_relaxed,   )
@@ -418,15 +303,9 @@ ATOMIC64_FETCH_OP_SUB(        , al, "memory")
 
 #undef ATOMIC64_FETCH_OP_SUB
 
-static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
+static inline long __lse_atomic64_dec_if_positive(atomic64_t *v)
 {
-	register long x0 asm ("x0") = (long)v;
-
-	asm volatile(ARM64_LSE_ATOMIC_INSN(
-	/* LL/SC */
-	__LL_SC_ATOMIC64(dec_if_positive)
-	__nops(6),
-	/* LSE atomics */
+	asm volatile(
 	"1:	ldr	x30, %[v]\n"
 	"	subs	%[ret], x30, #1\n"
 	"	b.lt	2f\n"
@@ -434,20 +313,16 @@ static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
 	"	sub	x30, x30, #1\n"
 	"	sub	x30, x30, %[ret]\n"
 	"	cbnz	x30, 1b\n"
-	"2:")
-	: [ret] "+&r" (x0), [v] "+Q" (v->counter)
+	"2:"
+	: [ret] "+&r" (v), [v] "+Q" (v->counter)
 	:
-	: __LL_SC_CLOBBERS, "cc", "memory");
+	: "x30", "cc", "memory");
 
-	return x0;
+	return (long)v;
 }
 
-#undef __LL_SC_ATOMIC64
-
-#define __LL_SC_CMPXCHG(op)	__LL_SC_CALL(__cmpxchg_case_##op)
-
 #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)			\
-static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,	\
+static inline u##sz __lse__cmpxchg_case_##name##sz(volatile void *ptr,	\
 					      u##sz old,		\
 					      u##sz new)		\
 {									\
@@ -455,17 +330,13 @@ static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,	\
 	register u##sz x1 asm ("x1") = old;				\
 	register u##sz x2 asm ("x2") = new;				\
 									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_CMPXCHG(name##sz)					\
-	__nops(2),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	mov	" #w "30, %" #w "[old]\n"			\
 	"	cas" #mb #sfx "\t" #w "30, %" #w "[new], %[v]\n"	\
-	"	mov	%" #w "[ret], " #w "30")			\
+	"	mov	%" #w "[ret], " #w "30"				\
 	: [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr)		\
 	: [old] "r" (x1), [new] "r" (x2)				\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	: "x30", ##cl);							\
 									\
 	return x0;							\
 }
@@ -487,13 +358,10 @@ __CMPXCHG_CASE(w, h,  mb_, 16, al, "memory")
 __CMPXCHG_CASE(w,  ,  mb_, 32, al, "memory")
 __CMPXCHG_CASE(x,  ,  mb_, 64, al, "memory")
 
-#undef __LL_SC_CMPXCHG
 #undef __CMPXCHG_CASE
 
-#define __LL_SC_CMPXCHG_DBL(op)	__LL_SC_CALL(__cmpxchg_double##op)
-
 #define __CMPXCHG_DBL(name, mb, cl...)					\
-static inline long __cmpxchg_double##name(unsigned long old1,		\
+static inline long __lse__cmpxchg_double##name(unsigned long old1,	\
 					 unsigned long old2,		\
 					 unsigned long new1,		\
 					 unsigned long new2,		\
@@ -507,20 +375,16 @@ static inline long __cmpxchg_double##name(unsigned long old1,		\
 	register unsigned long x3 asm ("x3") = new2;			\
 	register unsigned long x4 asm ("x4") = (unsigned long)ptr;	\
 									\
-	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
-	/* LL/SC */							\
-	__LL_SC_CMPXCHG_DBL(name)					\
-	__nops(3),							\
-	/* LSE atomics */						\
+	asm volatile(							\
 	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
 	"	eor	%[old1], %[old1], %[oldval1]\n"			\
 	"	eor	%[old2], %[old2], %[oldval2]\n"			\
-	"	orr	%[old1], %[old1], %[old2]")			\
+	"	orr	%[old1], %[old1], %[old2]"			\
 	: [old1] "+&r" (x0), [old2] "+&r" (x1),				\
 	  [v] "+Q" (*(unsigned long *)ptr)				\
 	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
 	  [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)		\
-	: __LL_SC_CLOBBERS, ##cl);					\
+	: cl);								\
 									\
 	return x0;							\
 }
@@ -528,7 +392,6 @@ static inline long __cmpxchg_double##name(unsigned long old1,		\
 __CMPXCHG_DBL(   ,   )
 __CMPXCHG_DBL(_mb, al, "memory")
 
-#undef __LL_SC_CMPXCHG_DBL
 #undef __CMPXCHG_DBL
 
 #endif	/* __ASM_ATOMIC_LSE_H */
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index e6ea0f42e097..a111b8244bc9 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -21,7 +21,7 @@
 #include <linux/build_bug.h>
 #include <linux/compiler.h>
 
-#include <asm/atomic.h>
+#include <asm/atomic_arch.h>
 #include <asm/barrier.h>
 #include <asm/lse.h>
 
diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index 8262325e2fc6..52b80846d1b7 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -22,14 +22,6 @@
 
 __asm__(".arch_extension	lse");
 
-/* Move the ll/sc atomics out-of-line */
-#define __LL_SC_INLINE		notrace
-#define __LL_SC_PREFIX(x)	__ll_sc_##x
-#define __LL_SC_EXPORT(x)	EXPORT_SYMBOL(__LL_SC_PREFIX(x))
-
-/* Macro for constructing calls to out-of-line ll/sc atomics */
-#define __LL_SC_CALL(op)	"bl\t" __stringify(__LL_SC_PREFIX(op)) "\n"
-#define __LL_SC_CLOBBERS	"x16", "x17", "x30"
 
 /* In-line patching at runtime */
 #define ARM64_LSE_ATOMIC_INSN(llsc, lse)				\
@@ -46,9 +38,6 @@ __asm__(".arch_extension	lse");
 
 #else	/* __ASSEMBLER__ */
 
-#define __LL_SC_INLINE		static inline
-#define __LL_SC_PREFIX(x)	x
-#define __LL_SC_EXPORT(x)
 
 #define ARM64_LSE_ATOMIC_INSN(llsc, lse)	llsc
 
-- 
2.21.0



* [PATCH v1 4/5] arm64: avoid using hard-coded registers for LSE atomics
  2019-05-16 15:53 [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Andrew Murray
                   ` (2 preceding siblings ...)
  2019-05-16 15:53 ` [PATCH v1 3/5] arm64: atomics: avoid out-of-line ll/sc atomics Andrew Murray
@ 2019-05-16 15:53 ` Andrew Murray
  2019-05-16 15:53 ` [PATCH v1 5/5] arm64: atomics: remove atomic_ll_sc compilation unit Andrew Murray
  2019-05-17  7:24 ` [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Peter Zijlstra
  5 siblings, 0 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-16 15:53 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ard.Biesheuvel
  Cc: Boqun Feng, linux-arm-kernel

Now that we have removed the out-of-line ll/sc atomics, we can give
the compiler the freedom to choose its own register allocation. Let's
remove the hard-coded use of x30.
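
The change boils down to turning the fixed temporary into a named output
operand, e.g. for the 32-bit add_return case (excerpted from the diff
below, macro line continuations dropped):

/* before: hard-coded w30, forcing an "x30" clobber */
	asm volatile(
	"	ldadd" #mb "	%w[i], w30, %[v]\n"
	"	add	%w[i], %w[i], w30"
	: [i] "+r" (i), [v] "+Q" (v->counter)
	: "r" (v)
	: "x30", ##cl);

/* after: compiler-allocated temporary */
	u32 tmp;

	asm volatile(
	"	ldadd" #mb "	%w[i], %w[tmp], %[v]\n"
	"	add	%w[i], %w[i], %w[tmp]"
	: [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)
	: "r" (v)
	: cl);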

Signed-off-by: Andrew Murray <andrew.murray@arm.com>
---
 arch/arm64/include/asm/atomic_lse.h | 70 +++++++++++++++++------------
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 3b7fd01104bb..580709ecae5a 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -66,12 +66,14 @@ ATOMIC_FETCH_OPS(add, ldadd)
 #define ATOMIC_OP_ADD_RETURN(name, mb, cl...)				\
 static inline int __lse_atomic_add_return##name(int i, atomic_t *v)	\
 {									\
+	u32 tmp;							\
+									\
 	asm volatile(							\
-	"	ldadd" #mb "	%w[i], w30, %[v]\n"			\
-	"	add	%w[i], %w[i], w30"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	"	ldadd" #mb "	%w[i], %w[tmp], %[v]\n"			\
+	"	add	%w[i], %w[i], %w[tmp]"				\
+	: [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
 	: "r" (v)							\
-	: "x30", ##cl);							\
+	: cl);								\
 									\
 	return i;							\
 }
@@ -124,13 +126,15 @@ static inline void __lse_atomic_sub(int i, atomic_t *v)
 #define ATOMIC_OP_SUB_RETURN(name, mb, cl...)				\
 static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
 {									\
+	u32 tmp;							\
+									\
 	asm volatile(							\
 	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], w30, %[v]\n"			\
-	"	add	%w[i], %w[i], w30"				\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	"	ldadd" #mb "	%w[i], %w[tmp], %[v]\n"			\
+	"	add	%w[i], %w[i], %w[tmp]"				\
+	: [i] "+&r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
 	: "r" (v)							\
-	: "x30", ##cl);							\
+	: cl);							\
 									\
 	return i;							\
 }
@@ -207,12 +211,14 @@ ATOMIC64_FETCH_OPS(add, ldadd)
 #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)				\
 static inline long __lse_atomic64_add_return##name(long i, atomic64_t *v)	\
 {									\
+	unsigned long tmp;						\
+									\
 	asm volatile(							\
-	"	ldadd" #mb "	%[i], x30, %[v]\n"			\
-	"	add	%[i], %[i], x30"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
+	"	ldadd" #mb "	%[i], %x[tmp], %[v]\n"			\
+	"	add	%[i], %[i], %x[tmp]"				\
+	: [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
 	: "r" (v)							\
-	: "x30", ##cl);							\
+	: cl);								\
 									\
 	return i;							\
 }
@@ -265,13 +271,15 @@ static inline void __lse_atomic64_sub(long i, atomic64_t *v)
 #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)				\
 static inline long __lse_atomic64_sub_return##name(long i, atomic64_t *v)	\
 {									\
+	unsigned long tmp;						\
+									\
 	asm volatile(							\
 	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], x30, %[v]\n"			\
-	"	add	%[i], %[i], x30"				\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
+	"	ldadd" #mb "	%[i], %x[tmp], %[v]\n"			\
+	"	add	%[i], %[i], %x[tmp]"				\
+	: [i] "+&r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
 	: "r" (v)							\
-	: "x30", ##cl);							\
+	: cl);								\
 									\
 	return i;							\
 }
@@ -305,18 +313,20 @@ ATOMIC64_FETCH_OP_SUB(        , al, "memory")
 
 static inline long __lse_atomic64_dec_if_positive(atomic64_t *v)
 {
+	unsigned long tmp;
+
 	asm volatile(
-	"1:	ldr	x30, %[v]\n"
-	"	subs	%[ret], x30, #1\n"
+	"1:	ldr	%x[tmp], %[v]\n"
+	"	subs	%[ret], %x[tmp], #1\n"
 	"	b.lt	2f\n"
-	"	casal	x30, %[ret], %[v]\n"
-	"	sub	x30, x30, #1\n"
-	"	sub	x30, x30, %[ret]\n"
-	"	cbnz	x30, 1b\n"
+	"	casal	%x[tmp], %[ret], %[v]\n"
+	"	sub	%x[tmp], %x[tmp], #1\n"
+	"	sub	%x[tmp], %x[tmp], %[ret]\n"
+	"	cbnz	%x[tmp], 1b\n"
 	"2:"
-	: [ret] "+&r" (v), [v] "+Q" (v->counter)
+	: [ret] "+&r" (v), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)
 	:
-	: "x30", "cc", "memory");
+	: "cc", "memory");
 
 	return (long)v;
 }
@@ -329,14 +339,16 @@ static inline u##sz __lse__cmpxchg_case_##name##sz(volatile void *ptr,	\
 	register unsigned long x0 asm ("x0") = (unsigned long)ptr;	\
 	register u##sz x1 asm ("x1") = old;				\
 	register u##sz x2 asm ("x2") = new;				\
+	unsigned long tmp;						\
 									\
 	asm volatile(							\
-	"	mov	" #w "30, %" #w "[old]\n"			\
-	"	cas" #mb #sfx "\t" #w "30, %" #w "[new], %[v]\n"	\
-	"	mov	%" #w "[ret], " #w "30"				\
-	: [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr)		\
+	"	mov	%" #w "[tmp], %" #w "[old]\n"			\
+	"	cas" #mb #sfx "\t%" #w "[tmp], %" #w "[new], %[v]\n"	\
+	"	mov	%" #w "[ret], %" #w "[tmp]"			\
+	: [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr),		\
+	  [tmp] "=&r" (tmp)						\
 	: [old] "r" (x1), [new] "r" (x2)				\
-	: "x30", ##cl);							\
+	: cl);								\
 									\
 	return x0;							\
 }
-- 
2.21.0



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH v1 5/5] arm64: atomics: remove atomic_ll_sc compilation unit
  2019-05-16 15:53 [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Andrew Murray
                   ` (3 preceding siblings ...)
  2019-05-16 15:53 ` [PATCH v1 4/5] arm64: avoid using hard-coded registers for LSE atomics Andrew Murray
@ 2019-05-16 15:53 ` Andrew Murray
  2019-05-17  7:24 ` [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Peter Zijlstra
  5 siblings, 0 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-16 15:53 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Peter Zijlstra, Ard.Biesheuvel
  Cc: Boqun Feng, linux-arm-kernel

We no longer fall back to out-of-line atomics on systems with
CONFIG_ARM64_LSE_ATOMICS where ARM64_HAS_LSE_ATOMICS is not set. Let's
remove the now unused compilation unit which provided these symbols.

Signed-off-by: Andrew Murray <andrew.murray@arm.com>
---
 arch/arm64/lib/Makefile       | 19 -------------------
 arch/arm64/lib/atomic_ll_sc.c |  3 ---
 2 files changed, 22 deletions(-)
 delete mode 100644 arch/arm64/lib/atomic_ll_sc.c

diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a1638baf..f10809ef1690 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -11,25 +11,6 @@ CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
 CFLAGS_xor-neon.o		+= -ffreestanding
 endif
 
-# Tell the compiler to treat all general purpose registers (with the
-# exception of the IP registers, which are already handled by the caller
-# in case of a PLT) as callee-saved, which allows for efficient runtime
-# patching of the bl instruction in the caller with an atomic instruction
-# when supported by the CPU. Result and argument registers are handled
-# correctly, based on the function prototype.
-lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o
-CFLAGS_atomic_ll_sc.o	:= -ffixed-x1 -ffixed-x2        		\
-		   -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6		\
-		   -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9		\
-		   -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12	\
-		   -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15	\
-		   -fcall-saved-x18 -fomit-frame-pointer
-CFLAGS_REMOVE_atomic_ll_sc.o := -pg
-GCOV_PROFILE_atomic_ll_sc.o	:= n
-KASAN_SANITIZE_atomic_ll_sc.o	:= n
-KCOV_INSTRUMENT_atomic_ll_sc.o	:= n
-UBSAN_SANITIZE_atomic_ll_sc.o	:= n
-
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 
 obj-$(CONFIG_CRC32) += crc32.o
diff --git a/arch/arm64/lib/atomic_ll_sc.c b/arch/arm64/lib/atomic_ll_sc.c
deleted file mode 100644
index b0c538b0da28..000000000000
--- a/arch/arm64/lib/atomic_ll_sc.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <asm/atomic.h>
-#define __ARM64_IN_ATOMIC_IMPL
-#include <asm/atomic_ll_sc.h>
-- 
2.21.0



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-16 15:53 [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Andrew Murray
                   ` (4 preceding siblings ...)
  2019-05-16 15:53 ` [PATCH v1 5/5] arm64: atomics: remove atomic_ll_sc compilation unit Andrew Murray
@ 2019-05-17  7:24 ` Peter Zijlstra
  2019-05-17 10:08   ` Andrew Murray
  5 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2019-05-17  7:24 UTC (permalink / raw)
  To: Andrew Murray
  Cc: Catalin Marinas, Boqun Feng, Will Deacon, linux-arm-kernel,
	Ard.Biesheuvel

On Thu, May 16, 2019 at 04:53:39PM +0100, Andrew Murray wrote:
> When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
> or toolchain doesn't support it the existing code will fallback to ll/sc
> atomics. It achieves this by branching from inline assembly to a function
> that is built with specical compile flags. Further this results in the
> clobbering of registers even when the fallback isn't used increasing
> register pressure.
> 
> Let's improve this by providing inline implementatins of both LSE and
> ll/sc and use a static key to select between them. This allows for the
> compiler to generate better atomics code.

Don't you guys have alternatives? That would avoid having both versions
in the code, and thus significantly cuts back on the bloat.

> These changes add a small amount of bloat on defconfig according to
> bloat-o-meter:
> 
> text:
>   add/remove: 1/108 grow/shrink: 3448/20 up/down: 272768/-4320 (268448)
>   Total: Before=12363112, After=12631560, chg +2.17%

I'd say 2% is quite significant bloat.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-17  7:24 ` [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Peter Zijlstra
@ 2019-05-17 10:08   ` Andrew Murray
  2019-05-17 10:29     ` Ard Biesheuvel
  2019-05-17 12:05     ` Peter Zijlstra
  0 siblings, 2 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-17 10:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Catalin Marinas, Boqun Feng, Will Deacon, linux-arm-kernel,
	Ard.Biesheuvel

On Fri, May 17, 2019 at 09:24:01AM +0200, Peter Zijlstra wrote:
> On Thu, May 16, 2019 at 04:53:39PM +0100, Andrew Murray wrote:
> > When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
> > or toolchain doesn't support it the existing code will fallback to ll/sc
> > atomics. It achieves this by branching from inline assembly to a function
> > that is built with specical compile flags. Further this results in the
> > clobbering of registers even when the fallback isn't used increasing
> > register pressure.
> > 
> > Let's improve this by providing inline implementatins of both LSE and
> > ll/sc and use a static key to select between them. This allows for the
> > compiler to generate better atomics code.
> 
> Don't you guys have alternatives? That would avoid having both versions
> in the code, and thus significantly cuts back on the bloat.

Yes we do.

Prior to patch 3 of this series, the ARM64_LSE_ATOMIC_INSN macro used
ALTERNATIVE to either bl to a fallback ll/sc function (and nops) - or execute
some LSE instructions.

But this approach limits the compiler's ability to optimise the code due to
the asm clobber list being the superset of both ll/sc and LSE - and the gcc
compiler flags used on the ll/sc functions.

I think the alternative solution (excuse the pun) that you are suggesting
is to put the body of the ll/sc or LSE code in the ALTERNATIVE oldinstr/newinstr
blocks (i.e. drop the fallback branches). However this still gives us some
bloat (but less than my current solution) because we're now inlining the
larger fallback ll/sc whereas previously they were non-inlined functions. We
still end up with potentially unnecessary clobbers for LSE code with this
approach.

Approach prior to this series:

   BL 1 or NOP <- single alternative instruction
   LSE
   LSE
   ...

1: LL/SC <- LL/SC fallback not inlined so reused
   LL/SC
   LL/SC
   LL/SC

Approach proposed by this series:

   BL 1 or NOP <- single alternative instruction
   LSE
   LSE
   BL 2
1: LL/SC <- inlined LL/SC and thus duplicated
   LL/SC
   LL/SC
   LL/SC
2: ..

Approach using alternative without braces:

   LSE
   LSE
   NOP
   NOP

or

   LL/SC <- inlined LL/SC and thus duplicated
   LL/SC
   LL/SC
   LL/SC

I guess there is a balance here between bloat and code optimisation.
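
For concreteness, here is roughly what that braceless-ALTERNATIVE variant
could look like for atomic_add_return(). This is only a sketch, not code
from this series: it assumes the ALTERNATIVE() macro from
<asm/alternative.h> and the ARM64_HAS_LSE_ATOMICS capability, and pads
the LSE side with nops because the arm64 alternatives framework requires
both sequences to have the same length:

static inline int atomic_add_return_alt_sketch(int i, atomic_t *v)
{
	unsigned long tmp;
	int result;

	asm volatile(ALTERNATIVE(
	/* default: LL/SC sequence */
	"1:	ldxr	%w[res], %[v]\n"
	"	add	%w[res], %w[res], %w[i]\n"
	"	stlxr	%w[tmp], %w[res], %[v]\n"
	"	cbnz	%w[tmp], 1b\n"
	"	dmb	ish",
	/* patched in when the CPU has LSE atomics */
	"	ldaddal	%w[i], %w[res], %[v]\n"
	"	add	%w[res], %w[res], %w[i]\n"
	"	nop\n"
	"	nop\n"
	"	nop",
	ARM64_HAS_LSE_ATOMICS)
	: [res] "=&r" (result), [tmp] "=&r" (tmp), [v] "+Q" (v->counter)
	: [i] "r" (i)
	: "memory");

	return result;
}

Note that the operand list still has to be the superset of what the two
sequences need - the [tmp] status register is dead on the LSE path -
which is exactly the unnecessary-clobber problem mentioned above.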

> 
> > These changes add a small amount of bloat on defconfig according to
> > bloat-o-meter:
> > 
> > text:
> >   add/remove: 1/108 grow/shrink: 3448/20 up/down: 272768/-4320 (268448)
> >   Total: Before=12363112, After=12631560, chg +2.17%
> 
> I'd say 2% is quite significant bloat.

Thanks,

Andrew Murray


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-17 10:08   ` Andrew Murray
@ 2019-05-17 10:29     ` Ard Biesheuvel
  2019-05-22 10:45       ` Andrew Murray
  2019-05-17 12:05     ` Peter Zijlstra
  1 sibling, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2019-05-17 10:29 UTC (permalink / raw)
  To: Andrew Murray
  Cc: Peter Zijlstra, Catalin Marinas, Boqun Feng, Will Deacon,
	Ard.Biesheuvel, linux-arm-kernel

On Fri, 17 May 2019 at 12:08, Andrew Murray <andrew.murray@arm.com> wrote:
>
> On Fri, May 17, 2019 at 09:24:01AM +0200, Peter Zijlstra wrote:
> > On Thu, May 16, 2019 at 04:53:39PM +0100, Andrew Murray wrote:
> > > When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
> > > or toolchain doesn't support it the existing code will fallback to ll/sc
> > > atomics. It achieves this by branching from inline assembly to a function
> > > that is built with specical compile flags. Further this results in the
> > > clobbering of registers even when the fallback isn't used increasing
> > > register pressure.
> > >
> > > Let's improve this by providing inline implementatins of both LSE and
> > > ll/sc and use a static key to select between them. This allows for the
> > > compiler to generate better atomics code.
> >
> > Don't you guys have alternatives? That would avoid having both versions
> > in the code, and thus significantly cuts back on the bloat.
>
> Yes we do.
>
> Prior to patch 3 of this series, the ARM64_LSE_ATOMIC_INSN macro used
> ALTERNATIVE to either bl to a fallback ll/sc function (and nops) - or execute
> some LSE instructions.
>
> But this approach limits the compilers ability to optimise the code due to
> the asm clobber list being the superset of both ll/sc and LSE - and the gcc
> compiler flags used on the ll/sc functions.
>
> I think the alternative solution (excuse the pun) that you are suggesting
> is to put the body of the ll/sc or LSE code in the ALTERNATIVE oldinstr/newinstr
> blocks (i.e. drop the fallback branches). However this still gives us some
> bloat (but less than my current solution) because we're still now inlining the
> larger fallback ll/sc whereas previously they were non-inline'd functions. We
> still end up with potentially unnecessary clobbers for LSE code with this
> approach.
>
> Approach prior to this series:
>
>    BL 1 or NOP <- single alternative instruction
>    LSE
>    LSE
>    ...
>
> 1: LL/SC <- LL/SC fallback not inlined so reused
>    LL/SC
>    LL/SC
>    LL/SC
>
> Approach proposed by this series:
>
>    BL 1 or NOP <- single alternative instruction
>    LSE
>    LSE
>    BL 2
> 1: LL/SC <- inlined LL/SC and thus duplicated
>    LL/SC
>    LL/SC
>    LL/SC
> 2: ..
>
> Approach using alternative without braces:
>
>    LSE
>    LSE
>    NOP
>    NOP
>
> or
>
>    LL/SC <- inlined LL/SC and thus duplicated
>    LL/SC
>    LL/SC
>    LL/SC
>
> I guess there is a balance here between bloat and code optimisation.
>


So there are two separate questions here:
1) whether or not we should merge the inline asm blocks so that the
compiler sees a single set of constraints and operands
2) whether the LL/SC sequence should be inlined and/or duplicated.

This approach appears to be based on the assumption that reserving one
or sometimes two additional registers for the LL/SC fallback has a
more severe impact on performance than the unconditional branch.
However, it seems to me that any call site that uses the atomics has
to deal with the possibility of either version being invoked, and so
the additional registers need to be freed up in any case. Or am I
missing something?

As for the duplication: a while ago, I suggested an approach [0] using
alternatives and asm subsections, which moved the duplicated LL/SC
fallbacks out of the hot path. This does not remove the bloat, but it
does mitigate its impact on I-cache efficiency when running on
hardware that does not require the fallbacks.


[0] https://lore.kernel.org/linux-arm-kernel/20181113233923.20098-1-ard.biesheuvel@linaro.org/
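
To make that concrete, here is a minimal sketch of the subsection idea
(not the actual patch in [0]; it assumes <asm/alternative.h>, the
ARM64_HAS_LSE_ATOMICS capability and kernel types) applied to a plain
atomic add:

static inline void lse_llsc_atomic_add_sketch(int i, atomic_t *v)
{
	unsigned long tmp;
	u32 status;

	asm volatile(
	ALTERNATIVE(
	/* default: branch to the out-of-line LL/SC fallback */
	"	b	3f",
	/* patched in when the CPU has LSE atomics */
	"	stadd	%w[i], %[v]",
	ARM64_HAS_LSE_ATOMICS)
	"\n4:\n"
	"	.subsection 1\n"
	"3:	ldxr	%w[tmp], %[v]\n"
	"	add	%w[tmp], %w[tmp], %w[i]\n"
	"	stxr	%w[status], %w[tmp], %[v]\n"
	"	cbnz	%w[status], 3b\n"
	"	b	4b\n"
	"	.previous"
	: [v] "+Q" (v->counter), [tmp] "=&r" (tmp), [status] "=&r" (status)
	: [i] "r" (i));
}

The fallback loops are still duplicated per call site, but the assembler
groups the subsection contents at the end of the text section, so on LSE
hardware the hot path carries just the patched-in stadd, and on pre-LSE
hardware only a single direct branch out to the cold loop.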



> >
> > > These changes add a small amount of bloat on defconfig according to
> > > bloat-o-meter:
> > >
> > > text:
> > >   add/remove: 1/108 grow/shrink: 3448/20 up/down: 272768/-4320 (268448)
> > >   Total: Before=12363112, After=12631560, chg +2.17%
> >
> > I'd say 2% is quite significant bloat.
>
> Thanks,
>
> Andrew Murray
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-17 10:08   ` Andrew Murray
  2019-05-17 10:29     ` Ard Biesheuvel
@ 2019-05-17 12:05     ` Peter Zijlstra
  2019-05-17 12:19       ` Ard Biesheuvel
  1 sibling, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2019-05-17 12:05 UTC (permalink / raw)
  To: Andrew Murray
  Cc: Catalin Marinas, Boqun Feng, Will Deacon, linux-arm-kernel,
	Ard.Biesheuvel

On Fri, May 17, 2019 at 11:08:03AM +0100, Andrew Murray wrote:

> I think the alternative solution (excuse the pun) that you are suggesting
> is to put the body of the ll/sc or LSE code in the ALTERNATIVE oldinstr/newinstr
> blocks (i.e. drop the fallback branches). However this still gives us some
> bloat (but less than my current solution) because we're still now inlining the
> larger fallback ll/sc whereas previously they were non-inline'd functions. We
> still end up with potentially unnecessary clobbers for LSE code with this
> Approach prior to this series:

> Approach using alternative without braces:
> 
>    LSE
>    LSE
>    NOP
>    NOP
> 
> or
> 
>    LL/SC <- inlined LL/SC and thus duplicated
>    LL/SC
>    LL/SC
>    LL/SC

Yes that. And if you worry about the extra clobber for LL/SC, you could
always stick a few PUSH/POPs around the LL/SC block. Although I'm not
exactly sure where the x16,x17,x30 clobbers come from; when I look at
the LL/SC code, there aren't any hard-coded regs in there.

Also, the safe approach is to emit LL/SC as the default and only patch
in LSE when you know the machine supports them.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-17 12:05     ` Peter Zijlstra
@ 2019-05-17 12:19       ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2019-05-17 12:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Catalin Marinas, Boqun Feng, Will Deacon, Ard.Biesheuvel,
	Andrew Murray, linux-arm-kernel

On Fri, 17 May 2019 at 14:05, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, May 17, 2019 at 11:08:03AM +0100, Andrew Murray wrote:
>
> > I think the alternative solution (excuse the pun) that you are suggesting
> > is to put the body of the ll/sc or LSE code in the ALTERNATIVE oldinstr/newinstr
> > blocks (i.e. drop the fallback branches). However this still gives us some
> > bloat (but less than my current solution) because we're still now inlining the
> > larger fallback ll/sc whereas previously they were non-inline'd functions. We
> > still end up with potentially unnecessary clobbers for LSE code with this
> > Approach prior to this series:
>
> > Approach using alternative without braces:
> >
> >    LSE
> >    LSE
> >    NOP
> >    NOP
> >
> > or
> >
> >    LL/SC <- inlined LL/SC and thus duplicated
> >    LL/SC
> >    LL/SC
> >    LL/SC
>
> Yes that. And if you worry about the extra clobber for LL/SC, you could
> always stuck a few PUSH/POPs around the LL/SC block.

Patching in pushes and pops replaces a potential performance hit in
the LSE code with a guaranteed performance hit in the LL/SC code, and
you may end up pushing and popping dead registers. So it would be nice
to see some justification for disproportionately penalizing the LL/SC
code (which will be used on low end cores where stack accesses are
relatively expensive) relative to the LSE code, rather than assuming
that relieving the register pressure on the current hot paths will
result in a measurable performance improvement on LSE systems.

>  Although I'm not
> exactly sure where the x16,x17,x30 clobbers come from; then I look at
> the LL/SC code, there aren't any hard-coded regs in there.
>

The out-of-line LL/SC code is invoked as a function call, and so we
need to preserve x30, which holds the return address.

x16 and x17 are used by the PLT branching code, in case the module
invoking the atomics is too far away from the core kernel for an
ordinary relative branch.

> Also, the safe approach is to emit LL/SC as the default and only patch
> in LSE when you know the machine supports them.
>

Given that it is not only the safe approach, but the only working
approach, we are obviously already doing that in both the old and the
new versions of the code.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-17 10:29     ` Ard Biesheuvel
@ 2019-05-22 10:45       ` Andrew Murray
  2019-05-22 11:44         ` Ard Biesheuvel
  0 siblings, 1 reply; 14+ messages in thread
From: Andrew Murray @ 2019-05-22 10:45 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Peter Zijlstra, Catalin Marinas, Boqun Feng, Will Deacon,
	Ard.Biesheuvel, linux-arm-kernel

On Fri, May 17, 2019 at 12:29:54PM +0200, Ard Biesheuvel wrote:
> On Fri, 17 May 2019 at 12:08, Andrew Murray <andrew.murray@arm.com> wrote:
> >
> > On Fri, May 17, 2019 at 09:24:01AM +0200, Peter Zijlstra wrote:
> > > On Thu, May 16, 2019 at 04:53:39PM +0100, Andrew Murray wrote:
> > > > When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
> > > > or toolchain doesn't support it the existing code will fallback to ll/sc
> > > > atomics. It achieves this by branching from inline assembly to a function
> > > > that is built with specical compile flags. Further this results in the
> > > > clobbering of registers even when the fallback isn't used increasing
> > > > register pressure.
> > > >
> > > > Let's improve this by providing inline implementatins of both LSE and
> > > > ll/sc and use a static key to select between them. This allows for the
> > > > compiler to generate better atomics code.
> > >
> > > Don't you guys have alternatives? That would avoid having both versions
> > > in the code, and thus significantly cuts back on the bloat.
> >
> > Yes we do.
> >
> > Prior to patch 3 of this series, the ARM64_LSE_ATOMIC_INSN macro used
> > ALTERNATIVE to either bl to a fallback ll/sc function (and nops) - or execute
> > some LSE instructions.
> >
> > But this approach limits the compilers ability to optimise the code due to
> > the asm clobber list being the superset of both ll/sc and LSE - and the gcc
> > compiler flags used on the ll/sc functions.
> >
> > I think the alternative solution (excuse the pun) that you are suggesting
> > is to put the body of the ll/sc or LSE code in the ALTERNATIVE oldinstr/newinstr
> > blocks (i.e. drop the fallback branches). However this still gives us some
> > bloat (but less than my current solution) because we're still now inlining the
> > larger fallback ll/sc whereas previously they were non-inline'd functions. We
> > still end up with potentially unnecessary clobbers for LSE code with this
> > approach.
> >
> > Approach prior to this series:
> >
> >    BL 1 or NOP <- single alternative instruction
> >    LSE
> >    LSE
> >    ...
> >
> > 1: LL/SC <- LL/SC fallback not inlined so reused
> >    LL/SC
> >    LL/SC
> >    LL/SC
> >
> > Approach proposed by this series:
> >
> >    BL 1 or NOP <- single alternative instruction
> >    LSE
> >    LSE
> >    BL 2
> > 1: LL/SC <- inlined LL/SC and thus duplicated
> >    LL/SC
> >    LL/SC
> >    LL/SC
> > 2: ..
> >
> > Approach using alternative without braces:
> >
> >    LSE
> >    LSE
> >    NOP
> >    NOP
> >
> > or
> >
> >    LL/SC <- inlined LL/SC and thus duplicated
> >    LL/SC
> >    LL/SC
> >    LL/SC
> >
> > I guess there is a balance here between bloat and code optimisation.
> >
> 
> 
> So there are two separate questions here:
> 1) whether or not we should merge the inline asm blocks so that the
> compiler sees a single set of constraints and operands
> 2) whether the LL/SC sequence should be inlined and/or duplicated.
> 
> This approach appears to be based on the assumption that reserving one
> or sometimes two additional registers for the LL/SC fallback has a
> more severe impact on performance than the unconditional branch.
> However, it seems to me that any call site that uses the atomics has
> to deal with the possibility of either version being invoked, and so
> the additional registers need to be freed up in any case. Or am I
> missing something?

Yes, at compile time the compiler doesn't know which atomics path will
be taken, so code has to be generated for both (thus optimisation is
limited). However, with this approach we no longer use hard-coded
registers or restrict which registers can be used and how, and
therefore the compiler ought to have greater freedom to optimise.
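
Roughly, the shape of the dispatch in this series is plain C along
these lines (the helper and capability-key names here are illustrative
and assume <asm/cpufeature.h> and <linux/jump_label.h>; the exact
naming in patch 3 may differ):

static __always_inline bool system_uses_lse_atomics(void)
{
	return static_branch_likely(&arm64_const_caps_ready) &&
	       static_branch_likely(&cpu_hwcap_keys[ARM64_HAS_LSE_ATOMICS]);
}

static __always_inline void atomic_add(int i, atomic_t *v)
{
	if (system_uses_lse_atomics())
		__lse_atomic_add(i, v);		/* single stadd */
	else
		__ll_sc_atomic_add(i, v);	/* inline LL/SC loop */
}

Because both bodies are visible to the compiler as ordinary inline
functions, each path gets its own register allocation rather than
having to honour a single clobber list that covers both.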

> 
> As for the duplication: a while ago, I suggested an approach [0] using
> alternatives and asm subsections, which moved the duplicated LL/SC
> fallbacks out of the hot path. This does not remove the bloat, but it
> does mitigate its impact on I-cache efficiency when running on
> hardware that does not require the fallbacks.

I've seen this. I guess it's possible to incorporate subsections into the
inline assembly in the __ll_sc_* functions of this series. If we wanted
the ll/sc fallbacks not to be inlined, then I suppose we could put these
functions in their own section to achieve the same goal.

My toolchain knowledge is limited here - but in order to use subsections
you require a branch - in this case does the compiler optimise across the
subsections? If not then I guess there is no benefit to inlining the code,
in which case you may as well have a branch to a function (in its own
section) and then you get both the icache gain and also avoid the bloat.
Does that make any sense?

Thanks,

Andrew Murray

> 
> 
> [0] https://lore.kernel.org/linux-arm-kernel/20181113233923.20098-1-ard.biesheuvel@linaro.org/
> 
> 
> 
> > >
> > > > These changes add a small amount of bloat on defconfig according to
> > > > bloat-o-meter:
> > > >
> > > > text:
> > > >   add/remove: 1/108 grow/shrink: 3448/20 up/down: 272768/-4320 (268448)
> > > >   Total: Before=12363112, After=12631560, chg +2.17%
> > >
> > > I'd say 2% is quite significant bloat.
> >
> > Thanks,
> >
> > Andrew Murray
> >

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-22 10:45       ` Andrew Murray
@ 2019-05-22 11:44         ` Ard Biesheuvel
  2019-05-22 15:36           ` Andrew Murray
  0 siblings, 1 reply; 14+ messages in thread
From: Ard Biesheuvel @ 2019-05-22 11:44 UTC (permalink / raw)
  To: Andrew Murray
  Cc: Peter Zijlstra, Catalin Marinas, Boqun Feng, Will Deacon,
	Ard.Biesheuvel, linux-arm-kernel

On Wed, 22 May 2019 at 11:45, Andrew Murray <andrew.murray@arm.com> wrote:
>
> On Fri, May 17, 2019 at 12:29:54PM +0200, Ard Biesheuvel wrote:
> > On Fri, 17 May 2019 at 12:08, Andrew Murray <andrew.murray@arm.com> wrote:
> > >
> > > On Fri, May 17, 2019 at 09:24:01AM +0200, Peter Zijlstra wrote:
> > > > On Thu, May 16, 2019 at 04:53:39PM +0100, Andrew Murray wrote:
> > > > > When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
> > > > > or toolchain doesn't support it the existing code will fallback to ll/sc
> > > > > atomics. It achieves this by branching from inline assembly to a function
> > > > > that is built with specical compile flags. Further this results in the
> > > > > clobbering of registers even when the fallback isn't used increasing
> > > > > register pressure.
> > > > >
> > > > > Let's improve this by providing inline implementatins of both LSE and
> > > > > ll/sc and use a static key to select between them. This allows for the
> > > > > compiler to generate better atomics code.
> > > >
> > > > Don't you guys have alternatives? That would avoid having both versions
> > > > in the code, and thus significantly cuts back on the bloat.
> > >
> > > Yes we do.
> > >
> > > Prior to patch 3 of this series, the ARM64_LSE_ATOMIC_INSN macro used
> > > ALTERNATIVE to either bl to a fallback ll/sc function (and nops) - or execute
> > > some LSE instructions.
> > >
> > > But this approach limits the compilers ability to optimise the code due to
> > > the asm clobber list being the superset of both ll/sc and LSE - and the gcc
> > > compiler flags used on the ll/sc functions.
> > >
> > > I think the alternative solution (excuse the pun) that you are suggesting
> > > is to put the body of the ll/sc or LSE code in the ALTERNATIVE oldinstr/newinstr
> > > blocks (i.e. drop the fallback branches). However this still gives us some
> > > bloat (but less than my current solution) because we're still now inlining the
> > > larger fallback ll/sc whereas previously they were non-inline'd functions. We
> > > still end up with potentially unnecessary clobbers for LSE code with this
> > > approach.
> > >
> > > Approach prior to this series:
> > >
> > >    BL 1 or NOP <- single alternative instruction
> > >    LSE
> > >    LSE
> > >    ...
> > >
> > > 1: LL/SC <- LL/SC fallback not inlined so reused
> > >    LL/SC
> > >    LL/SC
> > >    LL/SC
> > >
> > > Approach proposed by this series:
> > >
> > >    BL 1 or NOP <- single alternative instruction
> > >    LSE
> > >    LSE
> > >    BL 2
> > > 1: LL/SC <- inlined LL/SC and thus duplicated
> > >    LL/SC
> > >    LL/SC
> > >    LL/SC
> > > 2: ..
> > >
> > > Approach using alternative without braces:
> > >
> > >    LSE
> > >    LSE
> > >    NOP
> > >    NOP
> > >
> > > or
> > >
> > >    LL/SC <- inlined LL/SC and thus duplicated
> > >    LL/SC
> > >    LL/SC
> > >    LL/SC
> > >
> > > I guess there is a balance here between bloat and code optimisation.
> > >
> >
> >
> > So there are two separate questions here:
> > 1) whether or not we should merge the inline asm blocks so that the
> > compiler sees a single set of constraints and operands
> > 2) whether the LL/SC sequence should be inlined and/or duplicated.
> >
> > This approach appears to be based on the assumption that reserving one
> > or sometimes two additional registers for the LL/SC fallback has a
> > more severe impact on performance than the unconditional branch.
> > However, it seems to me that any call site that uses the atomics has
> > to deal with the possibility of either version being invoked, and so
> > the additional registers need to be freed up in any case. Or am I
> > missing something?
>
> Yes at compile time the compiler doesn't know which atomics path will
> be taken so code has to be generated for both (thus optimisation is
> limited). However due to this approach we no longer use hard-coded
> registers or restrict which/how registers can be used and therefore the
> compiler ought to have greater freedom to optimise.
>

Yes, I agree that is an improvement. But that doesn't require the
LL/SC and LSE asm sequences to be distinct.

> >
> > As for the duplication: a while ago, I suggested an approach [0] using
> > alternatives and asm subsections, which moved the duplicated LL/SC
> > fallbacks out of the hot path. This does not remove the bloat, but it
> > does mitigate its impact on I-cache efficiency when running on
> > hardware that does not require the fallbacks.
>
> I've seen this. I guess its possible to incorporate subsections into the
> inline assembly in the __ll_sc_* functions of this series. If we wanted
> the ll/sc fallbacks not to be inlined, then I suppose we can put these
> functions in their own section to achieve the same goal.
>
> My toolchain knowledge is a limited here - but in order to use subsections
> you require a branch - in this case does the compiler optimise across the
> sub sections? If not then I guess there is no benefit to inlining the code
> in which case you may as well have a branch to a function (in its own
> section) and then you get both the icache gain and also avoid bloat. Does
> that make any sense?
>


Not entirely. A function call requires an additional register to be
preserved, and the return from a bl/ret pair is an indirect branch,
while subsections use direct unconditional branches only.

Another reason we want to get rid of the current approach (and the
reason I looked into it in the first place) is that we are introducing
hidden branches, which affects the reliability of backtraces and this
is an issue for livepatch.

> >
> >
> > [0] https://lore.kernel.org/linux-arm-kernel/20181113233923.20098-1-ard.biesheuvel@linaro.org/
> >
> >
> >
> > > >
> > > > > These changes add a small amount of bloat on defconfig according to
> > > > > bloat-o-meter:
> > > > >
> > > > > text:
> > > > >   add/remove: 1/108 grow/shrink: 3448/20 up/down: 272768/-4320 (268448)
> > > > >   Total: Before=12363112, After=12631560, chg +2.17%
> > > >
> > > > I'd say 2% is quite significant bloat.
> > >
> > > Thanks,
> > >
> > > Andrew Murray
> > >

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics
  2019-05-22 11:44         ` Ard Biesheuvel
@ 2019-05-22 15:36           ` Andrew Murray
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Murray @ 2019-05-22 15:36 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Peter Zijlstra, Catalin Marinas, Boqun Feng, Will Deacon,
	Ard.Biesheuvel, linux-arm-kernel

On Wed, May 22, 2019 at 12:44:35PM +0100, Ard Biesheuvel wrote:
> On Wed, 22 May 2019 at 11:45, Andrew Murray <andrew.murray@arm.com> wrote:
> >
> > On Fri, May 17, 2019 at 12:29:54PM +0200, Ard Biesheuvel wrote:
> > > On Fri, 17 May 2019 at 12:08, Andrew Murray <andrew.murray@arm.com> wrote:
> > > >
> > > > On Fri, May 17, 2019 at 09:24:01AM +0200, Peter Zijlstra wrote:
> > > > > On Thu, May 16, 2019 at 04:53:39PM +0100, Andrew Murray wrote:
> > > > > > When building for LSE atomics (CONFIG_ARM64_LSE_ATOMICS), if the hardware
> > > > > > or toolchain doesn't support it the existing code will fallback to ll/sc
> > > > > > atomics. It achieves this by branching from inline assembly to a function
> > > > > > that is built with specical compile flags. Further this results in the
> > > > > > clobbering of registers even when the fallback isn't used increasing
> > > > > > register pressure.
> > > > > >
> > > > > > Let's improve this by providing inline implementatins of both LSE and
> > > > > > ll/sc and use a static key to select between them. This allows for the
> > > > > > compiler to generate better atomics code.
> > > > >
> > > > > Don't you guys have alternatives? That would avoid having both versions
> > > > > in the code, and thus significantly cuts back on the bloat.
> > > >
> > > > Yes we do.
> > > >
> > > > Prior to patch 3 of this series, the ARM64_LSE_ATOMIC_INSN macro used
> > > > ALTERNATIVE to either bl to a fallback ll/sc function (and nops) - or execute
> > > > some LSE instructions.
> > > >
> > > > But this approach limits the compilers ability to optimise the code due to
> > > > the asm clobber list being the superset of both ll/sc and LSE - and the gcc
> > > > compiler flags used on the ll/sc functions.
> > > >
> > > > I think the alternative solution (excuse the pun) that you are suggesting
> > > > is to put the body of the ll/sc or LSE code in the ALTERNATIVE oldinstr/newinstr
> > > > blocks (i.e. drop the fallback branches). However this still gives us some
> > > > bloat (but less than my current solution) because we're still now inlining the
> > > > larger fallback ll/sc whereas previously they were non-inline'd functions. We
> > > > still end up with potentially unnecessary clobbers for LSE code with this
> > > > approach.
> > > >
> > > > Approach prior to this series:
> > > >
> > > >    BL 1 or NOP <- single alternative instruction
> > > >    LSE
> > > >    LSE
> > > >    ...
> > > >
> > > > 1: LL/SC <- LL/SC fallback not inlined so reused
> > > >    LL/SC
> > > >    LL/SC
> > > >    LL/SC
> > > >
> > > > Approach proposed by this series:
> > > >
> > > >    BL 1 or NOP <- single alternative instruction
> > > >    LSE
> > > >    LSE
> > > >    BL 2
> > > > 1: LL/SC <- inlined LL/SC and thus duplicated
> > > >    LL/SC
> > > >    LL/SC
> > > >    LL/SC
> > > > 2: ..
> > > >
> > > > Approach using alternative without braces:
> > > >
> > > >    LSE
> > > >    LSE
> > > >    NOP
> > > >    NOP
> > > >
> > > > or
> > > >
> > > >    LL/SC <- inlined LL/SC and thus duplicated
> > > >    LL/SC
> > > >    LL/SC
> > > >    LL/SC
> > > >
> > > > I guess there is a balance here between bloat and code optimisation.
> > > >
> > >
> > >
> > > So there are two separate questions here:
> > > 1) whether or not we should merge the inline asm blocks so that the
> > > compiler sees a single set of constraints and operands
> > > 2) whether the LL/SC sequence should be inlined and/or duplicated.
> > >
> > > This approach appears to be based on the assumption that reserving one
> > > or sometimes two additional registers for the LL/SC fallback has a
> > > more severe impact on performance than the unconditional branch.
> > > However, it seems to me that any call site that uses the atomics has
> > > to deal with the possibility of either version being invoked, and so
> > > the additional registers need to be freed up in any case. Or am I
> > > missing something?
> >
> > Yes at compile time the compiler doesn't know which atomics path will
> > be taken so code has to be generated for both (thus optimisation is
> > limited). However due to this approach we no longer use hard-coded
> > registers or restrict which/how registers can be used and therefore the
> > compiler ought to have greater freedom to optimise.
> >
> 
> Yes, I agree that is an improvement. But that doesn't require the
> LL/SC and LSE asm sequences to be distinct.
> 
> > >
> > > As for the duplication: a while ago, I suggested an approach [0] using
> > > alternatives and asm subsections, which moved the duplicated LL/SC
> > > fallbacks out of the hot path. This does not remove the bloat, but it
> > > does mitigate its impact on I-cache efficiency when running on
> > > hardware that does not require the fallbacks.
> >
> > I've seen this. I guess its possible to incorporate subsections into the
> > inline assembly in the __ll_sc_* functions of this series. If we wanted
> > the ll/sc fallbacks not to be inlined, then I suppose we can put these
> > functions in their own section to achieve the same goal.
> >
> > My toolchain knowledge is a limited here - but in order to use subsections
> > you require a branch - in this case does the compiler optimise across the
> > sub sections? If not then I guess there is no benefit to inlining the code
> > in which case you may as well have a branch to a function (in its own
> > section) and then you get both the icache gain and also avoid bloat. Does
> > that make any sense?
> >
> 
> 
> Not entirely. A function call requires an additional register to be
> preserved, and the bl and ret instructions are both indirect branches,
> while subsections use direct unconditional branches only.
> 
> Another reason we want to get rid of the current approach (and the
> reason I looked into it in the first place) is that we are introducing
> hidden branches, which affects the reliability of backtraces and this
> is an issue for livepatch.

I guess we don't have enough information to determine the performance effect
of this.

I think I'll spend some time comparing the effect of some of these factors
on typical code with objdump to get a better feel for the likely effect
on performance and post my findings.

Thanks for the feedback.

Thanks,

Andrew Murray

> 
> > >
> > >
> > > [0] https://lore.kernel.org/linux-arm-kernel/20181113233923.20098-1-ard.biesheuvel@linaro.org/
> > >
> > >
> > >
> > > > >
> > > > > > These changes add a small amount of bloat on defconfig according to
> > > > > > bloat-o-meter:
> > > > > >
> > > > > > text:
> > > > > >   add/remove: 1/108 grow/shrink: 3448/20 up/down: 272768/-4320 (268448)
> > > > > >   Total: Before=12363112, After=12631560, chg +2.17%
> > > > >
> > > > > I'd say 2% is quite significant bloat.
> > > >
> > > > Thanks,
> > > >
> > > > Andrew Murray
> > > >

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread

Thread overview: 14+ messages
2019-05-16 15:53 [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Andrew Murray
2019-05-16 15:53 ` [PATCH v1 1/5] jump_label: Don't warn on __exit jump entries Andrew Murray
2019-05-16 15:53 ` [PATCH v1 2/5] arm64: Use correct ll/sc atomic constraints Andrew Murray
2019-05-16 15:53 ` [PATCH v1 3/5] arm64: atomics: avoid out-of-line ll/sc atomics Andrew Murray
2019-05-16 15:53 ` [PATCH v1 4/5] arm64: avoid using hard-coded registers for LSE atomics Andrew Murray
2019-05-16 15:53 ` [PATCH v1 5/5] arm64: atomics: remove atomic_ll_sc compilation unit Andrew Murray
2019-05-17  7:24 ` [PATCH v1 0/5] arm64: avoid out-of-line ll/sc atomics Peter Zijlstra
2019-05-17 10:08   ` Andrew Murray
2019-05-17 10:29     ` Ard Biesheuvel
2019-05-22 10:45       ` Andrew Murray
2019-05-22 11:44         ` Ard Biesheuvel
2019-05-22 15:36           ` Andrew Murray
2019-05-17 12:05     ` Peter Zijlstra
2019-05-17 12:19       ` Ard Biesheuvel
