From dca69f4c0deda3c9277bed027b2481f69268289d Mon Sep 17 00:00:00 2001
From: kettenis
Date: Tue, 2 Jul 2024 10:25:16 +0000
Subject: [PATCH] The traditional LL/SC atomics perform poorly on modern
 arm64 systems with many CPU cores.  With the recent conversion of the
 sched lock to a mutex, some systems appear to hang if the sched lock is
 contended.  ARMv8.1 introduced the LSE feature, which provides atomic
 instructions such as CAS that perform much better.  Unfortunately these
 can't be used on older ARMv8.0 systems.  Use -moutline-atomics to make
 the compiler generate function calls for atomic operations, and provide
 an implementation of the functions we use in the kernel that uses LSE
 when available and falls back on LL/SC otherwise.

Fixes regressions seen on Ampere Altra and Apple M2 Pro/Max/Ultra since
the conversion of the sched lock to a mutex.

tested by claudio@, phessler@, mpi@
ok patrick@
---
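Note for reviewers (this sits below the "---" fold, so git-am ignores
it): with -moutline-atomics the compiler stops inlining LL/SC sequences
and instead emits calls such as "bl __aarch64_cas4_acq_rel", passing
the compare value in w0, the new value in w1 and the pointer in x2, and
receiving the old value back in w0 (see the lse.S hunk below).  The C
function here is an invented, illustrative rendering of the run-time
dispatch each helper performs; the kernel uses the assembly in lse.S,
not this code.

extern int arm64_has_lse;	/* set from ID_AA64ISAR0_EL1 in cpu.c */

unsigned int
cas4_acq_rel_sketch(unsigned int expected, unsigned int desired,
    volatile unsigned int *p)
{
	unsigned int old, fail;

	if (arm64_has_lse) {
		/* ARMv8.1 LSE: a single compare-and-swap instruction. */
		__asm__ volatile(
		    ".arch armv8-a+lse\n"	/* same directive as lse.S */
		    "casal	%w0, %w2, [%1]"
		    : "+r"(expected)
		    : "r"(p), "r"(desired)
		    : "memory");
		return expected;
	}

	/* ARMv8.0 fallback: load/store-exclusive retry loop. */
	__asm__ volatile(
	    "1:	ldaxr	%w0, [%2]\n"
	    "	cmp	%w0, %w3\n"
	    "	b.ne	2f\n"
	    "	stlxr	%w1, %w4, [%2]\n"
	    "	cbnz	%w1, 1b\n"
	    "2:"
	    : "=&r"(old), "=&r"(fail)
	    : "r"(p), "r"(expected), "r"(desired)
	    : "memory", "cc");
	return old;
}

Either way the caller gets the value that was found at *p, which is how
CAS reports success or failure.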
 sys/arch/arm64/arm64/cpu.c         |   4 +-
 sys/arch/arm64/arm64/lse.S         | 170 +++++++++++++++++++++++++++++
 sys/arch/arm64/conf/Makefile.arm64 |   3 +-
 sys/arch/arm64/conf/files.arm64    |   3 +-
 4 files changed, 177 insertions(+), 3 deletions(-)
 create mode 100644 sys/arch/arm64/arm64/lse.S

diff --git a/sys/arch/arm64/arm64/cpu.c b/sys/arch/arm64/arm64/cpu.c
index a350d11a843..925ccfb4486 100644
--- a/sys/arch/arm64/arm64/cpu.c
+++ b/sys/arch/arm64/arm64/cpu.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: cpu.c,v 1.121 2024/06/23 10:17:16 kettenis Exp $	*/
+/*	$OpenBSD: cpu.c,v 1.122 2024/07/02 10:25:16 kettenis Exp $	*/
 
 /*
  * Copyright (c) 2016 Dale Rahn
@@ -244,6 +244,7 @@ uint64_t cpu_id_aa64isar2;
 uint64_t cpu_id_aa64pfr0;
 uint64_t cpu_id_aa64pfr1;
 
+int arm64_has_lse;
 #ifdef CRYPTO
 int arm64_has_aes;
 #endif
@@ -714,6 +715,7 @@ cpu_identify(struct cpu_info *ci)
 	if (ID_AA64ISAR0_ATOMIC(id) >= ID_AA64ISAR0_ATOMIC_IMPL) {
 		printf("%sAtomic", sep);
 		sep = ",";
+		arm64_has_lse = 1;
 	}
 
 	if (ID_AA64ISAR0_CRC32(id) >= ID_AA64ISAR0_CRC32_BASE) {
diff --git a/sys/arch/arm64/arm64/lse.S b/sys/arch/arm64/arm64/lse.S
new file mode 100644
index 00000000000..6c5727459c0
--- /dev/null
+++ b/sys/arch/arm64/arm64/lse.S
@@ -0,0 +1,170 @@
+/*	$OpenBSD: lse.S,v 1.1 2024/07/02 10:25:16 kettenis Exp $	*/
+/*
+ * Copyright (c) 2024 Mark Kettenis
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <machine/asm.h>
+
+/*
+ * Out-of-line LSE atomics helpers
+ */
+
+	.arch	armv8-a+lse
+
+ENTRY(__aarch64_cas4_acq_rel)
+	RETGUARD_SETUP(__aarch64_cas4_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	casal	w0, w1, [x2]
+	RETGUARD_CHECK(__aarch64_cas4_acq_rel, x15)
+	ret
+1:
+	ldaxr	w9, [x2]
+	cmp	w9, w0
+	b.ne	2f
+	stlxr	w10, w1, [x2]
+	cbnz	w10, 1b
+2:
+	mov	w0, w9
+	RETGUARD_CHECK(__aarch64_cas4_acq_rel, x15)
+	ret
+END(__aarch64_cas4_acq_rel)
+
+ENTRY(__aarch64_cas8_acq_rel)
+	RETGUARD_SETUP(__aarch64_cas8_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	casal	x0, x1, [x2]
+	RETGUARD_CHECK(__aarch64_cas8_acq_rel, x15)
+	ret
+1:
+	ldaxr	x9, [x2]
+	cmp	x9, x0
+	b.ne	2f
+	stlxr	w10, x1, [x2]
+	cbnz	w10, 1b
+2:
+	mov	x0, x9
+	RETGUARD_CHECK(__aarch64_cas8_acq_rel, x15)
+	ret
+END(__aarch64_cas8_acq_rel)
+
+ENTRY(__aarch64_ldadd4_acq_rel)
+	RETGUARD_SETUP(__aarch64_ldadd4_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	ldaddal	w0, w0, [x1]
+	RETGUARD_CHECK(__aarch64_ldadd4_acq_rel, x15)
+	ret
+1:
+	ldaxr	w9, [x1]
+	add	w11, w9, w0
+	stlxr	w10, w11, [x1]
+	cbnz	w10, 1b
+	mov	w0, w9
+	RETGUARD_CHECK(__aarch64_ldadd4_acq_rel, x15)
+	ret
+END(__aarch64_ldadd4_acq_rel)
+
+ENTRY(__aarch64_ldadd8_acq_rel)
+	RETGUARD_SETUP(__aarch64_ldadd8_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	ldaddal	x0, x0, [x1]
+	RETGUARD_CHECK(__aarch64_ldadd8_acq_rel, x15)
+	ret
+1:
+	ldaxr	x9, [x1]
+	add	x11, x9, x0
+	stlxr	w10, x11, [x1]
+	cbnz	w10, 1b
+	mov	x0, x9
+	RETGUARD_CHECK(__aarch64_ldadd8_acq_rel, x15)
+	ret
+END(__aarch64_ldadd8_acq_rel)
+
+ENTRY(__aarch64_ldclr4_acq_rel)
+	RETGUARD_SETUP(__aarch64_ldclr4_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	ldclral	w0, w0, [x1]
+	RETGUARD_CHECK(__aarch64_ldclr4_acq_rel, x15)
+	ret
+1:
+	ldaxr	w9, [x1]
+	bic	w11, w9, w0
+	stlxr	w10, w11, [x1]
+	cbnz	w10, 1b
+	mov	w0, w9
+	RETGUARD_CHECK(__aarch64_ldclr4_acq_rel, x15)
+	ret
+END(__aarch64_ldclr4_acq_rel)
+
+ENTRY(__aarch64_ldset4_acq_rel)
+	RETGUARD_SETUP(__aarch64_ldset4_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	ldsetal	w0, w0, [x1]
+	RETGUARD_CHECK(__aarch64_ldset4_acq_rel, x15)
+	ret
+1:
+	ldaxr	w9, [x1]
+	orr	w11, w9, w0
+	stlxr	w10, w11, [x1]
+	cbnz	w10, 1b
+	mov	w0, w9
+	RETGUARD_CHECK(__aarch64_ldset4_acq_rel, x15)
+	ret
+END(__aarch64_ldset4_acq_rel)
+
+ENTRY(__aarch64_swp4_acq_rel)
+	RETGUARD_SETUP(__aarch64_swp4_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	swpal	w0, w0, [x1]
+	RETGUARD_CHECK(__aarch64_swp4_acq_rel, x15)
+	ret
+1:
+	ldaxr	w9, [x1]
+	stlxr	w10, w0, [x1]
+	cbnz	w10, 1b
+	mov	w0, w9
+	RETGUARD_CHECK(__aarch64_swp4_acq_rel, x15)
+	ret
+END(__aarch64_swp4_acq_rel)
+
+ENTRY(__aarch64_swp8_acq_rel)
+	RETGUARD_SETUP(__aarch64_swp8_acq_rel, x15)
+	adrp	x9, arm64_has_lse
+	ldr	w9, [x9, :lo12:arm64_has_lse]
+	cbz	w9, 1f
+	swpal	x0, x0, [x1]
+	RETGUARD_CHECK(__aarch64_swp8_acq_rel, x15)
+	ret
+1:
+	ldaxr	x9, [x1]
+	stlxr	w10, x0, [x1]
+	cbnz	w10, 1b
+	mov	x0, x9
+	RETGUARD_CHECK(__aarch64_swp8_acq_rel, x15)
+	ret
+END(__aarch64_swp8_acq_rel)
diff --git a/sys/arch/arm64/conf/Makefile.arm64 b/sys/arch/arm64/conf/Makefile.arm64
index 636b7b4043b..837b1fe656d 100644
--- a/sys/arch/arm64/conf/Makefile.arm64
+++ b/sys/arch/arm64/conf/Makefile.arm64
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.arm64,v 1.47 2023/09/06 01:47:36 jsg Exp $
+# $OpenBSD: Makefile.arm64,v 1.48 2024/07/02 10:25:16 kettenis Exp $
 
 # For instructions on building kernels consult the config(8) and options(4)
 # manual pages.
@@ -60,6 +60,7 @@ CMACHFLAGS=	-march=armv8-a+nofp+nosimd \
 		-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer \
 		-ffixed-x18
 CMACHFLAGS+=	-ffreestanding ${NOPIE_FLAGS}
+CMACHFLAGS+=	-moutline-atomics
 SORTR=		sort -R
 .if ${IDENT:M-DNO_PROPOLICE}
 CMACHFLAGS+=	-fno-stack-protector
diff --git a/sys/arch/arm64/conf/files.arm64 b/sys/arch/arm64/conf/files.arm64
index 833ea3f050f..c8f9acac72a 100644
--- a/sys/arch/arm64/conf/files.arm64
+++ b/sys/arch/arm64/conf/files.arm64
@@ -1,4 +1,4 @@
-# $OpenBSD: files.arm64,v 1.69 2024/03/25 17:24:03 patrick Exp $
+# $OpenBSD: files.arm64,v 1.70 2024/07/02 10:25:16 kettenis Exp $
 
 maxpartitions	16
 maxusers	2 8 128
@@ -34,6 +34,7 @@ file	arch/arm64/arm64/trap.c
 file	arch/arm64/arm64/ast.c
 
 file	arch/arm64/arm64/cpufunc_asm.S
+file	arch/arm64/arm64/lse.S
 file	arch/arm64/arm64/support.S
 file	arch/arm64/arm64/bus_dma.c
 
-- 
2.20.1
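P.S. A quick way to see the effect of the new CMACHFLAGS line (a
hypothetical invocation; any aarch64-targeting clang works the same):

	$ echo 'int f(int *p) { return __sync_fetch_and_add(p, 1); }' | \
	    clang -target aarch64-unknown-openbsd -O2 -moutline-atomics \
	    -S -x c - -o -

The output should contain a call like "bl __aarch64_ldadd4_acq_rel"
instead of an inline ldxr/stxr loop (the exact helper variant depends
on the memory order the compiler picks).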