From cba69464013a4a6b7abb589944eb7f3f4f227eed Mon Sep 17 00:00:00 2001
From: jca
Date: Fri, 23 Jul 2021 15:31:14 +0000
Subject: [PATCH] Use 8/4/1-byte loads/stores for copyin/copyout/kcopy

Only use multi-byte operations on properly aligned addresses, as I have
observed a 40x penalty for unaligned 8-byte operations compared to
equivalent 1-byte loops on this SiFive Unmatched.  The speed gain is
small but significant.

Input & ok kettenis@
---
 sys/arch/riscv64/riscv64/copy.S | 35 +++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/sys/arch/riscv64/riscv64/copy.S b/sys/arch/riscv64/riscv64/copy.S
index 075159f8bf1..e7eb3f11d1f 100644
--- a/sys/arch/riscv64/riscv64/copy.S
+++ b/sys/arch/riscv64/riscv64/copy.S
@@ -1,4 +1,4 @@
-/*	$OpenBSD: copy.S,v 1.6 2021/06/28 18:53:10 deraadt Exp $	*/
+/*	$OpenBSD: copy.S,v 1.7 2021/07/23 15:31:14 jca Exp $	*/
 
 /*
  * Copyright (c) 2020 Brian Bamsch
@@ -49,8 +49,38 @@ ENTRY(copyin)
 	SWAP_FAULT_HANDLER(a3, a4, a5)
 	ENTER_USER_ACCESS(a4)
 
-// XXX optimize?
 .Lcopyio:
+.Lcopy8:
+	li	a5, 8
+	bltu	a2, a5, .Lcopy4
+
+	or	a7, a0, a1
+	andi	a7, a7, 7
+	bnez	a7, .Lcopy4
+
+1:	ld	a4, 0(a0)
+	addi	a0, a0, 8
+	sd	a4, 0(a1)
+	addi	a1, a1, 8
+	addi	a2, a2, -8
+	bgeu	a2, a5, 1b
+
+.Lcopy4:
+	li	a5, 4
+	bltu	a2, a5, .Lcopy1
+
+	andi	a7, a7, 3
+	bnez	a7, .Lcopy1
+
+1:	lw	a4, 0(a0)
+	addi	a0, a0, 4
+	sw	a4, 0(a1)
+	addi	a1, a1, 4
+	addi	a2, a2, -4
+	bgeu	a2, a5, 1b
+
+.Lcopy1:
+	beqz	a2, .Lcopy0
 1:	lb	a4, 0(a0)
 	addi	a0, a0, 1
 	sb	a4, 0(a1)
@@ -58,6 +88,7 @@ ENTRY(copyin)
 	addi	a2, a2, -1
 	bnez	a2, 1b
 
+.Lcopy0:
 	EXIT_USER_ACCESS(a4)
 	SET_FAULT_HANDLER(a3, a4)
 .Lcopyiodone:
-- 
2.20.1
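
For reference, here is a rough C sketch of the copy strategy the patch
implements: take the 8-byte path only when both pointers are 8-byte
aligned, fall back to a 4-byte path when both are 4-byte aligned, and
finish with a plain byte loop. This is illustrative only and not part of
the patch; the copy_841 name and the standalone test in main() are made
up for the example.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical userland sketch of the 8/4/1 copy logic above.
 * "mis" combines the misalignment of both pointers, like the
 * "or a7, a0, a1" in the assembly.
 */
static void
copy_841(void *dst, const void *src, size_t len)
{
	const uint8_t *s = src;
	uint8_t *d = dst;
	uintptr_t mis = (uintptr_t)s | (uintptr_t)d;

	if ((mis & 7) == 0) {
		/* Both 8-byte aligned: doubleword copies (ld/sd). */
		while (len >= 8) {
			uint64_t v;
			memcpy(&v, s, 8);	/* stands in for ld */
			memcpy(d, &v, 8);	/* stands in for sd */
			s += 8; d += 8; len -= 8;
		}
	}
	if ((mis & 3) == 0) {
		/* Both 4-byte aligned: word copies (lw/sw). */
		while (len >= 4) {
			uint32_t v;
			memcpy(&v, s, 4);	/* stands in for lw */
			memcpy(d, &v, 4);	/* stands in for sw */
			s += 4; d += 4; len -= 4;
		}
	}
	/* Byte tail, same role as the original 1-byte loop. */
	while (len > 0) {
		*d++ = *s++;
		len--;
	}
}

int
main(void)
{
	char src[27] = "abcdefghijklmnopqrstuvwxyz";
	char dst[27] = { 0 };

	copy_841(dst, src, sizeof(src));
	printf("%s\n", dst);
	return 0;
}

As in the patch, the alignment test is done once up front, which is
sufficient because copying in 8- or 4-byte steps preserves the initial
alignment of both pointers.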