From: gkoehler
Date: Sat, 22 Oct 2022 00:58:56 +0000 (+0000)
Subject: AltiVec assist for subnormal floats in vectors
X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=6ea8ccc79462aa3499e3835e392b1d8f285c3698;p=openbsd

AltiVec assist for subnormal floats in vectors

The old CPU in a macppc traps AltiVec instructions when they encounter
denormal or subnormal floats.  Emulate most of them.  They operate on
vectors of 4 single-precision floats.  The emulations either use scalar
operations (so vmaddfp becomes 4 of fmadds) or a formula (like
vrsqrtefp's 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63).

I am still missing some instructions (at least vrfin, vrfiz, vrfip,
vrfim).  Instructions that I don't emulate will still cause SIGFPE.

Mac OS never emulated these instructions, but set AltiVec's "non-Java"
NJ bit (which changes all subnormal floats to zero).  FreeBSD also sets
NJ; NetBSD does SIGFPE; Linux emulates them.  The POWER9 running
OpenBSD/powerpc64 does them in hardware (without trapping).

ok kettenis@ miod@
---
diff --git a/sys/arch/powerpc/conf/files.powerpc b/sys/arch/powerpc/conf/files.powerpc
index a05878c88e2..43ded9eb9c1 100644
--- a/sys/arch/powerpc/conf/files.powerpc
+++ b/sys/arch/powerpc/conf/files.powerpc
@@ -1,4 +1,4 @@
-#	$OpenBSD: files.powerpc,v 1.55 2018/01/25 15:06:29 mpi Exp $
+#	$OpenBSD: files.powerpc,v 1.56 2022/10/22 00:58:56 gkoehler Exp $
 #
 
 file	arch/powerpc/powerpc/setjmp.S		ddb
@@ -16,6 +16,7 @@ file	arch/powerpc/powerpc/vm_machdep.c
 file	arch/powerpc/powerpc/lock_machdep.c	multiprocessor
 file	arch/powerpc/powerpc/intr.c
 file	arch/powerpc/powerpc/softintr.c
+file	arch/powerpc/powerpc/vecast.S		altivec
 file	arch/powerpc/ddb/db_memrw.c		ddb
 file	arch/powerpc/ddb/db_disasm.c		ddb
diff --git a/sys/arch/powerpc/include/trap.h b/sys/arch/powerpc/include/trap.h
index 420729a3ed5..ba86f95c6d5 100644
--- a/sys/arch/powerpc/include/trap.h
+++ b/sys/arch/powerpc/include/trap.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: trap.h,v 1.8 2022/05/19 05:43:48 miod Exp $	*/
+/*	$OpenBSD: trap.h,v 1.9 2022/10/22 00:58:56 gkoehler Exp $	*/
 /*	$NetBSD: trap.h,v 1.1 1996/09/30 16:34:35 ws Exp $	*/
 
 /*
@@ -51,7 +51,8 @@
 #define	EXC_VEC		0x0f20		/* AltiVec Unavailable */
 #define	EXC_BPT		0x1300		/* Instruction Breakpoint */
 #define	EXC_SMI		0x1400		/* System Management Interrupt */
-#define	EXC_VECAST	0x1600		/* AltiVec Assist */
+#define	EXC_VECAST_G4	0x1600		/* AltiVec Assist */
+#define	EXC_VECAST_G5	0x1700		/* AltiVec Assist */
 
 /* And these are only on the 603: */
 #define	EXC_IMISS	0x1000		/* Instruction translation miss */
diff --git a/sys/arch/powerpc/powerpc/trap.c b/sys/arch/powerpc/powerpc/trap.c
index 990615fbae4..da68970686b 100644
--- a/sys/arch/powerpc/powerpc/trap.c
+++ b/sys/arch/powerpc/powerpc/trap.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: trap.c,v 1.125 2022/01/21 14:07:06 tobhe Exp $	*/
+/*	$OpenBSD: trap.c,v 1.126 2022/10/22 00:58:56 gkoehler Exp $	*/
 /*	$NetBSD: trap.c,v 1.3 1996/10/13 03:31:37 christos Exp $	*/
 
 /*
@@ -67,6 +67,8 @@ void trap(struct trapframe *frame);
 #define	MOREARGS(sp)	((caddr_t)((int)(sp) + 8)) /* more args go here */
 
 #ifdef ALTIVEC
+static int altivec_assist(void *);
+
 /*
  * Save state of the vector processor, This is done lazily in the hope
  * that few processes in the system will be using the vector unit
@@ -511,7 +513,14 @@ brain_damage:
 		break;
 #endif
 
-	case EXC_VECAST|EXC_USER:
+	case EXC_VECAST_G4|EXC_USER:
+	case EXC_VECAST_G5|EXC_USER:
+#ifdef ALTIVEC
+		if (altivec_assist((void *)frame->srr0) == 0) {
+			frame->srr0 += 4;
+			break;
+		}
+#endif
 		sv.sival_int = frame->srr0;
 		trapsignal(p, SIGFPE, 0, FPE_FLTRES, sv);
 		break;
@@ -638,3 +647,85 @@ fix_unaligned(struct proc *p, struct trapframe *frame)
 	}
 	return -1;
 }
+
+#ifdef ALTIVEC
+static int
+altivec_assist(void *user_pc)
+{
+	/* These labels are in vecast.S */
+	void vecast_asm(uint32_t, void *);
+	void vecast_vaddfp(void);
+	void vecast_vsubfp(void);
+	void vecast_vmaddfp(void);
+	void vecast_vnmsubfp(void);
+	void vecast_vrefp(void);
+	void vecast_vrsqrtefp(void);
+	void vecast_vlogefp(void);
+	void vecast_vexptefp(void);
+	void vecast_vctuxs(void);
+	void vecast_vctsxs(void);
+
+	uint32_t insn, op, va, vc, lo;
+	void (*lab)(void);
+
+	if (copyin(user_pc, &insn, sizeof(insn)) != 0)
+		return -1;
+	op = (insn & 0xfc000000) >> 26;	/* primary opcode */
+	va = (insn & 0x001f0000) >> 16;	/* vector A */
+	vc = (insn & 0x000007c0) >> 6;	/* vector C or extended opcode */
+	lo = insn & 0x0000003f;		/* extended opcode */
+
+	/* Stop if this isn't an altivec instruction. */
+	if (op != 4)
+		return -1;
+
+	/* Decide which instruction to emulate. */
+	lab = NULL;
+	switch (lo) {
+	case 10:
+		switch (vc) {
+		case 0:
+			lab = vecast_vaddfp;
+			break;
+		case 1:
+			lab = vecast_vsubfp;
+			break;
+		case 4:
+			if (va == 0)
+				lab = vecast_vrefp;
+			break;
+		case 5:
+			if (va == 0)
+				lab = vecast_vrsqrtefp;
+			break;
+		case 6:
+			if (va == 0)
+				lab = vecast_vexptefp;
+			break;
+		case 7:
+			if (va == 0)
+				lab = vecast_vlogefp;
+			break;
+		case 14:
+			lab = vecast_vctuxs;
+			break;
+		case 15:
+			lab = vecast_vctsxs;
+			break;
+		}
+		break;
+	case 46:
+		lab = vecast_vmaddfp;
+		break;
+	case 47:
+		lab = vecast_vnmsubfp;
+		break;
+	}
+
+	if (lab) {
+		vecast_asm(insn, lab);	/* Emulate it. */
+		return 0;
+	} else
+		return -1;
+}
+#endif
diff --git a/sys/arch/powerpc/powerpc/vecast.S b/sys/arch/powerpc/powerpc/vecast.S
new file mode 100644
index 00000000000..a4b1032bdbc
--- /dev/null
+++ b/sys/arch/powerpc/powerpc/vecast.S
@@ -0,0 +1,476 @@
+/*	$OpenBSD: vecast.S,v 1.1 2022/10/22 00:58:56 gkoehler Exp $	*/
+
+/*
+ * Copyright (c) 2022 George Koehler
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <machine/asm.h>
+#include <machine/psl.h>
+
+/*
+ * To load or store an arbitrary AltiVec register, we extract its
+ * number from the instruction and multiply it by 8.  We do both using
+ * rlwinm to rotate it left into bits 24 to 28.
+ *
+ * 0         10   15   20  24  28
+ * |         |    |    |   |   |
+ * 000100dddddaaaaabbbbbcccccxxxxxx
+ */
+#define	VD_ROTATE	14, 24, 28
+#define	VA_ROTATE	19, 24, 28
+#define	VB_ROTATE	24, 24, 28
+#define	VC_ROTATE	29, 24, 28
+
+/*
+ * vctuxs, vctsxs have an unsigned immediate UI in bits 11 to 15.  We
+ * extract it into bits 4 to 8, then add FLOAT_1_IS to make 2**UI.
+ */
+#define	UI_ROTATE	7, 4, 8
+#define	FLOAT_1_IS	0x3f80		/* (float 1) >> 16 */
+
+	.rodata
+	.balign	4
+.Lzero:		.float	0
+.Lone:		.float	1
+.Ln126:		.float	126
+.Ltwo63:	.float	0x1p63
+.Ltwo126:	.float	0x1p126
+.Lmin:		.float	0x1p-126	/* FLT_MIN */
+
+	.text
+
+/* This is the stack frame for vecast_asm. */
+#define	s_size	128
+#define	s_f31	120
+#define	s_f30	112
+#define	s_f29	104
+#define	s_f28	96
+#define	s_f27	88
+#define	s_f26	80
+#define	s_f25	72
+#define	s_f24	64
+#define	s_vc	48
+#define	s_vb	32
+#define	s_va	16
+
+/*
+ * vecast_asm(insn r3, label r4) emulates an AltiVec instruction when
+ * it traps a denormal or subnormal float (with an AltiVec assist
+ * exception).  Such a float f has 0 < |f| < FLT_MIN = 2**-126.
+ *
+ * MPC7450 RISC Microprocessor Family Reference Manual, 7.1.2.5 Java
+ * Mode, NaNs, Denormalized Numbers, and Zeros, has a list of trapping
+ * instructions: vaddfp, vsubfp, vmaddfp, vnmsubfp, vrefp, vrsqrtefp,
+ * vlogefp, vexptefp, vctsxs, vctuxs.
+ */
+ENTRY(vecast_asm)
+	mflr	%r0			/* r0 = return address */
+	RETGUARD_SETUP_LATE(vecast_asm, %r9, %r0)
+	stwu	%r1, -s_size(%r1)
+	mfmsr	%r5			/* r5 = old msr */
+
+	/*
+	 * Borrow the vector and floating-point units.  We must
+	 * preserve all float and most vector registers.
+	 */
+	rlwinm	%r6, %r5, 0, 17, 15	/* r6 = r5 & ~PSL_EE */
+	oris	%r6, %r6, PSL_VEC >> 16
+	ori	%r6, %r6, PSL_FP
+	mtmsr	%r6
+	isync
+
+	stfd	%f31, s_f31(%r1)
+	stfd	%f30, s_f30(%r1)
+	stfd	%f29, s_f29(%r1)
+	stfd	%f28, s_f28(%r1)
+	stfd	%f27, s_f27(%r1)
+	stfd	%f26, s_f26(%r1)
+	stfd	%f25, s_f25(%r1)
+	stfd	%f24, s_f24(%r1)
+	mffs	%f31			/* f31 = old fpscr */
+
+	lis	%r6, .Lzero@ha
+	la	%r6, .Lzero@l(%r6)	/* r6 = address of .Lzero */
+
+	/* fpscr = zero (round to nearest, no traps) */
+	lfs	%f30, 0(%r6)		/* f30 = zero */
+	mtfsf	255, %f30
+
+	/* All instructions do s_vb = VB now; VD = s_va at finish. */
+	rlwinm	%r7, %r3, VB_ROTATE
+	la	%r8, s_vb(%r1)
+	bl	vecast_store_vector
+
+	mtctr	%r4
+	li	%r4, 4			/* r4 = 4 loop iterations */
+	bctr				/* Branch to our instruction's label. */
+
+/*
+ * vaddfp: d = a + b
+ */
+	.globl vecast_vaddfp
+vecast_vaddfp:
+	rlwinm	%r7, %r3, VA_ROTATE
+	la	%r8, s_va(%r1)
+	bl	vecast_store_vector
+
+	/* s_va = s_va + s_vb */
+	mtctr	%r4
+	la	%r7, (s_va - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)		/* r7 += 4, then load (r7). */
+	lfs	%f29, (s_vb - s_va)(%r7)
+	fadds	%f30, %f30, %f29
+	stfs	%f30, 0(%r7)
+	bdnz	1b			/* Loop 4 times. */
+	b	vecast_finish
+
+/*
+ * vsubfp: d = a - b
+ */
+	.globl vecast_vsubfp
+vecast_vsubfp:
+	rlwinm	%r7, %r3, VA_ROTATE
+	la	%r8, s_va(%r1)
+	bl	vecast_store_vector
+
+	/* s_va = s_va - s_vb */
+	mtctr	%r4
+	la	%r7, (s_va - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)
+	lfs	%f29, (s_vb - s_va)(%r7)
+	fsubs	%f30, %f30, %f29
+	stfs	%f30, 0(%r7)
+	bdnz	1b
+	b	vecast_finish
+
+/*
+ * vmaddfp: d = a * c + b
+ */
+	.globl vecast_vmaddfp
+vecast_vmaddfp:
+	rlwinm	%r7, %r3, VA_ROTATE
+	la	%r8, s_va(%r1)
+	bl	vecast_store_vector
+	rlwinm	%r7, %r3, VC_ROTATE
+	la	%r8, s_vc(%r1)
+	bl	vecast_store_vector
+
+	/* s_va = s_va * s_vc + s_vb */
+	mtctr	%r4
+	la	%r7, (s_va - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)
+	lfs	%f29, (s_vb - s_va)(%r7)
+	lfs	%f28, (s_vc - s_va)(%r7)
+	fmadds	%f30, %f30, %f28, %f29
+	stfs	%f30, 0(%r7)
+	bdnz	1b
+	b	vecast_finish
+
+/*
+ * vnmsubfp: d = b - a * c
+ */
+	.globl vecast_vnmsubfp
+vecast_vnmsubfp:
+	rlwinm	%r7, %r3, VA_ROTATE
+	la	%r8, s_va(%r1)
+	bl	vecast_store_vector
+	rlwinm	%r7, %r3, VC_ROTATE
+	la	%r8, s_vc(%r1)
+	bl	vecast_store_vector
+
+	/* s_va = -(s_va * s_vc - s_vb) */
+	mtctr	%r4
+	la	%r7, (s_va - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)
+	lfs	%f29, (s_vb - s_va)(%r7)
+	lfs	%f28, (s_vc - s_va)(%r7)
+	fnmsubs	%f30, %f30, %f28, %f29
+	stfs	%f30, 0(%r7)
+	bdnz	1b
+	b	vecast_finish
+
+/*
+ * vrefp: d = estimate 1 / b
+ */
+	.globl vecast_vrefp
+vecast_vrefp:
+	/* s_va = estimate 1 / s_vb */
+	mtctr	%r4
+	la	%r7, (s_vb - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)
+	fres	%f30, %f30
+	stfs	%f30, (s_va - s_vb)(%r7)
+	bdnz	1b
+	b	vecast_finish
+
+/*
+ * vrsqrtefp: d = estimate 1 / sqrt(b)
+ * 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63 when b < 2**-126
+ *
+ * MPC7455's frsqrte does 1 / sqrt(1) = 0.984375, relative error 1/64.
+ * AltiVec must not err over 1/4096, so avoid frsqrte.
+ */
+	.globl vecast_vrsqrtefp
+vecast_vrsqrtefp:
+	/* f30 = 1; f29 = 2**63, f28 = 2**126; f27 = 2**-126 */
+	lfs	%f30, (.Lone - .Lzero)(%r6)
+	lfs	%f29, (.Ltwo63 - .Lzero)(%r6)
+	lfs	%f28, (.Ltwo126 - .Lzero)(%r6)
+	lfs	%f27, (.Lmin - .Lzero)(%r6)
+
+	/*
+	 * s_vb = s_vb * 2**126, s_va = 2**63 when b < 2**-126
+	 * s_va = 1 when b >= 2**-126
+	 */
+	mtctr	%r4
+	la	%r7, (s_vb - 4)(%r1)
+1:	lfsu	%f26, 4(%r7)
+	fmuls	%f25, %f26, %f28
+	fsubs	%f24, %f26, %f27	/* f24 selects b >= 2**-126 */
+	fsel	%f26, %f24, %f26, %f25	/* f26 = b or b * 2**126 */
+	stfs	%f26, 0(%r7)
+	fsel	%f25, %f24, %f30, %f29	/* f25 = 1 or 2**63 */
+	stfs	%f25, (s_va - s_vb)(%r7)
+	bdnz	1b
+
+	/* s_vb = estimate 1 / sqrt(s_vb) */
+	la	%r7, s_vc(%r1)
+	la	%r8, s_vb(%r1)
+	stvx	%v31, 0, %r7		/* Save v31 in s_vc. */
+	lvx	%v31, 0, %r8
+	vrsqrtefp %v31, %v31
+	stvx	%v31, 0, %r8
+	lvx	%v31, 0, %r7
+
+	/* s_va = s_vb * s_va */
+	mtctr	%r4
+	la	%r7, (s_va - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)
+	lfs	%f29, (s_vb - s_va)(%r7)
+	fmuls	%f30, %f29, %f30
+	stfs	%f30, 0(%r7)
+	bdnz	1b
+	b	vecast_finish
+
+/*
+ * vlogefp: d = estimate log2(b)
+ * log2(b) = log2(b * 2**126) - 126 when b < 2**-126
+ */
+	.globl vecast_vlogefp
+vecast_vlogefp:
+	/* f30 = 0; f29 = 126; f28 = 2**126; f27 = 2**-126 */
+	lfs	%f29, (.Ln126 - .Lzero)(%r6)
+	lfs	%f28, (.Ltwo126 - .Lzero)(%r6)
+	lfs	%f27, (.Lmin - .Lzero)(%r6)
+
+	/*
+	 * s_vb = s_vb * 2**126, s_va = 126 when s_vb < 2**-126
+	 * s_va = 0 when s_vb >= 2**-126
+	 */
+	mtctr	%r4
+	la	%r7, (s_vb - 4)(%r1)
+1:	lfsu	%f26, 4(%r7)
+	fmuls	%f25, %f26, %f28
+	fsubs	%f24, %f26, %f27	/* f24 selects b >= 2**-126 */
+	fsel	%f26, %f24, %f26, %f25	/* f26 = b or b * 2**126 */
+	stfs	%f26, 0(%r7)
+	fsel	%f25, %f24, %f30, %f29	/* f25 = 0 or 126 */
+	stfs	%f25, (s_va - s_vb)(%r7)
+	bdnz	1b
+
+	/* s_vb = estimate log2(s_vb) */
+	la	%r7, s_vc(%r1)
+	la	%r8, s_vb(%r1)
+	stvx	%v31, 0, %r7
+	lvx	%v31, 0, %r8
+	vlogefp	%v31, %v31
+	stvx	%v31, 0, %r8
+	lvx	%v31, 0, %r7
+
+	/* s_va = s_vb - s_va */
+	mtctr	%r4
+	la	%r7, (s_va - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)
+	lfs	%f29, (s_vb - s_va)(%r7)
+	fsubs	%f30, %f29, %f30
+	stfs	%f30, 0(%r7)
+	bdnz	1b
+	b	vecast_finish
+
+/*
+ * vexptefp: d = estimate 2**b
+ * 2**b = 2**(b + 126) * 2**-126 when -252 <= b < -126
+ */
+	.globl vecast_vexptefp
+vecast_vexptefp:
+	/* f30 = 1; f29 = 126; f28 = 2**-126 */
+	lfs	%f30, (.Lone - .Lzero)(%r6)
+	lfs	%f29, (.Ln126 - .Lzero)(%r6)
+	lfs	%f28, (.Lmin - .Lzero)(%r6)
+
+	/*
+	 * s_vb = s_vb + 126 when -252 <= b < -126
+	 * s_va = 2**-126 when b < -126
+	 * s_va = 1 when b >= -126
+	 *
+	 * If b < -252, we avoid a possibly subnormal 2**(b + 126)
+	 * by calculating 2**b * 2**-126 = 0 * 2**-126 = 0.
+	 */
+	mtctr	%r4
+	la	%r7, (s_vb - 4)(%r1)
+1:	lfsu	%f27, 4(%r7)
+	fadds	%f26, %f27, %f29	/* f26 selects b >= -126 */
+	fadds	%f25, %f26, %f29	/* f25 selects b >= -252 */
+	fsel	%f24, %f26, %f27, %f26
+	fsel	%f24, %f25, %f24, %f27	/* f24 = b or b + 126 */
+	stfs	%f24, 0(%r7)
+	fsel	%f27, %f26, %f30, %f28	/* f27 = 1 or 2**-126 */
+	stfs	%f27, (s_va - s_vb)(%r7)
+	bdnz	1b
+
+	/* s_vb = estimate 2**s_vb */
+	la	%r7, s_vc(%r1)
+	la	%r8, s_vb(%r1)
+	stvx	%v31, 0, %r7
+	lvx	%v31, 0, %r8
+	vexptefp %v31, %v31
+	stvx	%v31, 0, %r8
+	lvx	%v31, 0, %r7
+
+	/* s_va = s_vb * s_va */
+	mtctr	%r4
+	la	%r7, (s_va - 4)(%r1)
+1:	lfsu	%f30, 4(%r7)
+	lfs	%f29, (s_vb - s_va)(%r7)
+	fmuls	%f30, %f29, %f30
+	stfs	%f30, 0(%r7)
+	bdnz	1b
+	b	vecast_finish
+
+/*
+ * vctsxs: d = (int32_t)(b * 2**u) where 0 <= u < 32
+ * d = 0 when |b| < 2**-126
+ */
+	.globl vecast_vctsxs
+vecast_vctsxs:
+	/* f30 = 0; f29 = 2**-126; f28 = 2**u */
+	lfs	%f29, (.Lmin - .Lzero)(%r6)
+	rlwinm	%r7, %r3, UI_ROTATE
+	addis	%r7, %r7, FLOAT_1_IS
+	stw	%r7, s_va(%r1)
+	lfs	%f28, s_va(%r1)
+
+	/* s_va = s_vb * 2**u, unless b is tiny. */
+	mtctr	%r4
+	la	%r7, (s_vb - 4)(%r1)
+1:	lfsu	%f27, 4(%r7)
+	fmuls	%f26, %f27, %f28
+	fabs	%f27, %f27
+	fsubs	%f27, %f27, %f29	/* f27 selects |b| >= 2**-126 */
+	fsel	%f26, %f27, %f26, %f30	/* f26 = b * 2**u or 0 */
+	stfs	%f26, (s_va - s_vb)(%r7)
+	bdnz	1b
+
+	/* s_va = (int32_t)b */
+	la	%r7, s_vc(%r1)
+	la	%r8, s_va(%r1)
+	stvx	%v31, 0, %r7
+	lvx	%v31, 0, %r8
+	vctsxs	%v31, %v31, 0		/* May set SAT in vscr. */
+	stvx	%v31, 0, %r8
+	lvx	%v31, 0, %r7
+	b	vecast_finish
+
+/*
+ * vctuxs: d = (uint32_t)(b * 2**u) where 0 <= u < 32
+ * d = 0 when |b| < 2**-126
+ */
+	.globl vecast_vctuxs
+vecast_vctuxs:
+	/* f30 = 0; f29 = 2**-126; f28 = 2**u */
+	lfs	%f29, (.Lmin - .Lzero)(%r6)
+	rlwinm	%r7, %r3, UI_ROTATE
+	addis	%r7, %r7, FLOAT_1_IS
+	stw	%r7, s_va(%r1)
+	lfs	%f28, s_va(%r1)
+
+	/* s_va = s_vb * 2**u, unless b is tiny. */
+	mtctr	%r4
+	la	%r7, (s_vb - 4)(%r1)
+1:	lfsu	%f27, 4(%r7)
+	fmuls	%f26, %f27, %f28
+	fabs	%f27, %f27
+	fsubs	%f27, %f27, %f29	/* f27 selects |b| >= 2**-126 */
+	fsel	%f26, %f27, %f26, %f30	/* f26 = b * 2**u or 0 */
+	stfs	%f26, (s_va - s_vb)(%r7)
+	bdnz	1b
+
+	/* s_va = (uint32_t)b */
+	la	%r7, s_vc(%r1)
+	la	%r8, s_va(%r1)
+	stvx	%v31, 0, %r7
+	lvx	%v31, 0, %r8
+	vctuxs	%v31, %v31, 0		/* May set SAT in vscr. */
+	stvx	%v31, 0, %r8
+	lvx	%v31, 0, %r7
+	/* b vecast_finish */
+
+vecast_finish:
+	/* VD = s_va */
+	rlwinm	%r7, %r3, VD_ROTATE
+	addis	%r7, %r7, 1f@ha
+	addi	%r7, %r7, 1f@l
+	mtctr	%r7
+	la	%r8, s_va(%r1)
+	bctr
+#define	M(n)	lvx %v##n, 0, %r8; b 2f
+1:	M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
+	M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
+	M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
+	M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
+#undef M
+2:	mtlr	%r0
+	mtfsf	255, %f31		/* Restore old fpscr. */
+	lfd	%f24, s_f24(%r1)
+	lfd	%f25, s_f25(%r1)
+	lfd	%f26, s_f26(%r1)
+	lfd	%f27, s_f27(%r1)
+	lfd	%f28, s_f28(%r1)
+	lfd	%f29, s_f29(%r1)
+	lfd	%f30, s_f30(%r1)
+	lfd	%f31, s_f31(%r1)
+	mtmsr	%r5			/* Restore old msr. */
+	isync
+	addi	%r1, %r1, s_size
+	RETGUARD_CHECK(vecast_asm, %r9, %r0)
+	blr
+
+/*
+ * Stores vector v(r7 / 8) to address r8.
+ */
+vecast_store_vector:
+	RETGUARD_SETUP(vecast_store_vector, %r11, %r12)
+	addis	%r7, %r7, 1f@ha
+	addi	%r7, %r7, 1f@l
+	mtctr	%r7
+	bctr
+#define	M(n)	stvx %v##n, 0, %r8; b 2f
+1:	M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
+	M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
+	M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
+	M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
+#undef M
+2:	RETGUARD_CHECK(vecast_store_vector, %r11, %r12)
+	blr
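
The rescaling formulas that vecast.S uses for vrsqrtefp, vlogefp, and
vexptefp can be checked with ordinary scalar C.  The sketch below is an
illustration only and is not part of the commit: the file name
vecast_check.c is made up, it calls libm instead of the AltiVec
estimate instructions, and 2**-130 is just an arbitrary subnormal
input.

	/*
	 * vecast_check.c -- hypothetical sketch, not part of the commit.
	 * Check the rescaling identities from vecast.S with scalar libm
	 * calls on a subnormal float.  Build: cc vecast_check.c -lm
	 */
	#include <math.h>
	#include <stdio.h>

	int
	main(void)
	{
		float b = 0x1p-130f;	/* subnormal: 0 < b < 2**-126 */
		float x = -130.0f;	/* exponent with -252 <= x < -126 */

		/* vrsqrtefp: 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63 */
		float rsqrt = 1.0f / sqrtf(b * 0x1p126f) * 0x1p63f;

		/* vlogefp: log2(b) = log2(b * 2**126) - 126 */
		float log2b = log2f(b * 0x1p126f) - 126.0f;

		/* vexptefp: 2**x = 2**(x + 126) * 2**-126 */
		float exp2x = exp2f(x + 126.0f) * 0x1p-126f;

		printf("1/sqrt(b) = %a\n", rsqrt);	/* 2**65 */
		printf("log2(b)   = %a\n", log2b);	/* -130 */
		printf("2**x      = %a\n", exp2x);	/* 2**-130 */
		return 0;
	}

The kernel code feeds the rescaled values to the real vrsqrtefp,
vlogefp, and vexptefp instructions instead of libm; after the rescaling
no element is a subnormal float, so those instructions no longer need
an assist.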