From b767b017801a7911c8695888fb3667cbbba30f7b Mon Sep 17 00:00:00 2001 From: guenther Date: Wed, 21 Feb 2018 19:24:15 +0000 Subject: [PATCH] Meltdown: implement user/kernel page table separation. On Intel CPUs which speculate past user/supervisor page permission checks, use a separate page table for userspace with only the minimum of kernel code and data required for the transitions to/from the kernel (still marked as supervisor-only, of course): - the IDT (RO) - three pages of kernel text in the .kutext section for interrupt, trap, and syscall trampoline code (RX) - one page of kernel data in the .kudata section for TLB flush IPIs (RW) - the lapic page (RW, uncachable) - per CPU: one page for the TSS+GDT (RO) and one page for trampoline stacks (RW) When a syscall, trap, or interrupt takes a CPU from userspace to kernel the trampoline code switches page tables, switches stacks to the thread's real kernel stack, then copies over the necessary bits from the trampoline stack. On return to userspace the opposite occurs: recreate the iretq frame on the trampoline stack, switch stack, switch page tables, and return to userspace. mlarkin@ implemented the pmap bits and did 90% of the debugging, diagnosing issues on MP in particular, and drove the final push to completion. Many rounds of testing by naddy@, sthen@, and others Thanks to Alex Wilson from Joyent for early discussions about trampolines and their data requirements. Per-CPU page layout mostly inspired by DragonFlyBSD. ok mlarkin@ deraadt@ --- sys/arch/amd64/amd64/cpu.c | 83 +++++++++- sys/arch/amd64/amd64/gdt.c | 29 +--- sys/arch/amd64/amd64/genassym.cf | 17 +- sys/arch/amd64/amd64/identcpu.c | 7 +- sys/arch/amd64/amd64/lapic.c | 22 ++- sys/arch/amd64/amd64/locore.S | 190 +++++++++++++++++++--- sys/arch/amd64/amd64/locore0.S | 47 ++++-- sys/arch/amd64/amd64/machdep.c | 95 ++++++++--- sys/arch/amd64/amd64/pmap.c | 230 +++++++++++++++++++++++++-- sys/arch/amd64/amd64/spl.S | 12 +- sys/arch/amd64/amd64/trap.c | 32 ++-- sys/arch/amd64/amd64/vector.S | 237 ++++++++++++++++++++-------- sys/arch/amd64/conf/ld.script | 22 ++- sys/arch/amd64/include/asm.h | 19 ++- sys/arch/amd64/include/cpu.h | 29 +++- sys/arch/amd64/include/cpu_full.h | 66 ++++++++ sys/arch/amd64/include/cpufunc.h | 5 +- sys/arch/amd64/include/frame.h | 16 +- sys/arch/amd64/include/frameasm.h | 76 ++++++--- sys/arch/amd64/include/gdt.h | 3 +- sys/arch/amd64/include/pmap.h | 18 ++- sys/arch/amd64/include/specialreg.h | 5 +- 22 files changed, 1020 insertions(+), 240 deletions(-) create mode 100644 sys/arch/amd64/include/cpu_full.h diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c index c8727f583b0..7e14b3709de 100644 --- a/sys/arch/amd64/amd64/cpu.c +++ b/sys/arch/amd64/amd64/cpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.c,v 1.111 2018/02/06 01:09:17 patrick Exp $ */ +/* $OpenBSD: cpu.c,v 1.112 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */ /*- @@ -81,7 +81,7 @@ #include #include -#include +#include #include #include #include @@ -116,6 +116,14 @@ #include #endif /* HIBERNATE */ +/* #define CPU_DEBUG */ + +#ifdef CPU_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* CPU_DEBUG */ + int cpu_match(struct device *, void *, void *); void cpu_attach(struct device *, struct device *, void *); int cpu_activate(struct device *, int); @@ -172,7 +180,7 @@ struct cfdriver cpu_cd = { * CPU, on uniprocessors). The CPU info list is initialized to * point at it. 
*/ -struct cpu_info cpu_info_primary = { 0, &cpu_info_primary }; +struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } }; struct cpu_info *cpu_info_list = &cpu_info_primary; @@ -338,8 +346,15 @@ cpu_attach(struct device *parent, struct device *self, void *aux) * structure, otherwise use the primary's. */ if (caa->cpu_role == CPU_ROLE_AP) { - ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK|M_ZERO); + struct cpu_info_full *cif; + + cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok); + ci = &cif->cif_cpu; #if defined(MULTIPROCESSOR) + ci->ci_tss = &cif->cif_tss; + ci->ci_gdt = (void *)(ci->ci_tss + 1); + memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE); + cpu_enter_pages(cif); if (cpu_info[cpunum] != NULL) panic("cpu at apic id %d already attached?", cpunum); cpu_info[cpunum] = ci; @@ -451,7 +466,6 @@ cpu_attach(struct device *parent, struct device *self, void *aux) #if defined(MULTIPROCESSOR) cpu_intr_init(ci); - gdt_alloc_cpu(ci); sched_init_cpu(ci); cpu_start_secondary(ci); ncpus++; @@ -938,3 +952,62 @@ cpu_activate(struct device *self, int act) return (0); } + +/* + * cpu_enter_pages + * + * Requests mapping of various special pages required in the Intel Meltdown + * case (to be entered into the U-K page table): + * + * 1 tss+gdt page for each CPU + * 1 trampoline stack page for each CPU + * + * The cpu_info_full struct for each CPU straddles these pages. The offset into + * 'cif' is calculated below, for each page. For more information, consult + * the definition of struct cpu_info_full in cpu_full.h + * + * On CPUs unaffected by Meltdown, this function still configures 'cif' but + * the calls to pmap_enter_special become no-ops. + * + * Parameters: + * cif : the cpu_info_full structure describing a CPU whose pages are to be + * entered into the special meltdown U-K page table. 
+ */ +void +cpu_enter_pages(struct cpu_info_full *cif) +{ + vaddr_t va; + paddr_t pa; + + /* The TSS+GDT need to be readable */ + va = (vaddr_t)cif; + pmap_extract(pmap_kernel(), va, &pa); + pmap_enter_special(va, pa, PROT_READ); + DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)pa); + + /* The trampoline stack page needs to be read/write */ + va = (vaddr_t)&cif->cif_tramp_stack; + pmap_extract(pmap_kernel(), va, &pa); + pmap_enter_special(va, pa, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)pa); + + cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16; + DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__, + (uint64_t)cif->cif_tss.tss_rsp0); + cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 - + sizeof(struct iretq_frame); + +#define SETUP_IST_SPECIAL_STACK(ist, cif, member) do { \ + (cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member + \ + sizeof((cif)->member) - 16; \ + (cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \ +} while (0) + + SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack); + SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack); + + /* an empty iomap, by setting its offset to the TSS limit */ + cif->cif_tss.tss_iobase = sizeof(cif->cif_tss); +} diff --git a/sys/arch/amd64/amd64/gdt.c b/sys/arch/amd64/amd64/gdt.c index 8aa28a098f9..1372ebd083e 100644 --- a/sys/arch/amd64/amd64/gdt.c +++ b/sys/arch/amd64/amd64/gdt.c @@ -1,4 +1,4 @@ -/* $OpenBSD: gdt.c,v 1.25 2018/01/07 05:36:47 guenther Exp $ */ +/* $OpenBSD: gdt.c,v 1.26 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: gdt.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */ /*- @@ -39,33 +39,6 @@ #include #include -/* - * Allocate shadow GDT for a slave cpu. 
- */ -void -gdt_alloc_cpu(struct cpu_info *ci) -{ - struct vm_page *pg; - vaddr_t va; - - ci->ci_gdt = (char *)uvm_km_valloc(kernel_map, - GDT_SIZE + sizeof(*ci->ci_tss)); - ci->ci_tss = (void *)(ci->ci_gdt + GDT_SIZE); - uvm_map_pageable(kernel_map, (vaddr_t)ci->ci_gdt, - (vaddr_t)ci->ci_gdt + GDT_SIZE, FALSE, FALSE); - for (va = (vaddr_t)ci->ci_gdt; - va < (vaddr_t)ci->ci_gdt + GDT_SIZE + sizeof(*ci->ci_tss); - va += PAGE_SIZE) { - pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); - if (pg == NULL) - panic("gdt_init: no pages"); - pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE); - } - memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE); - bzero(ci->ci_tss, sizeof(*ci->ci_tss)); -} - - /* * Load appropriate gdt descriptor; we better be running on *ci */ diff --git a/sys/arch/amd64/amd64/genassym.cf b/sys/arch/amd64/amd64/genassym.cf index f72dd494c9b..4d65a3a56a2 100644 --- a/sys/arch/amd64/amd64/genassym.cf +++ b/sys/arch/amd64/amd64/genassym.cf @@ -1,4 +1,4 @@ -# $OpenBSD: genassym.cf,v 1.33 2018/02/10 09:21:12 mpi Exp $ +# $OpenBSD: genassym.cf,v 1.34 2018/02/21 19:24:15 guenther Exp $ # Written by Artur Grabowski art@openbsd.org, Public Domain include @@ -78,6 +78,15 @@ member tf_ss define FRAMESIZE sizeof(struct trapframe) +struct iretq_frame +member IRETQ_CS iretq_cs +member IRETQ_RIP iretq_rip +member IRETQ_RFLAGS iretq_rflags +member IRETQ_RSP iretq_rsp +member IRETQ_SS iretq_ss + +define IRETQ_SIZE sizeof(struct iretq_frame) + struct pcb member pcb_cr3 member pcb_rsp @@ -91,6 +100,8 @@ member pcb_cr0 struct pmap member pm_cpus +member pm_pdirpa +member pm_pdirpa_intel struct x86_64_tss member tss_rsp0 @@ -115,6 +126,10 @@ endif member CPU_INFO_GDT ci_gdt member CPU_INFO_TSS ci_tss member CPU_INFO_FLAGS ci_flags +member CPU_INFO_KERN_CR3 ci_kern_cr3 +member CPU_INFO_USER_CR3 ci_user_cr3 +member CPU_INFO_KERN_RSP ci_kern_rsp +member CPU_INFO_INTR_RSP ci_intr_rsp export CPUF_USERSEGS_BIT diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c index 046fde6855a..4bc2e6d10d5 100644 --- a/sys/arch/amd64/amd64/identcpu.c +++ b/sys/arch/amd64/amd64/identcpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: identcpu.c,v 1.94 2018/02/10 09:46:58 jsg Exp $ */ +/* $OpenBSD: identcpu.c,v 1.95 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: identcpu.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */ /* @@ -208,6 +208,7 @@ const struct { { SEFF0EDX_AVX512_4FMAPS, "AVX512FMAPS" }, { SEFF0EDX_IBRS, "IBRS,IBPB" }, { SEFF0EDX_STIBP, "STIBP" }, + /* SEFF0EDX_ARCH_CAP (not printed) */ }, cpu_tpm_eaxfeatures[] = { { TPM_SENSOR, "SENSOR" }, { TPM_ARAT, "ARAT" }, @@ -455,6 +456,7 @@ identifycpu(struct cpu_info *ci) int i; char *brandstr_from, *brandstr_to; int skipspace; + extern uint32_t cpu_meltdown; CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags); CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy); @@ -612,6 +614,9 @@ identifycpu(struct cpu_info *ci) } } + if (cpu_meltdown) + printf(",MELTDOWN"); + printf("\n"); x86_print_cacheinfo(ci); diff --git a/sys/arch/amd64/amd64/lapic.c b/sys/arch/amd64/amd64/lapic.c index 6a1086c2f62..83ee4472d9f 100644 --- a/sys/arch/amd64/amd64/lapic.c +++ b/sys/arch/amd64/amd64/lapic.c @@ -1,4 +1,4 @@ -/* $OpenBSD: lapic.c,v 1.49 2017/10/14 04:44:43 jsg Exp $ */ +/* $OpenBSD: lapic.c,v 1.50 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: lapic.c,v 1.2 2003/05/08 01:04:35 fvdl Exp $ */ /*- @@ -59,6 +59,14 @@ #include #endif +/* #define LAPIC_DEBUG */ + +#ifdef LAPIC_DEBUG +#define DPRINTF(x...) 
do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* LAPIC_DEBUG */ + struct evcount clk_count; #ifdef MULTIPROCESSOR struct evcount ipi_count; @@ -201,6 +209,7 @@ lapic_map(paddr_t lapic_base) codepatch_call(CPTAG_EOI, &x2apic_eoi); lapic_writereg(LAPIC_TPRI, s); + va = (vaddr_t)&local_apic; } else { /* * Map local apic. If we have a local apic, it's safe to @@ -220,6 +229,17 @@ lapic_map(paddr_t lapic_base) lapic_tpr = s; } + /* + * Enter the LAPIC MMIO page in the U-K page table for handling + * Meltdown (needed in the interrupt stub to acknowledge the + * incoming interrupt). On CPUs unaffected by Meltdown, + * pmap_enter_special is a no-op. + * XXX - need to map this PG_N + */ + pmap_enter_special(va, lapic_base, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered lapic page va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)lapic_base); + enable_intr(); } diff --git a/sys/arch/amd64/amd64/locore.S b/sys/arch/amd64/amd64/locore.S index 6e00ce3dddf..282a25310c6 100644 --- a/sys/arch/amd64/amd64/locore.S +++ b/sys/arch/amd64/amd64/locore.S @@ -1,4 +1,4 @@ -/* $OpenBSD: locore.S,v 1.93 2018/01/07 19:56:19 mlarkin Exp $ */ +/* $OpenBSD: locore.S,v 1.94 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $ */ /* @@ -113,6 +113,7 @@ #include #include +#include #include #include #include @@ -176,6 +177,7 @@ _C_LABEL(lapic_isr): .globl _C_LABEL(bootapiver) .globl _C_LABEL(pg_nx) .globl _C_LABEL(pg_g_kern) + .globl _C_LABEL(cpu_meltdown) _C_LABEL(cpu_id): .long 0 # saved from `cpuid' instruction _C_LABEL(cpu_feature): .long 0 # feature flags from 'cpuid' # instruction @@ -210,7 +212,8 @@ _C_LABEL(biosextmem): .long REALEXTMEM _C_LABEL(pg_nx): .quad 0 # NX PTE bit (if CPU supports) _C_LABEL(pg_g_kern): .quad 0 # 0x100 if global pages should be used # in kernel mappings, 0 otherwise (for - # Intel) + # insecure CPUs) +_C_LABEL(cpu_meltdown): .long 0 # 1 if this CPU has Meltdown #define _RELOC(x) ((x) - KERNBASE) #define RELOC(x) _RELOC(_C_LABEL(x)) @@ -236,7 +239,7 @@ gdt64_end: /*****************************************************************************/ /* - * Signal trampoline; copied to top of user stack. + * Signal trampoline; copied to a page mapped into userspace. * gdb's backtrace logic matches against the instructions in this. */ .section .rodata @@ -401,20 +404,34 @@ restore_saved: movq PCB_RSP(%r13),%rsp movq PCB_RBP(%r13),%rbp - movq CPUVAR(TSS),%rcx - movq PCB_KSTACK(%r13),%rdx - movq %rdx,TSS_RSP0(%rcx) - movq PCB_CR3(%r13),%rax - movq %rax,%cr3 + movq %rax,%cr3 /* %rax used below too */ /* Don't bother with the rest if switching to a system process. */ testl $P_SYSTEM,P_FLAG(%r12) jnz switch_restored + /* record the bits needed for future U-->K transition */ + movq PCB_KSTACK(%r13),%rdx + subq $FRAMESIZE,%rdx + movq %rdx,CPUVAR(KERN_RSP) + movq PCB_PMAP(%r13),%rcx + + /* + * Meltdown: iff we're doing separate U+K and U-K page tables, + * then record them in cpu_info for easy access in syscall and + * interrupt trampolines. XXX code patch this + */ + + movq PM_PDIRPA_INTEL(%rcx),%rdx + testq %rdx,%rdx + jz 0f /* yay, no intel suckiness */ + movq %rax,CPUVAR(KERN_CR3) + movq %rdx,CPUVAR(USER_CR3) +0: + /* set the new pmap's bit for the cpu */ movl CPUVAR(CPUID),%edi - movq PCB_PMAP(%r13),%rcx lock btsq %rdi,PM_CPUS(%rcx) #ifdef DIAGNOSTIC @@ -503,8 +520,7 @@ IDTVEC(syscall32) sysret /* go away please */ /* - * syscall insn entry. This currently isn't much faster, but - * it can be made faster in the future. 
+ * syscall insn entry. */ IDTVEC(syscall) /* @@ -514,13 +530,20 @@ IDTVEC(syscall) * the user-space value. * First order of business is to swap to the kernel gs.base so that * we can access our struct cpu_info and use the scratch space there - * to switch to our kernel stack. Once that's in place we can + * to switch to the kernel page tables (thank you, Intel), then + * switch to our kernel stack. Once that's in place we can * unblock interrupts and save the rest of the syscall frame. */ swapgs movq %r15,CPUVAR(SCRATCH) - movq CPUVAR(CURPCB),%r15 - movq PCB_KSTACK(%r15),%r15 + movq CPUVAR(KERN_CR3),%r15 + testq %r15,%r15 + jz Xsyscall_untramp + movq %r15,%cr3 + jmp Xsyscall_untramp + +NENTRY(Xsyscall_untramp) + movq CPUVAR(KERN_RSP),%r15 xchgq %r15,%rsp sti @@ -531,12 +554,11 @@ IDTVEC(syscall) * ss:rsp, etc, so that all GP registers can be * saved. Then, fill in the rest. */ - pushq $(GSEL(GUDATA_SEL, SEL_UPL)) - pushq %r15 - subq $(TF_RSP-TF_TRAPNO),%rsp + movq $(GSEL(GUDATA_SEL, SEL_UPL)),TF_SS(%rsp) + movq %r15,TF_RSP(%rsp) movq CPUVAR(SCRATCH),%r15 - subq $32,%rsp - INTR_SAVE_GPRS + INTR_SAVE_MOST_GPRS_NO_ADJ + movq %rcx,TF_RCX(%rsp) movq %r11, TF_RFLAGS(%rsp) /* old rflags from syscall insn */ movq $(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp) movq %rcx,TF_RIP(%rsp) @@ -581,16 +603,45 @@ IDTVEC(syscall) movq TF_RBP(%rsp),%rbp movq TF_RBX(%rsp),%rbx - INTR_RESTORE_SELECTORS + /* Restore FS.base if it's not already in the CPU */ + btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) + jc 99f + movq CPUVAR(CURPCB),%rdx + movq PCB_FSBASE(%rdx),%rax + movq %rax,%rdx + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr +99: + /* + * We need to finish reading from the trapframe, then switch + * to the user page tables, swapgs, and return. We need + * to get the final value for the register that was used + * for the mov to %cr3 from somewhere accessible on the + * user page tables, so save it in CPUVAR(SCRATCH) across + * the switch. + */ movq TF_RDX(%rsp),%rdx movq TF_RAX(%rsp),%rax + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(USER_CR3),%rax movq TF_RIP(%rsp),%rcx movq TF_RFLAGS(%rsp),%r11 movq TF_RSP(%rsp),%rsp + testq %rax,%rax + jz 1f + jmp syscall_trampback + +KUENTRY(syscall_trampback) + movq %rax,%cr3 +1: movq CPUVAR(SCRATCH),%rax + swapgs sysretq + .text + #ifdef DIAGNOSTIC .Lsyscall_spl_not_lowered: movabsq $spl_lowered, %rdi @@ -627,6 +678,12 @@ NENTRY(proc_trampoline) * Return via iretq, for real interrupts and signal returns */ NENTRY(intr_fast_exit) +#ifdef DIAGNOSTIC + pushfq + popq %rdx + testq $PSL_I,%rdx + jnz .Lintr_exit_not_blocked +#endif /* DIAGNOSTIC */ movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_R8(%rsp),%r8 @@ -640,11 +697,68 @@ NENTRY(intr_fast_exit) movq TF_RBX(%rsp),%rbx testq $SEL_RPL,TF_CS(%rsp) - je 5f + je intr_exit_recurse /* returning back to kernel? */ + + /* returning to userspace. XXX fix up iret frame here */ + + /* restore FS.base if it's not already in the CPU */ + btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) + jc 99f + movq CPUVAR(CURPCB),%rdx /* for below */ + movq PCB_FSBASE(%rdx),%rax + movq %rax,%rdx + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr +99: + /* + * Returning to userspace. 
We need to go things in this order: + * - update the iret frame from the trapframe + * - finish reading from the trapframe + * - switch to the trampoline stack + * - jump to the .kutext segment + * - switch to the user page tables + * - swapgs + * - iretq + * To get the final value for the register that was used + * for the mov to %cr3, we need access to somewhere accessible + * on the user page tables, so we save it in CPUVAR(SCRATCH) + * across the switch. + */ + /* update iret frame */ + movq CPUVAR(INTR_RSP),%rdx + movq $(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx) + movq TF_RIP(%rsp),%rax + movq %rax,IRETQ_RIP(%rdx) + movq TF_RFLAGS(%rsp),%rax + movq %rax,IRETQ_RFLAGS(%rdx) + movq TF_RSP(%rsp),%rax + movq %rax,IRETQ_RSP(%rdx) + movq $(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx) + /* finish with the trap frame */ + movq TF_RAX(%rsp),%rax + movq %rax,CPUVAR(SCRATCH) + movq TF_RCX(%rsp),%rcx + movq TF_R11(%rsp),%r11 + /* switch to the trampoline stack */ + xchgq %rdx,%rsp + movq TF_RDX(%rdx),%rdx + movq CPUVAR(USER_CR3),%rax + testq %rax,%rax + jz 1f + jmp iretq_tramp - INTR_RESTORE_SELECTORS +KUENTRY(iretq_tramp) + movq %rax,%cr3 +1: movq CPUVAR(SCRATCH),%rax + swapgs -5: movq TF_RDX(%rsp),%rdx + .globl _C_LABEL(doreti_iret) +_C_LABEL(doreti_iret): + iretq + +NENTRY(intr_exit_recurse) + movq TF_RDX(%rsp),%rdx movq TF_RCX(%rsp),%rcx movq TF_R11(%rsp),%r11 movq TF_RAX(%rsp),%rax @@ -662,9 +776,6 @@ NENTRY(intr_fast_exit) #endif /* !defined(GPROF) && defined(DDBPROF) */ addq $TF_RIP,%rsp - - .globl _C_LABEL(doreti_iret) -_C_LABEL(doreti_iret): iretq @@ -697,6 +808,33 @@ _C_LABEL(doreti_iret): addq $TF_RIP,%rsp iretq #endif /* !defined(GPROF) && defined(DDBPROF) */ + .text + +#ifdef DIAGNOSTIC +.Lintr_exit_not_blocked: + xchgw %bx, %bx + movl warn_once(%rip),%edi + testl %edi,%edi + jnz 1f + incl %edi + movl %edi,warn_once(%rip) + leaq .Lnot_blocked(%rip),%rdi + call _C_LABEL(printf) +#ifdef DDB + int $3 +#endif /* DDB */ +1: cli + jmp intr_fast_exit + + .data +.global warn_once +warn_once: + .long 0 + .section .rodata +.Lnot_blocked: + .asciz "WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n" + .text +#endif ENTRY(xrstor_user) movq %rsi, %rdx diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S index 50f0c7ecd82..53ef3672be5 100644 --- a/sys/arch/amd64/amd64/locore0.S +++ b/sys/arch/amd64/amd64/locore0.S @@ -1,4 +1,4 @@ -/* $OpenBSD: locore0.S,v 1.6 2018/01/07 19:56:19 mlarkin Exp $ */ +/* $OpenBSD: locore0.S,v 1.7 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $ */ /* @@ -205,26 +205,47 @@ bi_size_ok: movl $0, 12(%ebp) /* - * Determine if CPU is Intel. Intel CPUs cannot use PG_G (global - * pages) in kernel mappings. If CPU is not Intel, this is safe. - * Cache the result in pg_g_kern - 0 if not supported or PG_G (0x100) - * if supported. - * - * This treatment is required for the meltdown CVE mitigation. + * Determine if CPU has meltdown. Certain Intel CPUs do not properly + * respect page permissions when speculatively loading data into + * the cache ("Meltdown" CVE). These CPUs must utilize a secondary + * sanitized page table lacking kernel mappings when executing user + * processes, and may not use PG_G global PTEs for kernel VAs. 
*/ + movl $0x1, RELOC(cpu_meltdown) /* assume insecure at first */ + movl $0x0, RELOC(pg_g_kern) + cmpl $0x756e6547, %ebx # "Genu" - jne not_intel + jne .Lcpu_secure cmpl $0x6c65746e, %ecx # "ntel" - jne not_intel + jne .Lcpu_secure cmpl $0x49656e69, %edx # "ineI" - jne not_intel + jne .Lcpu_secure - jmp pg_g_check_finished + /* + * Intel CPU, now check if IA32_ARCH_CAPABILITIES is supported and + * if it says this CPU is safe. + */ + movl $0x0, %eax + cpuid + cmpl $0x7, %eax + jl .Lcpu_check_finished + + movl $0x7, %eax + cpuid + testl $SEFF0EDX_ARCH_CAP, %edx + jz .Lcpu_check_finished + + /* IA32_ARCH_CAPABILITIES MSR avaialble, use it to check CPU security */ + movl $MSR_ARCH_CAPABILITIES, %ecx + rdmsr + testl $ARCH_CAPABILITIES_RDCL_NO, %eax + jz .Lcpu_check_finished -not_intel: +.Lcpu_secure: + movl $0x0, RELOC(cpu_meltdown) movl $PG_G, RELOC(pg_g_kern) -pg_g_check_finished: +.Lcpu_check_finished: movl $1,%eax cpuid movl %eax,RELOC(cpu_id) diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c index a1d5e02f340..dd0623b15c4 100644 --- a/sys/arch/amd64/amd64/machdep.c +++ b/sys/arch/amd64/amd64/machdep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: machdep.c,v 1.238 2018/02/06 01:09:17 patrick Exp $ */ +/* $OpenBSD: machdep.c,v 1.239 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */ /*- @@ -90,7 +90,7 @@ #include -#include +#include #include #include #include @@ -141,6 +141,14 @@ extern int db_console; #include #endif +/* #define MACHDEP_DEBUG */ + +#ifdef MACHDEP_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* MACHDEP_DEBUG */ + /* the following is used externally (sysctl_hw) */ char machine[] = MACHINE; @@ -257,6 +265,7 @@ void cpu_init_extents(void); void map_tramps(void); void init_x86_64(paddr_t); void (*cpuresetfn)(void); +void enter_shared_special_pages(void); #ifdef APERTURE int allowaperture = 0; @@ -313,6 +322,65 @@ cpu_startup(void) #ifndef SMALL_KERNEL cpu_ucode_setup(); #endif + /* enter the IDT and trampoline code in the u-k maps */ + enter_shared_special_pages(); + + /* initialize CPU0's TSS and GDT and put them in the u-k maps */ + cpu_enter_pages(&cpu_info_full_primary); +} + +/* + * enter_shared_special_pages + * + * Requests mapping of various special pages required in the Intel Meltdown + * case (to be entered into the U-K page table): + * + * 1 IDT page + * Various number of pages covering the U-K ".kutext" section. This section + * contains code needed during trampoline operation + * Various number of pages covering the U-K ".kudata" section. This section + * contains data accessed by the trampoline, before switching to U+K + * (for example, various shared global variables used by IPIs, etc) + * + * The linker script places the required symbols in the sections above. + * + * On CPUs not affected by Meltdown, the calls to pmap_enter_special below + * become no-ops. 
+ */ +void +enter_shared_special_pages(void) +{ + extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[]; + extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[]; + vaddr_t va; + paddr_t pa; + + /* idt */ + pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ); + DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)idt_vaddr, (uint64_t)idt_paddr); + + /* .kutext section */ + va = (vaddr_t)__kutext_start; + pa = (paddr_t)__kernel_kutext_phys; + while (va < (vaddr_t)__kutext_end) { + pmap_enter_special(va, pa, PROT_READ | PROT_EXEC); + DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n", + __func__, (uint64_t)va, (uint64_t)pa); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } + + /* .kudata section */ + va = (vaddr_t)__kudata_start; + pa = (paddr_t)__kernel_kudata_phys; + while (va < (vaddr_t)__kudata_end) { + pmap_enter_special(va, pa, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n", + __func__, (uint64_t)va, (uint64_t)pa); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } } /* @@ -329,12 +397,6 @@ x86_64_proc0_tss_ldt_init(void) pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16; proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1; - /* an empty iomap, by setting its offset to the TSS limit */ - cpu_info_primary.ci_tss->tss_iobase = sizeof(struct x86_64_tss); - cpu_info_primary.ci_tss->tss_rsp0 = pcb->pcb_kstack; - cpu_info_primary.ci_tss->tss_ist[0] = - (u_int64_t)proc0.p_addr + PAGE_SIZE - 16; - ltr(GSYSSEL(GPROC0_SEL, SEL_KPL)); lldt(0); } @@ -346,15 +408,11 @@ x86_64_proc0_tss_ldt_init(void) #ifdef MULTIPROCESSOR void x86_64_init_pcb_tss_ldt(struct cpu_info *ci) -{ +{ struct pcb *pcb = ci->ci_idle_pcb; - ci->ci_tss->tss_iobase = sizeof(*ci->ci_tss); - ci->ci_tss->tss_rsp0 = pcb->pcb_kstack; - ci->ci_tss->tss_ist[0] = pcb->pcb_kstack - USPACE + PAGE_SIZE; - pcb->pcb_cr0 = rcr0(); -} +} #endif /* MULTIPROCESSOR */ bios_diskinfo_t * @@ -1551,8 +1609,6 @@ init_x86_64(paddr_t first_avail) pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE); - pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE, - PROT_READ | PROT_WRITE); #if defined(MULTIPROCESSOR) || \ (NACPI > 0 && !defined(SMALL_KERNEL)) @@ -1560,7 +1616,7 @@ init_x86_64(paddr_t first_avail) #endif idt = (struct gate_descriptor *)idt_vaddr; - cpu_info_primary.ci_tss = (void *)(idt + NIDT); + cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss; cpu_info_primary.ci_gdt = (void *)(cpu_info_primary.ci_tss + 1); /* make gdt gates and memory segments */ @@ -1585,9 +1641,10 @@ init_x86_64(paddr_t first_avail) /* exceptions */ for (x = 0; x < 32; x++) { - ist = (x == 8) ? 1 : 0; + /* trap2 == NMI, trap8 == double fault */ + ist = (x == 2) ? 2 : (x == 8) ? 1 : 0; setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT, - (x == 3 || x == 4) ? SEL_UPL : SEL_KPL, + (x == 3) ? SEL_UPL : SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); idt_allocmap[x] = 1; } diff --git a/sys/arch/amd64/amd64/pmap.c b/sys/arch/amd64/amd64/pmap.c index bb7ba397bbe..3e559206608 100644 --- a/sys/arch/amd64/amd64/pmap.c +++ b/sys/arch/amd64/amd64/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.108 2018/01/07 19:56:19 mlarkin Exp $ */ +/* $OpenBSD: pmap.c,v 1.109 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */ /* @@ -119,6 +119,15 @@ #include "acpi.h" +/* #define PMAP_DEBUG */ + +#ifdef PMAP_DEBUG +#define DPRINTF(x...) 
do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* PMAP_DEBUG */ + + /* * general info: * @@ -255,6 +264,7 @@ TAILQ_HEAD(pg_to_free, vm_page); struct pool pmap_pdp_pool; void pmap_pdp_ctor(pd_entry_t *); +void pmap_pdp_ctor_intel(pd_entry_t *); extern vaddr_t msgbuf_vaddr; extern paddr_t msgbuf_paddr; @@ -268,6 +278,8 @@ extern vaddr_t lo32_paddr; vaddr_t virtual_avail; extern int end; +extern uint32_t cpu_meltdown; + /* * local prototypes */ @@ -309,7 +321,6 @@ void pmap_tlb_shootwait(void); #define pmap_tlb_shootwait() #endif - /* * p m a p i n l i n e h e l p e r f u n c t i o n s */ @@ -323,7 +334,8 @@ static __inline boolean_t pmap_is_curpmap(struct pmap *pmap) { return((pmap == pmap_kernel()) || - (pmap->pm_pdirpa == (paddr_t) rcr3())); + (pmap->pm_pdirpa == (paddr_t) rcr3()) || + (pmap->pm_pdirpa_intel == (paddr_t) rcr3())); } /* @@ -484,7 +496,6 @@ pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs) return (0); } - /* * p m a p k e n t e r f u n c t i o n s * @@ -586,12 +597,12 @@ pmap_kremove(vaddr_t sva, vsize_t len) paddr_t pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) { - vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS; + vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS; struct pmap *kpm; int i; - unsigned long p1i; long ndmpdp; paddr_t dmpd, dmpdp; + vaddr_t kva, kva_end; /* * define the boundaries of the managed kernel virtual address @@ -643,9 +654,14 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) curpcb->pcb_pmap = kpm; /* proc0's pcb */ /* - * enable global TLB entries. + * Add PG_G attribute to already mapped kernel pages. pg_g_kern + * is calculated in locore0.S and may be set to: + * + * 0 if this CPU does not safely support global pages in the kernel + * (Intel/Meltdown) + * PG_G if this CPU does safely support global pages in the kernel + * (AMD) */ - /* add PG_G attribute to already mapped kernel pages */ #if KERNBASE == VM_MIN_KERNEL_ADDRESS for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; #else @@ -653,7 +669,7 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) for (kva = KERNBASE; kva < kva_end ; #endif kva += PAGE_SIZE) { - p1i = pl1_i(kva); + unsigned long p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pg_g_kern; } @@ -726,7 +742,7 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) LIST_INIT(&pmaps); /* - * initialize the pmap pool. + * initialize the pmap pools. */ pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0, @@ -742,6 +758,9 @@ pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_NONE, PR_WAITOK, "pdppl", NULL); + kpm->pm_pdir_intel = 0; + kpm->pm_pdirpa_intel = 0; + /* * ensure the TLB is sync'd with reality by flushing it... */ @@ -894,13 +913,21 @@ pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, unsigned long index; int level; vaddr_t invaladdr; - pd_entry_t opde; + pd_entry_t opde, *mdpml4es; level = 1; do { pmap_freepage(pmap, ptp, level, pagelist); index = pl_i(va, level + 1); opde = pmap_pte_set(&pdes[level - 1][index], 0); + if (level == 3 && pmap->pm_pdir_intel) { + /* Zap special meltdown PML4e */ + mdpml4es = (pd_entry_t *)pmap->pm_pdir_intel; + opde = pmap_pte_set(&mdpml4es[index], 0); + DPRINTF("%s: cleared meltdown PML4e @ index %lu " + "(va range start 0x%llx)\n", __func__, index, + (uint64_t)(index << L4_SHIFT)); + } invaladdr = level == 1 ? 
(vaddr_t)ptes : (vaddr_t)pdes[level - 2]; pmap_tlb_shootpage(curpcb->pcb_pmap, @@ -934,7 +961,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes) struct vm_page *ptp, *pptp; int i; unsigned long index; - pd_entry_t *pva; + pd_entry_t *pva, *pva_intel; paddr_t ppa, pa; struct uvm_object *obj; @@ -973,6 +1000,20 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes) pmap->pm_ptphint[i - 2] = ptp; pa = VM_PAGE_TO_PHYS(ptp); pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V); + + /* + * Meltdown Special case - if we are adding a new PML4e for + * usermode addresses, just copy the PML4e to the U-K page + * table. + */ + if (pmap->pm_pdir_intel && i == 4 && va < VM_MAXUSER_ADDRESS) { + pva_intel = (pd_entry_t *)pmap->pm_pdir_intel; + pva_intel[index] = pva[index]; + DPRINTF("%s: copying usermode PML4e (content=0x%llx) " + "from 0x%llx -> 0x%llx\n", __func__, pva[index], + (uint64_t)&pva[index], (uint64_t)&pva_intel[index]); + } + pmap->pm_stats.resident_count++; /* * If we're not in the top level, increase the @@ -1048,6 +1089,15 @@ pmap_pdp_ctor(pd_entry_t *pdir) #endif } +void +pmap_pdp_ctor_intel(pd_entry_t *pdir) +{ + struct pmap *kpm = pmap_kernel(); + + /* Copy PML4es from pmap_kernel's U-K view */ + memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE); +} + /* * pmap_create: create a pmap * @@ -1088,6 +1138,22 @@ pmap_create(void) pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME; + /* + * Intel CPUs need a special page table to be used during usermode + * execution, one that lacks all kernel mappings. + */ + if (cpu_meltdown) { + pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK); + pmap_pdp_ctor_intel(pmap->pm_pdir_intel); + if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel, + &pmap->pm_pdirpa_intel)) + panic("%s: unknown PA mapping for meltdown PML4\n", + __func__); + } else { + pmap->pm_pdir_intel = 0; + pmap->pm_pdirpa_intel = 0; + } + LIST_INSERT_HEAD(&pmaps, pmap, pm_list); return (pmap); } @@ -1145,6 +1211,9 @@ pmap_destroy(struct pmap *pmap) /* XXX: need to flush it out of other processor's space? */ pool_put(&pmap_pdp_pool, pmap->pm_pdir); + if (pmap->pm_pdir_intel) + pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel); + pool_put(&pmap_pmap_pool, pmap); } @@ -1959,6 +2028,137 @@ pmap_collect(struct pmap *pmap) * defined as macro in pmap.h */ +void +pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot) +{ + uint64_t l4idx, l3idx, l2idx, l1idx; + pd_entry_t *pd, *ptp; + paddr_t npa; + struct pmap *pmap = pmap_kernel(); + + /* If CPU is secure, no need to do anything */ + if (!cpu_meltdown) + return; + + /* Must be kernel VA */ + if (va < VM_MIN_KERNEL_ADDRESS) + panic("%s: invalid special mapping va 0x%lx requested", + __func__, va); + + if (!pmap->pm_pdir_intel) + pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, + PR_WAITOK | PR_ZERO); + + l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */ + l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ + l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */ + l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */ + + DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld " + "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va, + (uint64_t)pa, l4idx, l3idx, l2idx, l1idx); + + /* Start at PML4 / top level */ + pd = (pd_entry_t *)pmap->pm_pdir_intel; + + if (!pd) + panic("%s: PML4 not initialized for pmap @ %p\n", __func__, + pmap); + + /* npa = physaddr of PDPT */ + npa = pd[l4idx] & PMAP_PA_MASK; + + /* Valid PML4e for the 512GB region containing va? 
*/ + if (!npa) { + /* No valid PML4E - allocate PDPT page and set PML4E */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PDPT page\n", __func__); + + pd[l4idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PDPT page at phys 0x%llx, " + "setting PML4e[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l4idx, pd[l4idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PDPT @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + /* npa = physaddr of PD page */ + npa = pd[l3idx] & PMAP_PA_MASK; + + /* Valid PDPTe for the 1GB region containing va? */ + if (!npa) { + /* No valid PDPTe - allocate PD page and set PDPTe */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PD page\n", __func__); + + pd[l3idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PD page at phys 0x%llx, " + "setting PDPTe[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l3idx, pd[l3idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PD page @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + /* npa = physaddr of PT page */ + npa = pd[l2idx] & PMAP_PA_MASK; + + /* Valid PDE for the 2MB region containing va? */ + if (!npa) { + /* No valid PDE - allocate PT page and set PDE */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PT page\n", __func__); + + pd[l2idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PT page at phys 0x%llx, " + "setting PDE[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l2idx, pd[l2idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PT page @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot " + "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd, (uint64_t)prot, (uint64_t)pd[l1idx]); + + pd[l1idx] = pa | protection_codes[prot] | PG_V | pg_g_kern | PG_W; + DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]); + + if (pg_g_kern) { + /* now set the PG_G flag on the corresponding U+K entry */ + pt_entry_t *ptes; + int level, offs; + + level = pmap_find_pte_direct(pmap, va, &ptes, &offs); + if (__predict_true(level == 0 && + pmap_valid_entry(ptes[offs]))) { + ptes[offs] |= pg_g_kern; + } else { + DPRINTF("%s: no U+K mapping for special mapping?\n", + __func__); + } + } +} + /* * pmap_enter: enter a mapping into a pmap * @@ -2439,10 +2639,10 @@ pmap_convert(struct pmap *pmap, int mode) * release the lock if we get an interrupt in a bad moment. 
*/ -volatile long tlb_shoot_wait; +volatile long tlb_shoot_wait __attribute__((section(".kudata"))); -volatile vaddr_t tlb_shoot_addr1; -volatile vaddr_t tlb_shoot_addr2; +volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata"))); +volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata"))); void pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself) diff --git a/sys/arch/amd64/amd64/spl.S b/sys/arch/amd64/amd64/spl.S index c4b6fe697b6..2ea315f2fb5 100644 --- a/sys/arch/amd64/amd64/spl.S +++ b/sys/arch/amd64/amd64/spl.S @@ -1,4 +1,4 @@ -/* $OpenBSD: spl.S,v 1.11 2016/05/20 14:37:53 deraadt Exp $ */ +/* $OpenBSD: spl.S,v 1.12 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: spl.S,v 1.3 2004/06/28 09:13:11 fvdl Exp $ */ /* @@ -114,7 +114,7 @@ _C_LABEL(splx): * a lower-prio one first, which needs to take the kernel lock --> * the sending CPU will never see the that CPU accept the IPI */ -IDTVEC(spllower) +KIDTVEC(spllower) _PROF_PROLOGUE pushq %rbx pushq %r13 @@ -143,7 +143,7 @@ IDTVEC(spllower) * ebx - cpl to restore * r13 - address to resume loop at */ -IDTVEC(doreti) +KIDTVEC(doreti) popq %rbx # get previous priority decl CPUVAR(IDEPTH) leaq 1f(%rip),%r13 @@ -168,4 +168,8 @@ IDTVEC(doreti) call _C_LABEL(ast) cli jmp 5b -3: INTRFASTEXIT +3: +#ifdef DIAGNOSTIC + movl $254,%esi +#endif /* DIAGNOSTIC */ + INTRFASTEXIT diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c index 47b3bee5128..dc2d115c207 100644 --- a/sys/arch/amd64/amd64/trap.c +++ b/sys/arch/amd64/amd64/trap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: trap.c,v 1.63 2018/01/05 11:10:25 pirofti Exp $ */ +/* $OpenBSD: trap.c,v 1.64 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: trap.c,v 1.2 2003/05/04 23:51:56 fvdl Exp $ */ /*- @@ -212,6 +212,18 @@ trap(struct trapframe *frame) frame->tf_rip = (u_int64_t)xrstor_resume; return; } + + /* + * Check for failure during return to user mode. + * We do this by looking at the address of the + * instruction that faulted. + */ + if (frame->tf_rip == (u_int64_t)doreti_iret) { + frame->tf_rip = (u_int64_t)resume_iret; + return; + } + /* FALLTHROUGH */ + case T_SEGNPFLT: case T_ALIGNFLT: case T_TSSFLT: @@ -223,16 +235,6 @@ copyfault: frame->tf_rip = (u_int64_t)pcb->pcb_onfault; return; } - - /* - * Check for failure during return to user mode. - * We do this by looking at the address of the - * instruction that faulted. 
- */ - if (frame->tf_rip == (u_int64_t)doreti_iret) { - frame->tf_rip = (u_int64_t)resume_iret; - return; - } goto we_re_toast; case T_PROTFLT|T_USER: /* protection fault */ @@ -459,8 +461,12 @@ out: static void frame_dump(struct trapframe *tf) { - printf("rip %p rsp %p rfl %p\n", - (void *)tf->tf_rip, (void *)tf->tf_rsp, (void *)tf->tf_rflags); + printf("rip %p cs 0x%x rfl %p rsp %p ss 0x%x\n", + (void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff, + (void *)tf->tf_rflags, + (void *)tf->tf_rsp, (unsigned)tf->tf_ss & 0xffff); + printf("err 0x%llx trapno 0x%llx\n", + tf->tf_err, tf->tf_trapno); printf("rdi %p rsi %p rdx %p\n", (void *)tf->tf_rdi, (void *)tf->tf_rsi, (void *)tf->tf_rdx); printf("rcx %p r8 %p r9 %p\n", diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S index 730220af132..5de23fe67ab 100644 --- a/sys/arch/amd64/amd64/vector.S +++ b/sys/arch/amd64/amd64/vector.S @@ -1,4 +1,4 @@ -/* $OpenBSD: vector.S,v 1.51 2017/10/04 02:10:33 guenther Exp $ */ +/* $OpenBSD: vector.S,v 1.52 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: vector.S,v 1.5 2004/06/28 09:13:11 fvdl Exp $ */ /* @@ -104,36 +104,97 @@ #define TRAP(a) pushq $(a) ; jmp _C_LABEL(alltraps) #define ZTRAP(a) pushq $0 ; TRAP(a) - .text IDTVEC(trap00) ZTRAP(T_DIVIDE) IDTVEC(trap01) ZTRAP(T_TRCTRAP) + +/* + * NMIs can happen at any time, so there's no simple way to tell + * which GS.base is in place at the time of the interrupt. Instead, + * borrow a couple ideas from FreeBSD and put the CPU's kernel + * GS.base in the memory right above the stack, storing the current + * one in a pair of callee-saved registers (%r12/13). We save the + * current %cr3 in a callee-saved register too (%r15). + * Note: we don't unblock interrupts because a nested normal interrupt + * would also reenable NMIs. + */ IDTVEC(trap02) - ZTRAP(T_NMI) + pushq $0 + pushq $T_NMI +calltrap_specstk: # special stack path + INTR_REENTRY + movl $MSR_FSBASE,%ecx # save current GS.base... 
+ rdmsr + movq %rax,%r12 # ...in %r12 and %r13 + movq %rdx,%r13 + movq FRAMESIZE(%rsp),%rax # get kernel GS.base + movq %rax,%rdx + shrq $32,%rdx + wrmsr # switch to it + movq %cr3,%r15 # save current %cr3 in %r15 + movq CPUVAR(KERN_CR3),%rax # switch to kernel page tables + testq %rax,%rax + jz INTRENTRY_LABEL(calltrap_specstk) + movq %rax,%cr3 + jmp INTRENTRY_LABEL(calltrap_specstk) + .text + .globl INTRENTRY_LABEL(calltrap_specstk) +INTRENTRY_LABEL(calltrap_specstk): + cld + SMAP_CLAC + movq %rsp,%rdi + call trap + movl $MSR_FSBASE,%ecx # restore GS.base + movq %r12,%rax + movq %r13,%rdx + wrmsr + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %r8 + popq %r9 + popq %r10 + popq %r11 + popq %r12 + popq %r13 + popq %r14 + jmp calltrap_specstk_tramp +KUENTRY(calltrap_specstk_tramp) + movq %r15,%cr3 # restore %cr3 + popq %r15 + popq %rbp + popq %rbx + popq %rax + addq $48,%rsp # ignored TF_[DEFG]S + iretq + IDTVEC(trap03) ZTRAP(T_BPTFLT) IDTVEC(trap04) - ZTRAP(T_OFLOW) + ZTRAP(T_OFLOW) # impossible: INTO instruction invalid in amd64 IDTVEC(trap05) - ZTRAP(T_BOUND) + ZTRAP(T_BOUND) # impossible: BOUND instruction invalid in amd64 IDTVEC(trap06) ZTRAP(T_PRIVINFLT) IDTVEC(trap07) pushq $0 # dummy error code pushq $T_DNA - INTRENTRY + INTRENTRY(trap07) sti cld SMAP_CLAC movq CPUVAR(SELF),%rdi movq %rsp, %rsi call _C_LABEL(fpudna) + cli INTRFASTEXIT IDTVEC(trap08) - TRAP(T_DOUBLEFLT) + pushq $T_DOUBLEFLT + jmp calltrap_specstk IDTVEC(trap09) - ZTRAP(T_FPOPFLT) + ZTRAP(T_FPOPFLT) # impossible: not generated on amd64 IDTVEC(trap0a) TRAP(T_TSSFLT) IDTVEC(trap0b) @@ -149,30 +210,49 @@ IDTVEC(trap0c) * so that we can do the necessary swapgs in that case. */ IDTVEC(trap0d) - subq $TF_ERR,%rsp - movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rdi,TF_RDI(%rsp) - leaq _C_LABEL(doreti_iret)(%rip),%rdi - cmpq %rdi,TF_RIP(%rsp) + pushq %rcx + leaq _C_LABEL(doreti_iret)(%rip),%rcx + cmpq %rcx,16(%rsp) /* over %rcx and err to %rip */ + popq %rcx je 1f - testq $SEL_RPL,TF_CS(%rsp) - jz 2f + testq $SEL_RPL,16(%rsp) /* over err and %rip to %cs */ + je INTRENTRY_LABEL(trap0d) 1: swapgs -2: movq %r15,TF_R15(%rsp) - movq %r14,TF_R14(%rsp) - movq %r13,TF_R13(%rsp) - movq %r12,TF_R12(%rsp) - movq %r11,TF_R11(%rsp) - movq %r10,TF_R10(%rsp) - movq %r9,TF_R9(%rsp) - movq %r8,TF_R8(%rsp) - /*movq %rdi,TF_RDI(%rsp) done above */ - movq %rsi,TF_RSI(%rsp) - movq %rbp,TF_RBP(%rsp) - movq %rbx,TF_RBX(%rsp) - movq %rdx,TF_RDX(%rsp) + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(KERN_CR3),%rax + testq %rax,%rax + jz 98f + movq %rax,%cr3 + jmp 98f + .text + .globl INTRENTRY_LABEL(trap0d) +INTRENTRY_LABEL(trap0d): /* from kernel */ + pushq $T_PROTFLT + subq $152,%rsp movq %rcx,TF_RCX(%rsp) - movq %rax,TF_RAX(%rsp) + jmp 99f +98: /* from userspace */ + movq CPUVAR(KERN_RSP),%rax + xchgq %rax,%rsp + movq %rcx,TF_RCX(%rsp) + /* set trapno in the trap frame */ + movq $T_PROTFLT,TF_TRAPNO(%rsp) + /* copy err and iretq frame to the trap frame */ + movq 0(%rax),%rcx + movq %rcx,TF_ERR(%rsp) + add $8,%rax + movq IRETQ_RIP(%rax),%rcx + movq %rcx,TF_RIP(%rsp) + movq IRETQ_CS(%rax),%rcx + movq %rcx,TF_CS(%rsp) + movq IRETQ_RFLAGS(%rax),%rcx + movq %rcx,TF_RFLAGS(%rsp) + movq IRETQ_RSP(%rax),%rcx + movq %rcx,TF_RSP(%rsp) + movq IRETQ_SS(%rax),%rcx + movq %rcx,TF_SS(%rsp) + movq CPUVAR(SCRATCH),%rax +99: INTR_SAVE_MOST_GPRS_NO_ADJ sti jmp calltrap @@ -204,7 +284,9 @@ IDTVEC(trap1f) /* 20 - 31 reserved for future exp */ ZTRAP(T_RESERVED) -IDTVEC(exceptions) + .section .rodata + .globl Xexceptions +Xexceptions: .quad _C_LABEL(Xtrap00), 
_C_LABEL(Xtrap01) .quad _C_LABEL(Xtrap02), _C_LABEL(Xtrap03) .quad _C_LABEL(Xtrap04), _C_LABEL(Xtrap05) @@ -232,19 +314,44 @@ IDTVEC(exceptions) * protection fault. This will cause the process to get a SIGBUS. */ NENTRY(resume_iret) - pushq $0 - pushq $T_PROTFLT - subq $32,%rsp - INTR_SAVE_GPRS + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(KERN_CR3),%rax + testq %rax,%rax + jz INTRENTRY_LABEL(iret) + movq %rax,%cr3 + jmp INTRENTRY_LABEL(iret) + .text + .globl INTRENTRY_LABEL(iret) +INTRENTRY_LABEL(iret): /* from kernel */ + movq CPUVAR(KERN_RSP),%rax + xchgq %rax,%rsp + movq %rcx,TF_RCX(%rsp) + /* set trapno+err in the trap frame */ + movq $T_PROTFLT,TF_TRAPNO(%rsp) + movq $0,TF_ERR(%rsp) + /* copy iretq frame to the trap frame */ + movq IRETQ_RIP(%rax),%rcx + movq %rcx,TF_RIP(%rsp) + movq IRETQ_CS(%rax),%rcx + movq %rcx,TF_CS(%rsp) + movq IRETQ_RFLAGS(%rax),%rcx + movq %rcx,TF_RFLAGS(%rsp) + movq IRETQ_RSP(%rax),%rcx + movq %rcx,TF_RSP(%rsp) + movq IRETQ_SS(%rax),%rcx + movq %rcx,TF_SS(%rsp) + movq CPUVAR(SCRATCH),%rax + INTR_SAVE_MOST_GPRS_NO_ADJ sti jmp calltrap + /* * All traps go through here. Call the generic trap handler, and * check for ASTs afterwards. */ -NENTRY(alltraps) - INTRENTRY +KUENTRY(alltraps) + INTRENTRY(alltraps) sti calltrap: cld @@ -329,6 +436,7 @@ spl_lowered: /* XXX See comment in locore.s */ #define XINTR(name,num) Xintr_##name##num + KUTEXT .globl _C_LABEL(x2apic_eoi) _C_LABEL(x2apic_eoi): pushq %rax @@ -345,23 +453,23 @@ _C_LABEL(x2apic_eoi): #if NLAPIC > 0 #ifdef MULTIPROCESSOR -IDTVEC(recurse_lapic_ipi) +KIDTVEC(recurse_lapic_ipi) INTR_RECURSE_HWFRAME - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_lapic_ipi) - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_lapic_ipi) CODEPATCH_START movl $0,_C_LABEL(local_apic)+LAPIC_EOI CODEPATCH_END(CPTAG_EOI) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_IPI,%ebx jae 2f -IDTVEC(resume_lapic_ipi) +KIDTVEC(resume_lapic_ipi) 1: incl CPUVAR(IDEPTH) movl $IPL_IPI,CPUVAR(ILEVEL) @@ -425,27 +533,27 @@ IDTVEC(ipi_invlrange) iretq #endif /* MULTIPROCESSOR */ - + /* * Interrupt from the local APIC timer. */ -IDTVEC(recurse_lapic_ltimer) +KIDTVEC(recurse_lapic_ltimer) INTR_RECURSE_HWFRAME - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_lapic_ltimer) - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_lapic_ltimer) CODEPATCH_START movl $0,_C_LABEL(local_apic)+LAPIC_EOI CODEPATCH_END(CPTAG_EOI) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_CLOCK,%ebx jae 2f -IDTVEC(resume_lapic_ltimer) +KIDTVEC(resume_lapic_ltimer) 1: incl CPUVAR(IDEPTH) movl $IPL_CLOCK,CPUVAR(ILEVEL) @@ -466,21 +574,21 @@ IDTVEC(resume_lapic_ltimer) * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ -IDTVEC(recurse_xen_upcall) +KIDTVEC(recurse_xen_upcall) INTR_RECURSE_HWFRAME pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_xen_upcall) pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_xen_upcall) call _C_LABEL(xen_intr_ack) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_NET,%ebx jae 2f -IDTVEC(resume_xen_upcall) +KIDTVEC(resume_xen_upcall) 1: incl CPUVAR(IDEPTH) movl $IPL_NET,CPUVAR(ILEVEL) @@ -502,20 +610,20 @@ IDTVEC(resume_xen_upcall) * Hyperv event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. 
*/ -IDTVEC(recurse_hyperv_upcall) +KIDTVEC(recurse_hyperv_upcall) INTR_RECURSE_HWFRAME pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_hyperv_upcall) pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_hyperv_upcall) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_NET,%ebx jae 2f -IDTVEC(resume_hyperv_upcall) +KIDTVEC(resume_hyperv_upcall) 1: incl CPUVAR(IDEPTH) movl $IPL_NET,CPUVAR(ILEVEL) @@ -542,11 +650,11 @@ IDTVEC(resume_hyperv_upcall) */ #define INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \ -IDTVEC(recurse_##name##num) ;\ +KIDTVEC(recurse_##name##num) ;\ INTR_RECURSE_HWFRAME ;\ subq $16,%rsp /* space for __if_{trapno,err} */;\ - INTRENTRY ;\ -IDTVEC(resume_##name##num) \ + INTR_REENTRY ;\ +KIDTVEC(resume_##name##num) \ movq $IREENT_MAGIC,TF_ERR(%rsp) ;\ movl %ebx,%r13d ;\ movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\ @@ -555,7 +663,7 @@ IDTVEC(resume_##name##num) \ IDTVEC(intr_##name##num) ;\ pushq $0 /* dummy error code */ ;\ subq $8,%rsp /* unused __if_trapno */ ;\ - INTRENTRY ;\ + INTRENTRY(intr_##name##num) ;\ movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\ mask(num) /* mask it in hardware */ ;\ early_ack(num) /* and allow other intrs */ ;\ @@ -1094,8 +1202,7 @@ _C_LABEL(ioapic_level_stubs): /* * Soft interrupt handlers */ - .text -IDTVEC(softtty) +KIDTVEC(softtty) movl $IPL_SOFTTTY, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) @@ -1104,7 +1211,7 @@ IDTVEC(softtty) decl CPUVAR(IDEPTH) jmp *%r13 -IDTVEC(softnet) +KIDTVEC(softnet) movl $IPL_SOFTNET, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) @@ -1113,7 +1220,7 @@ IDTVEC(softnet) decl CPUVAR(IDEPTH) jmp *%r13 -IDTVEC(softclock) +KIDTVEC(softclock) movl $IPL_SOFTCLOCK, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) diff --git a/sys/arch/amd64/conf/ld.script b/sys/arch/amd64/conf/ld.script index 4d74b3eb8e3..9c60d69f2c8 100644 --- a/sys/arch/amd64/conf/ld.script +++ b/sys/arch/amd64/conf/ld.script @@ -1,4 +1,4 @@ -/* $OpenBSD: ld.script,v 1.8 2017/10/24 20:06:54 guenther Exp $ */ +/* $OpenBSD: ld.script,v 1.9 2018/02/21 19:24:15 guenther Exp $ */ /* * Copyright (c) 2009 Tobias Weingartner @@ -52,6 +52,15 @@ SECTIONS *(.text .text.*) } :text =0xcccccccc + . = ALIGN(__ALIGN_SIZE); + __kernel_kutext_phys = (. - __kernel_virt_base) + 0x1000000; + .kutext : AT (__kernel_kutext_phys) + { + __kutext_start = ABSOLUTE(.); + *(.kutext) + __kutext_end = ABSOLUTE(.); + } :text =0xcccccccc + PROVIDE (etext = .); _etext = .; @@ -84,6 +93,17 @@ SECTIONS __data_start = ABSOLUTE(.); *(.data .data.*) } :data =0xcccccccc + . = ALIGN(0x1000); + + . = ALIGN(__ALIGN_SIZE); + __kernel_kudata_phys = (. - __kernel_virt_base) + 0x1000000; + .kudata : AT (__kernel_kudata_phys) + { + __kudata_start = ABSOLUTE(.); + *(.kudata) + __kudata_end = ABSOLUTE(.); + } :data =0xcccccccc + . = ALIGN(0x1000); PROVIDE (edata = .); _edata = .; diff --git a/sys/arch/amd64/include/asm.h b/sys/arch/amd64/include/asm.h index f64e5338f07..cd3922b4b26 100644 --- a/sys/arch/amd64/include/asm.h +++ b/sys/arch/amd64/include/asm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: asm.h,v 1.8 2017/06/29 17:36:16 deraadt Exp $ */ +/* $OpenBSD: asm.h,v 1.9 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: asm.h,v 1.2 2003/05/02 18:05:47 yamt Exp $ */ /*- @@ -68,14 +68,19 @@ .text; _ALIGN_TEXT; .globl x; .type x,@function; x: #ifdef _KERNEL +#define KUTEXT .section .kutext, "ax" +/*#define KUTEXT .text */ + /* XXX Can't use __CONCAT() here, as it would be evaluated incorrectly. 
*/ -#ifdef __STDC__ #define IDTVEC(name) \ - .text; ALIGN_TEXT; .globl X ## name; .type X ## name,@function; X ## name: -#else -#define IDTVEC(name) \ - .text; ALIGN_TEXT; .globl X/**/name; .type X/**/name,@function; X/**/name: -#endif /* __STDC__ */ + KUTEXT; ALIGN_TEXT; \ + .globl X ## name; .type X ## name,@function; X ## name: +#define KIDTVEC(name) \ + .text; ALIGN_TEXT; \ + .globl X ## name; .type X ## name,@function; X ## name: +#define KUENTRY(x) \ + KUTEXT; _ALIGN_TEXT; .globl x; .type x,@function; x: + #endif /* _KERNEL */ #ifdef __STDC__ diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h index 59f99ebdc8a..8f973ba1423 100644 --- a/sys/arch/amd64/include/cpu.h +++ b/sys/arch/amd64/include/cpu.h @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.h,v 1.118 2018/01/07 01:08:20 mlarkin Exp $ */ +/* $OpenBSD: cpu.h,v 1.119 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */ /*- @@ -43,7 +43,7 @@ */ #ifdef _KERNEL #include -#include +#include /* USERMODE */ #include #include #endif /* _KERNEL */ @@ -89,6 +89,17 @@ union vmm_cpu_cap { struct x86_64_tss; struct cpu_info { + /* + * The beginning of this structure in mapped in the userspace "u-k" + * page tables, so that these first couple members can be accessed + * from the trampoline code. The ci_PAGEALIGN member defines where + * the part that is *not* visible begins, so don't put anything + * above it that must be kept hidden from userspace! + */ + u_int64_t ci_kern_cr3; /* U+K page table */ + u_int64_t ci_scratch; /* for U<-->K transition */ + +#define ci_PAGEALIGN ci_dev struct device *ci_dev; struct cpu_info *ci_self; struct schedstate_percpu ci_schedstate; /* scheduler state */ @@ -100,7 +111,9 @@ struct cpu_info { u_int ci_acpi_proc_id; u_int32_t ci_randseed; - u_int64_t ci_scratch; + u_int64_t ci_kern_rsp; /* kernel-only stack */ + u_int64_t ci_intr_rsp; /* U<-->K trampoline stack */ + u_int64_t ci_user_cr3; /* U-K page table */ struct proc *ci_fpcurproc; struct proc *ci_fpsaveproc; @@ -216,7 +229,10 @@ struct cpu_info { #define PROC_PC(p) ((p)->p_md.md_regs->tf_rip) #define PROC_STACK(p) ((p)->p_md.md_regs->tf_rsp) -extern struct cpu_info cpu_info_primary; +struct cpu_info_full; +extern struct cpu_info_full cpu_info_full_primary; +#define cpu_info_primary (*(struct cpu_info *)((char *)&cpu_info_full_primary + 4096*2 - offsetof(struct cpu_info, ci_PAGEALIGN))) + extern struct cpu_info *cpu_info_list; #define CPU_INFO_ITERATOR int @@ -241,7 +257,8 @@ extern void need_resched(struct cpu_info *); #define CPU_START_CLEANUP(_ci) ((_ci)->ci_func->cleanup(_ci)) #define curcpu() ({struct cpu_info *__ci; \ - asm volatile("movq %%gs:8,%0" : "=r" (__ci)); \ + asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) \ + :"n" (offsetof(struct cpu_info, ci_self))); \ __ci;}) #define cpu_number() (curcpu()->ci_cpuid) @@ -262,8 +279,6 @@ void cpu_unidle(struct cpu_info *); #define MAXCPUS 1 #ifdef _KERNEL -extern struct cpu_info cpu_info_primary; - #define curcpu() (&cpu_info_primary) #define cpu_kick(ci) diff --git a/sys/arch/amd64/include/cpu_full.h b/sys/arch/amd64/include/cpu_full.h new file mode 100644 index 00000000000..995cab087cf --- /dev/null +++ b/sys/arch/amd64/include/cpu_full.h @@ -0,0 +1,66 @@ +/* $OpenBSD: cpu_full.h,v 1.1 2018/02/21 19:24:15 guenther Exp $ */ +/* + * Copyright (c) Philip Guenther + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission 
notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _MACHINE_CPU_FULL_H_ +#define _MACHINE_CPU_FULL_H_ + +#include /* offsetof, PAGE_SIZE */ +#include +#include + +/* + * The layout of the full per-CPU information, including TSS, GDT, + * trampoline stacks, and cpu_info described in + */ +struct cpu_info_full { + /* page mapped kRO in u-k */ + union { + struct x86_64_tss u_tss; /* followed by gdt */ + char u_align[PAGE_SIZE]; + } cif_RO; +#define cif_tss cif_RO.u_tss + + /* start of page mapped kRW in u-k */ + uint64_t cif_tramp_stack[(PAGE_SIZE / 4 + - offsetof(struct cpu_info, ci_PAGEALIGN)) / sizeof(uint64_t)]; + uint64_t cif_dblflt_stack[(PAGE_SIZE / 4) / sizeof(uint64_t)]; + uint64_t cif_nmi_stack[(2 * PAGE_SIZE / 4) / sizeof(uint64_t)]; + + /* + * Beginning of this hangs over into the kRW page; rest is + * unmapped in u-k + */ + struct cpu_info cif_cpu; +} __aligned(PAGE_SIZE); + +/* tss, align shim, and gdt must fit in a page */ +CTASSERT(_ALIGN(sizeof(struct x86_64_tss)) + + sizeof(struct mem_segment_descriptor) * (NGDT_MEM + 2*NGDT_SYS) + < PAGE_SIZE); + +/* verify expected alignment */ +CTASSERT(offsetof(struct cpu_info_full, cif_cpu.ci_PAGEALIGN) % PAGE_SIZE == 0); + +/* verify total size is multiple of page size */ +CTASSERT(sizeof(struct cpu_info_full) % PAGE_SIZE == 0); + +extern struct cpu_info_full cpu_info_full_primary; + +/* Now make sure the cpu_info_primary macro is correct */ +CTASSERT(&cpu_info_primary == &cpu_info_full_primary.cif_cpu); + +#endif /* _MACHINE_CPU_FULL_H_ */ diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h index b52e4b3d2ae..ed8c6ba8905 100644 --- a/sys/arch/amd64/include/cpufunc.h +++ b/sys/arch/amd64/include/cpufunc.h @@ -1,4 +1,4 @@ -/* $OpenBSD: cpufunc.h,v 1.23 2018/02/06 01:09:17 patrick Exp $ */ +/* $OpenBSD: cpufunc.h,v 1.24 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: cpufunc.h,v 1.3 2003/05/08 10:27:43 fvdl Exp $ */ /*- @@ -317,6 +317,9 @@ void amd64_errata(struct cpu_info *); void cpu_ucode_setup(void); void cpu_ucode_apply(struct cpu_info *); +struct cpu_info_full; +void cpu_enter_pages(struct cpu_info_full *); + #endif /* _KERNEL */ #endif /* !_MACHINE_CPUFUNC_H_ */ diff --git a/sys/arch/amd64/include/frame.h b/sys/arch/amd64/include/frame.h index e71d4093274..997adbf570c 100644 --- a/sys/arch/amd64/include/frame.h +++ b/sys/arch/amd64/include/frame.h @@ -1,4 +1,4 @@ -/* $OpenBSD: frame.h,v 1.6 2016/02/26 09:29:20 mpi Exp $ */ +/* $OpenBSD: frame.h,v 1.7 2018/02/21 19:24:15 guenther Exp $ */ /* $NetBSD: frame.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $ */ /*- @@ -147,6 +147,20 @@ struct intrframe { int64_t if_ss; }; + +/* + * The trampoline frame used on the kernel stack page which is present + * but kernel-only, in the page tables used when in userspace. This is + * the minimum for iretq operation. 
diff --git a/sys/arch/amd64/include/frame.h b/sys/arch/amd64/include/frame.h
index e71d4093274..997adbf570c 100644
--- a/sys/arch/amd64/include/frame.h
+++ b/sys/arch/amd64/include/frame.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: frame.h,v 1.6 2016/02/26 09:29:20 mpi Exp $	*/
+/*	$OpenBSD: frame.h,v 1.7 2018/02/21 19:24:15 guenther Exp $	*/
 /*	$NetBSD: frame.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $	*/
 
 /*-
@@ -147,6 +147,20 @@ struct intrframe {
 	int64_t	if_ss;
 };
+
+/*
+ * The trampoline frame used on the kernel stack page which is present
+ * but kernel-only, in the page tables used when in userspace.  This is
+ * the minimum for iretq operation.
+ */
+struct iretq_frame {
+	int64_t	iretq_rip;
+	int64_t	iretq_cs;
+	int64_t	iretq_rflags;
+	int64_t	iretq_rsp;
+	int64_t	iretq_ss;
+};
+
 
 /*
  * Stack frame inside cpu_switch()
  */
diff --git a/sys/arch/amd64/include/frameasm.h b/sys/arch/amd64/include/frameasm.h
index 88309d1dd4f..5e384acb9dc 100644
--- a/sys/arch/amd64/include/frameasm.h
+++ b/sys/arch/amd64/include/frameasm.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: frameasm.h,v 1.11 2018/01/06 22:03:12 guenther Exp $	*/
+/*	$OpenBSD: frameasm.h,v 1.12 2018/02/21 19:24:15 guenther Exp $	*/
 /*	$NetBSD: frameasm.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $	*/
 
 #ifndef _AMD64_MACHINE_FRAMEASM_H
@@ -13,7 +13,10 @@
  * These are used on interrupt or trap entry or exit.
  */
 #define INTR_SAVE_GPRS \
-	subq	$120,%rsp		; \
+	subq	$120,%rsp		; \
+	INTR_SAVE_MOST_GPRS_NO_ADJ	; \
+	movq	%rcx,TF_RCX(%rsp)
+#define INTR_SAVE_MOST_GPRS_NO_ADJ \
 	movq	%r15,TF_R15(%rsp)	; \
 	movq	%r14,TF_R14(%rsp)	; \
 	movq	%r13,TF_R13(%rsp)	; \
@@ -27,15 +30,54 @@
 	movq	%rbp,TF_RBP(%rsp)	; \
 	movq	%rbx,TF_RBX(%rsp)	; \
 	movq	%rdx,TF_RDX(%rsp)	; \
-	movq	%rcx,TF_RCX(%rsp)	; \
 	movq	%rax,TF_RAX(%rsp)
 
-#define INTRENTRY \
-	subq	$32,%rsp		; \
-	testq	$SEL_RPL,56(%rsp)	; \
-	je	98f			; \
+/* For real interrupt code paths, where we can come from userspace */
+#define INTRENTRY_LABEL(label)	X##label##_untramp
+#define INTRENTRY(label) \
+	testq	$SEL_RPL,24(%rsp)	; \
+	je	INTRENTRY_LABEL(label)	; \
 	swapgs				; \
-98:	INTR_SAVE_GPRS
+	movq	%rax,CPUVAR(SCRATCH)	; \
+	movq	CPUVAR(KERN_CR3),%rax	; \
+	testq	%rax,%rax		; \
+	jz	98f			; \
+	movq	%rax,%cr3		; \
+	jmp	98f			; \
+	.text				; \
+	.global	INTRENTRY_LABEL(label)	; \
+INTRENTRY_LABEL(label):	/* from kernel */ \
+	subq	$152,%rsp		; \
+	movq	%rcx,TF_RCX(%rsp)	; \
+	jmp	99f			; \
+98:	/* from userspace */ \
+	movq	CPUVAR(KERN_RSP),%rax	; \
+	xchgq	%rax,%rsp		; \
+	movq	%rcx,TF_RCX(%rsp)	; \
+	/* copy trapno+err to the trap frame */ \
+	movq	0(%rax),%rcx		; \
+	movq	%rcx,TF_TRAPNO(%rsp)	; \
+	movq	8(%rax),%rcx		; \
+	movq	%rcx,TF_ERR(%rsp)	; \
+	addq	$16,%rax		; \
+	/* copy iretq frame to the trap frame */ \
+	movq	IRETQ_RIP(%rax),%rcx	; \
+	movq	%rcx,TF_RIP(%rsp)	; \
+	movq	IRETQ_CS(%rax),%rcx	; \
+	movq	%rcx,TF_CS(%rsp)	; \
+	movq	IRETQ_RFLAGS(%rax),%rcx	; \
+	movq	%rcx,TF_RFLAGS(%rsp)	; \
+	movq	IRETQ_RSP(%rax),%rcx	; \
+	movq	%rcx,TF_RSP(%rsp)	; \
+	movq	IRETQ_SS(%rax),%rcx	; \
+	movq	%rcx,TF_SS(%rsp)	; \
+	movq	CPUVAR(SCRATCH),%rax	; \
+99:	INTR_SAVE_MOST_GPRS_NO_ADJ
+
+/* For faking up an interrupt frame when we're already in the kernel */
+#define INTR_REENTRY \
+	subq	$32,%rsp		; \
+	INTR_SAVE_GPRS
 
 #define INTRFASTEXIT \
 	jmp	intr_fast_exit
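In C terms, the "from userspace" branch of the new INTRENTRY above does the following once the %cr3 and stack-pointer switches have happened. This is only an illustration of the copy performed by the movq sequence, with a hypothetical helper name and a trimmed-down trap frame type; the real structures live in <machine/frame.h> and the real work is the inline assembly in the macro.

#include <stdint.h>

/* illustrative stand-ins; struct iretq_frame mirrors the one added above */
struct iretq_frame {
	int64_t	iretq_rip;
	int64_t	iretq_cs;
	int64_t	iretq_rflags;
	int64_t	iretq_rsp;
	int64_t	iretq_ss;
};

struct trapframe_min {			/* just the fields INTRENTRY fills here */
	int64_t	tf_trapno, tf_err;
	int64_t	tf_rip, tf_cs, tf_rflags, tf_rsp, tf_ss;
};

/*
 * 'tramp' points into the trampoline stack: trapno, err, then the
 * hardware-pushed iretq frame.  'tf' is the trap frame being built on
 * the thread's real kernel stack (CPUVAR(KERN_RSP)).
 */
void
copy_tramp_to_trapframe(const int64_t *tramp, struct trapframe_min *tf)
{
	const struct iretq_frame *iretq = (const struct iretq_frame *)(tramp + 2);

	tf->tf_trapno = tramp[0];	/* movq 0(%rax) -> TF_TRAPNO */
	tf->tf_err = tramp[1];		/* movq 8(%rax) -> TF_ERR */

	tf->tf_rip = iretq->iretq_rip;
	tf->tf_cs = iretq->iretq_cs;
	tf->tf_rflags = iretq->iretq_rflags;
	tf->tf_rsp = iretq->iretq_rsp;
	tf->tf_ss = iretq->iretq_ss;
}

The assembly does the same thing one word at a time through %rcx because, at that point, %rcx has already been saved to TF_RCX and %rax stashed in CPUVAR(SCRATCH), while every other general-purpose register still holds unsaved userspace state.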
@@ -50,24 +92,6 @@
 				pushq	%r11			; \
 				pushq	%r13			;
 
-/*
- * Restore FS.base if it's not already in the CPU, and do the cli/swapgs.
- * Uses %rax, %rcx, and %rdx
- */
-#define INTR_RESTORE_SELECTORS \
-	btsl	$CPUF_USERSEGS_BIT, CPUVAR(FLAGS)	; \
-	jc	99f					; \
-	movq	CPUVAR(CURPCB),%rdx	/* for below */	; \
-	movq	PCB_FSBASE(%rdx),%rax			; \
-	cmpq	$0,%rax					; \
-	je	99f	/* setting %fs has zeroed FS.base */ ; \
-	movq	%rax,%rdx				; \
-	shrq	$32,%rdx				; \
-	movl	$MSR_FSBASE,%ecx			; \
-	wrmsr						; \
-99:	cli ; \
-	swapgs
-
 #define INTR_FAKE_TRAP	0xbadabada
 
 #define CHECK_ASTPENDING(reg)	movq	CPUVAR(CURPROC),reg		; \
diff --git a/sys/arch/amd64/include/gdt.h b/sys/arch/amd64/include/gdt.h
index 65a116e8bc1..bfdc521d6c2 100644
--- a/sys/arch/amd64/include/gdt.h
+++ b/sys/arch/amd64/include/gdt.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: gdt.h,v 1.5 2010/11/13 04:16:42 guenther Exp $	*/
+/*	$OpenBSD: gdt.h,v 1.6 2018/02/21 19:24:15 guenther Exp $	*/
 /*	$NetBSD: gdt.h,v 1.1 2003/04/26 18:39:40 fvdl Exp $	*/
 
 /*-
@@ -31,4 +31,3 @@
  */
 
 void gdt_init_cpu(struct cpu_info *);
-void gdt_alloc_cpu(struct cpu_info *);
diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h
index ef776eb959f..c316521f6f3 100644
--- a/sys/arch/amd64/include/pmap.h
+++ b/sys/arch/amd64/include/pmap.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: pmap.h,v 1.63 2018/01/07 21:43:25 mlarkin Exp $	*/
+/*	$OpenBSD: pmap.h,v 1.64 2018/02/21 19:24:15 guenther Exp $	*/
 /*	$NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $	*/
 
 /*
@@ -280,8 +280,19 @@ struct pmap {
 	struct mutex pm_mtx;
 	struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
 	LIST_ENTRY(pmap) pm_list;	/* list (lck by pm_list lock) */
-	pd_entry_t *pm_pdir;		/* VA of PD (lck by object lock) */
-	paddr_t pm_pdirpa;		/* PA of PD (read-only after create) */
+	/*
+	 * pm_pdir         : VA of page table to be used when executing in
+	 *                   privileged mode
+	 * pm_pdirpa       : PA of page table to be used when executing in
+	 *                   privileged mode
+	 * pm_pdir_intel   : VA of special page table to be used when executing
+	 *                   on an Intel CPU in usermode (no kernel mappings)
+	 * pm_pdirpa_intel : PA of special page table to be used when executing
+	 *                   on an Intel CPU in usermode (no kernel mappings)
+	 */
+	pd_entry_t *pm_pdir, *pm_pdir_intel;
+	paddr_t pm_pdirpa, pm_pdirpa_intel;
+
 	struct vm_page *pm_ptphint[PTP_LEVELS-1];
 					/* pointer to a PTP in our pmap */
 	struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
@@ -375,6 +386,7 @@ paddr_t	pmap_prealloc_lowmem_ptps(paddr_t);
 void	pagezero(vaddr_t);
 
 int	pmap_convert(struct pmap *, int);
+void	pmap_enter_special(vaddr_t, paddr_t, vm_prot_t);
 
 /*
  * functions for flushing the cache for vaddrs and pages.
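The pm_pdirpa / pm_pdirpa_intel pair added to struct pmap above is what ultimately ends up in ci_kern_cr3 and ci_user_cr3: the U+K table for kernel mode and the stripped-down U-K table for user mode. As a sketch of the policy the trampolines follow (hypothetical helper names, for illustration only; the real logic is in the locore/vector assembly and pmap.c), a value of zero means the CPU needs no separate table, so the reload is skipped, which is also why INTRENTRY above tests CPUVAR(KERN_CR3) against zero before touching %cr3.

#include <machine/cpu.h>	/* struct cpu_info: ci_user_cr3, ci_kern_cr3 */
#include <machine/cpufunc.h>	/* lcr3() */

static inline void
switch_to_user_cr3(struct cpu_info *ci)
{
	if (ci->ci_user_cr3 != 0)
		lcr3(ci->ci_user_cr3);	/* U-K table: kernel mostly unmapped */
}

static inline void
switch_to_kern_cr3(struct cpu_info *ci)
{
	if (ci->ci_kern_cr3 != 0)
		lcr3(ci->ci_kern_cr3);	/* U+K table: full kernel mappings */
}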
diff --git a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h
index ae81a593f9a..b7aa6e7a4d6 100644
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: specialreg.h,v 1.67 2018/02/10 09:46:58 jsg Exp $	*/
+/*	$OpenBSD: specialreg.h,v 1.68 2018/02/21 19:24:15 guenther Exp $	*/
 /*	$NetBSD: specialreg.h,v 1.1 2003/04/26 18:39:48 fvdl Exp $	*/
 /*	$NetBSD: x86/specialreg.h,v 1.2 2003/04/25 21:54:30 fvdl Exp $	*/
 
@@ -219,6 +219,7 @@
 #define SEFF0EDX_AVX512_4FMAPS	0x00000008 /* AVX-512 mult accum single prec */
 #define SEFF0EDX_IBRS		0x04000000 /* IBRS / IBPB Speculation Control */
 #define SEFF0EDX_STIBP		0x08000000 /* STIBP Speculation Control */
+#define SEFF0EDX_ARCH_CAP	0x20000000 /* Has IA32_ARCH_CAPABILITIES MSR */
 /*
  * Thermal and Power Management (CPUID function 0x6) EAX bits
  */
@@ -351,6 +352,8 @@
 #define MTRRcap_FIXED		0x100	/* bit 8 - fixed MTRRs supported */
 #define MTRRcap_WC		0x400	/* bit 10 - WC type supported */
 #define MTRRcap_SMRR		0x800	/* bit 11 - SMM range reg supported */
+#define MSR_ARCH_CAPABILITIES	0x10a
+#define ARCH_CAPABILITIES_RDCL_NO	(1ULL << 0)	/* Meltdown safe */
 #define MSR_BBL_CR_ADDR		0x116	/* PII+ only */
 #define MSR_BBL_CR_DECC		0x118	/* PII+ only */
 #define MSR_BBL_CR_CTL		0x119	/* PII+ only */
-- 
2.20.1
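The two specialreg.h additions are what allow the separate page tables to be skipped on hardware that reports itself as not vulnerable. A sketch of how they can be used (not code from the patch; it assumes the CPUID leaf 7 EDX word was saved earlier by the caller and uses rdmsr() from <machine/cpufunc.h>):

#include <sys/types.h>
#include <machine/cpufunc.h>	/* rdmsr() */
#include <machine/specialreg.h>

/*
 * Return nonzero if the CPU advertises RDCL_NO, i.e. it claims not to
 * be vulnerable to Meltdown.  'seff_edx' is the EDX output of CPUID
 * leaf 7, subleaf 0.
 */
int
cpu_meltdown_safe(uint32_t seff_edx)
{
	uint64_t msr;

	if ((seff_edx & SEFF0EDX_ARCH_CAP) == 0)
		return 0;	/* no IA32_ARCH_CAPABILITIES MSR to ask */

	msr = rdmsr(MSR_ARCH_CAPABILITIES);
	return (msr & ARCH_CAPABILITIES_RDCL_NO) != 0;
}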