From 30e2643ae7ed7dfd637efff87fd5d6cbb8ead7f2 Mon Sep 17 00:00:00 2001 From: dv Date: Thu, 26 Sep 2024 13:18:25 +0000 Subject: [PATCH] Add an ipi for executing INVEPT to flush EPT on remote cpus. Similar to how the fast ipi for tlb flush is implemented, this adds one for calling INVEPT to invalidate EPT caches on the cpu. This is the first step to allowing guest memory to not be wired by UVM and decreases the behavioral differences between Intel and AMD's nested paging in vmm(4) and pmap(9). This change does not hook EPT ptes into the PV list, so the ipi is only used during address space teardown and pte removal. (With the removal of the "mprotect" ioctl, vmm(4) no longer modifies EPT ptes other than inserting them and removing them.) ok mlarkin@ --- sys/arch/amd64/amd64/cpu.c | 15 +++++- sys/arch/amd64/amd64/lapic.c | 9 +++- sys/arch/amd64/amd64/pmap.c | 85 ++++++++++++++++++++++++------ sys/arch/amd64/amd64/vector.S | 24 ++++++++- sys/arch/amd64/amd64/vmm_machdep.c | 14 ++--- sys/arch/amd64/include/cpu.h | 11 ++-- sys/arch/amd64/include/i82489var.h | 8 ++- sys/arch/amd64/include/pmap.h | 3 +- sys/arch/amd64/include/vmmvar.h | 3 +- 9 files changed, 136 insertions(+), 36 deletions(-) diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c index 26f96841188..60ce18df6f5 100644 --- a/sys/arch/amd64/amd64/cpu.c +++ b/sys/arch/amd64/amd64/cpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.c,v 1.192 2024/08/08 07:02:38 kettenis Exp $ */ +/* $OpenBSD: cpu.c,v 1.193 2024/09/26 13:18:25 dv Exp $ */ /* $NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $ */ /*- @@ -889,8 +889,10 @@ cpu_init(struct cpu_info *ci) void cpu_init_vmm(struct cpu_info *ci) { + uint64_t msr; + /* - * Allocate a per-cpu VMXON region for VMX CPUs + * Detect VMX specific features and initialize VMX-related state. */ if (ci->ci_vmm_flags & CI_VMM_VMX) { ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE, @@ -898,8 +900,17 @@ cpu_init_vmm(struct cpu_info *ci) if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region, &ci->ci_vmxon_region_pa)) panic("Can't locate VMXON region in phys mem"); + ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR; rw_init(&ci->ci_vmcs_lock, "vmcslock"); + + msr = rdmsr(IA32_VMX_EPT_VPID_CAP); + if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT) + ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode = + IA32_VMX_INVEPT_SINGLE_CTX; + else + ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode = + IA32_VMX_INVEPT_GLOBAL_CTX; } } #endif /* NVMM > 0 */ diff --git a/sys/arch/amd64/amd64/lapic.c b/sys/arch/amd64/amd64/lapic.c index 82c4eca4689..14c62bfd7c3 100644 --- a/sys/arch/amd64/amd64/lapic.c +++ b/sys/arch/amd64/amd64/lapic.c @@ -1,4 +1,4 @@ -/* $OpenBSD: lapic.c,v 1.72 2024/04/03 02:01:21 guenther Exp $ */ +/* $OpenBSD: lapic.c,v 1.73 2024/09/26 13:18:25 dv Exp $ */ /* $NetBSD: lapic.c,v 1.2 2003/05/08 01:04:35 fvdl Exp $ */ /*- @@ -56,6 +56,7 @@ #include "ioapic.h" #include "xen.h" #include "hyperv.h" +#include "vmm.h" #if NIOAPIC > 0 #include @@ -368,7 +369,11 @@ lapic_boot_init(paddr_t lapic_base) idt_vec_set(LAPIC_IPI_INVLPG, Xipi_invlpg_pcid); idt_vec_set(LAPIC_IPI_INVLRANGE, Xipi_invlrange_pcid); } -#endif +#if NVMM > 0 + idt_allocmap[LAPIC_IPI_INVEPT] = 1; + idt_vec_set(LAPIC_IPI_INVEPT, Xipi_invept); +#endif /* NVMM > 0 */ +#endif /* MULTIPROCESSOR */ idt_allocmap[LAPIC_SPURIOUS_VECTOR] = 1; idt_vec_set(LAPIC_SPURIOUS_VECTOR, Xintrspurious); diff --git a/sys/arch/amd64/amd64/pmap.c b/sys/arch/amd64/amd64/pmap.c index cf699b04f4f..47e9e5c045b 100644 --- a/sys/arch/amd64/amd64/pmap.c +++ b/sys/arch/amd64/amd64/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.174 2024/09/20 02:00:46 jsg Exp $ */ +/* $OpenBSD: pmap.c,v 1.175 2024/09/26 13:18:25 dv Exp $ */ /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */ /* @@ -338,6 +338,7 @@ void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t); void pmap_do_remove_ept(struct pmap *, vaddr_t); int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t); +void pmap_shootept(struct pmap *, int); #endif /* NVMM > 0 */ int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, vaddr_t, int, struct pv_entry **); @@ -387,7 +388,11 @@ pmap_is_curpmap(struct pmap *pmap) static inline int pmap_is_active(struct pmap *pmap, struct cpu_info *ci) { - return pmap == pmap_kernel() || pmap == ci->ci_proc_pmap; + return (pmap == pmap_kernel() || pmap == ci->ci_proc_pmap +#if NVMM > 0 + || (pmap_is_ept(pmap) && pmap == ci->ci_ept_pmap) +#endif /* NVMM > 0 */ + ); } #endif @@ -416,7 +421,7 @@ pmap_map_ptes(struct pmap *pmap) { paddr_t cr3; - KASSERT(pmap->pm_type != PMAP_TYPE_EPT); + KASSERT(!pmap_is_ept(pmap)); /* the kernel's pmap is always accessible */ if (pmap == pmap_kernel()) @@ -1786,7 +1791,7 @@ void pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { #if NVMM > 0 - if (pmap->pm_type == PMAP_TYPE_EPT) + if (pmap_is_ept(pmap)) pmap_remove_ept(pmap, sva, eva); else #endif /* NVMM > 0 */ @@ -2437,7 +2442,7 @@ pmap_convert(struct pmap *pmap, int mode) mtx_enter(&pmap->pm_mtx); pmap->pm_type = mode; - if (mode == PMAP_TYPE_EPT) { + if (pmap_is_ept(pmap)) { /* Clear PML4 */ pte = (pt_entry_t *)pmap->pm_pdir; memset(pte, 0, PAGE_SIZE); @@ -2455,7 +2460,6 @@ void pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa) { vaddr_t v; - struct vmx_invept_descriptor vid; mtx_enter(&pmap->pm_mtx); @@ -2464,15 +2468,11 @@ pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa) for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE) pmap_do_remove_ept(pmap, v); - if (pmap->eptp != 0) { - memset(&vid, 0, sizeof(vid)); - vid.vid_eptp = pmap->eptp; - DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__, - vid.vid_eptp); - invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid); - } + pmap_shootept(pmap, 1); mtx_leave(&pmap->pm_mtx); + + pmap_tlb_shootwait(); } void @@ -2757,7 +2757,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags) paddr_t scr3; #if NVMM > 0 - if (pmap->pm_type == PMAP_TYPE_EPT) + if (pmap_is_ept(pmap)) return pmap_enter_ept(pmap, va, pa, prot); #endif /* NVMM > 0 */ @@ -3215,6 +3215,12 @@ volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata"))); volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata"))); volatile int tlb_shoot_first_pcid __attribute__((section(".kudata"))); +#if NVMM > 0 +#include +volatile uint64_t ept_shoot_mode __attribute__((section(".kudata"))); +volatile struct vmx_invept_descriptor ept_shoot_vid + __attribute__((section(".kudata"))); +#endif /* NVMM > 0 */ /* Obtain the "lock" for TLB shooting */ static inline int @@ -3363,7 +3369,6 @@ pmap_tlb_shoottlb(struct pmap *pm, int shootself) if (wait) { int s = pmap_start_tlb_shoot(wait, __func__); - CPU_INFO_FOREACH(cii, ci) { if ((mask & (1ULL << ci->ci_cpuid)) == 0) continue; @@ -3384,6 +3389,56 @@ pmap_tlb_shoottlb(struct pmap *pm, int shootself) } } +#if NVMM > 0 +/* + * pmap_shootept: similar to pmap_tlb_shoottlb, but for remotely invalidating + * EPT using invept. + */ +void +pmap_shootept(struct pmap *pm, int shootself) +{ + struct cpu_info *ci, *self = curcpu(); + struct vmx_invept_descriptor vid; + CPU_INFO_ITERATOR cii; + long wait = 0; + u_int64_t mask = 0; + + KASSERT(pmap_is_ept(pm)); + + CPU_INFO_FOREACH(cii, ci) { + if (ci == self || !pmap_is_active(pm, ci) || + !(ci->ci_flags & CPUF_RUNNING) || + !(ci->ci_flags & CPUF_VMM)) + continue; + mask |= (1ULL << ci->ci_cpuid); + wait++; + } + + if (wait) { + int s = pmap_start_tlb_shoot(wait, __func__); + + ept_shoot_mode = self->ci_vmm_cap.vcc_vmx.vmx_invept_mode; + ept_shoot_vid.vid_eptp = pm->eptp; + ept_shoot_vid.vid_reserved = 0; + + CPU_INFO_FOREACH(cii, ci) { + if ((mask & (1ULL << ci->ci_cpuid)) == 0) + continue; + if (x86_fast_ipi(ci, LAPIC_IPI_INVEPT) != 0) + panic("%s: ipi failed", __func__); + } + + splx(s); + } + + if (shootself && (self->ci_flags & CPUF_VMM)) { + vid.vid_eptp = pm->eptp; + vid.vid_reserved = 0; + invept(self->ci_vmm_cap.vcc_vmx.vmx_invept_mode, &vid); + } +} +#endif /* NVMM > 0 */ + void pmap_tlb_shootwait(void) { diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S index c51f872f061..3befb31efab 100644 --- a/sys/arch/amd64/amd64/vector.S +++ b/sys/arch/amd64/amd64/vector.S @@ -1,4 +1,4 @@ -/* $OpenBSD: vector.S,v 1.96 2024/07/21 16:19:25 deraadt Exp $ */ +/* $OpenBSD: vector.S,v 1.97 2024/09/26 13:18:25 dv Exp $ */ /* $NetBSD: vector.S,v 1.5 2004/06/28 09:13:11 fvdl Exp $ */ /* @@ -83,6 +83,7 @@ #include "assym.h" #include "xen.h" #include "hyperv.h" +#include "vmm.h" /*****************************************************************************/ @@ -601,6 +602,27 @@ IDTVEC(ipi_invltlb) iretq END(Xipi_invltlb) +#if NVMM > 0 +/* Invalidate VMX EPT */ +IDTVEC(ipi_invept) + pushq %rax + pushq %rdx + + ioapic_asm_ack() + + movq $ept_shoot_vid, %rax + movq ept_shoot_mode, %rdx + invept (%rax), %rdx + + lock + decq tlb_shoot_wait + + popq %rdx + popq %rax + iretq +END(Xipi_invept) +#endif /* NVMM > 0 */ + /* invalidate a single page, no PCIDs version */ IDTVEC(ipi_invlpg) pushq %rax diff --git a/sys/arch/amd64/amd64/vmm_machdep.c b/sys/arch/amd64/amd64/vmm_machdep.c index c9a0531a67b..7c898fb559f 100644 --- a/sys/arch/amd64/amd64/vmm_machdep.c +++ b/sys/arch/amd64/amd64/vmm_machdep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm_machdep.c,v 1.37 2024/09/21 04:36:28 mlarkin Exp $ */ +/* $OpenBSD: vmm_machdep.c,v 1.38 2024/09/26 13:18:25 dv Exp $ */ /* * Copyright (c) 2014 Mike Larkin * @@ -2673,11 +2673,6 @@ vcpu_init_vmx(struct vcpu *vcpu) ret = EINVAL; goto exit; } - if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT) - vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_SINGLE_CTX; - else - vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_GLOBAL_CTX; - if (msr & IA32_EPT_VPID_CAP_WB) { /* WB cache type supported */ eptp |= IA32_EPT_PAGING_CACHE_TYPE_WB; @@ -3736,10 +3731,15 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *vrp) ci = curcpu(); vcpu->vc_last_pcpu = ci; + /* We're now using this vcpu's EPT pmap on this cpu. */ + atomic_swap_ptr(&ci->ci_ept_pmap, + vcpu->vc_parent->vm_map->pmap); + /* Invalidate EPT cache. */ vid_ept.vid_reserved = 0; vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp; - if (invept(vcpu->vc_vmx_invept_op, &vid_ept)) { + if (invept(ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode, + &vid_ept)) { printf("%s: invept\n", __func__); return (EINVAL); } diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h index ca198b24a69..34ba78776fc 100644 --- a/sys/arch/amd64/include/cpu.h +++ b/sys/arch/amd64/include/cpu.h @@ -1,4 +1,4 @@ -/* $OpenBSD: cpu.h,v 1.176 2024/08/27 09:16:03 bluhm Exp $ */ +/* $OpenBSD: cpu.h,v 1.177 2024/09/26 13:18:25 dv Exp $ */ /* $NetBSD: cpu.h,v 1.1 2003/04/26 18:39:39 fvdl Exp $ */ /*- @@ -75,6 +75,7 @@ struct vmx { uint32_t vmx_cr3_tgt_count; uint64_t vmx_vm_func; uint8_t vmx_has_l1_flush_msr; + uint64_t vmx_invept_mode; }; /* @@ -237,12 +238,12 @@ struct cpu_info { union vmm_cpu_cap ci_vmm_cap; paddr_t ci_vmxon_region_pa; struct vmxon_region *ci_vmxon_region; - struct vcpu *ci_guest_vcpu; /* [o] last vcpu resumed */ - - char ci_panicbuf[512]; - paddr_t ci_vmcs_pa; struct rwlock ci_vmcs_lock; + struct pmap *ci_ept_pmap; /* [o] last used EPT pmap */ + struct vcpu *ci_guest_vcpu; /* [o] last vcpu resumed */ + + char ci_panicbuf[512]; struct clockqueue ci_queue; }; diff --git a/sys/arch/amd64/include/i82489var.h b/sys/arch/amd64/include/i82489var.h index 61840dd38af..4f32f9f95c9 100644 --- a/sys/arch/amd64/include/i82489var.h +++ b/sys/arch/amd64/include/i82489var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: i82489var.h,v 1.19 2024/06/09 03:12:59 jsg Exp $ */ +/* $OpenBSD: i82489var.h,v 1.20 2024/09/26 13:18:25 dv Exp $ */ /* $NetBSD: i82489var.h,v 1.1 2003/02/26 21:26:10 fvdl Exp $ */ /*- @@ -33,6 +33,8 @@ #ifndef _MACHINE_I82489VAR_H_ #define _MACHINE_I82489VAR_H_ +#include "vmm.h" + /* * Software definitions belonging to Local APIC driver. */ @@ -70,6 +72,7 @@ extern void Xresume_lapic_ipi(void); #define LAPIC_IPI_INVLTLB (LAPIC_IPI_OFFSET + 0) #define LAPIC_IPI_INVLPG (LAPIC_IPI_OFFSET + 1) #define LAPIC_IPI_INVLRANGE (LAPIC_IPI_OFFSET + 2) +#define LAPIC_IPI_INVEPT (LAPIC_IPI_OFFSET + 3) extern void Xipi_invltlb(void); extern void Xipi_invltlb_pcid(void); @@ -77,6 +80,9 @@ extern void Xipi_invlpg(void); extern void Xipi_invlpg_pcid(void); extern void Xipi_invlrange(void); extern void Xipi_invlrange_pcid(void); +#if NVMM > 0 +extern void Xipi_invept(void); +#endif /* NVMM > 0 */ /* * Vector used for local apic timer interrupts. diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h index 6a8e4f92dd3..cc358eab1e9 100644 --- a/sys/arch/amd64/include/pmap.h +++ b/sys/arch/amd64/include/pmap.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.h,v 1.89 2024/07/09 19:11:06 bluhm Exp $ */ +/* $OpenBSD: pmap.h,v 1.90 2024/09/26 13:18:25 dv Exp $ */ /* $NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $ */ /* @@ -293,6 +293,7 @@ LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */ #define PMAP_TYPE_EPT 2 #define PMAP_TYPE_RVI 3 #define pmap_nested(pm) ((pm)->pm_type != PMAP_TYPE_NORMAL) +#define pmap_is_ept(pm) ((pm)->pm_type == PMAP_TYPE_EPT) struct pmap { struct mutex pm_mtx; diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h index c607c1d663a..6bc63d5ae3f 100644 --- a/sys/arch/amd64/include/vmmvar.h +++ b/sys/arch/amd64/include/vmmvar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmmvar.h,v 1.106 2024/09/21 04:36:28 mlarkin Exp $ */ +/* $OpenBSD: vmmvar.h,v 1.107 2024/09/26 13:18:25 dv Exp $ */ /* * Copyright (c) 2014 Mike Larkin * @@ -879,7 +879,6 @@ struct vcpu { uint32_t vc_vmx_vmcs_state; /* [a] */ #define VMCS_CLEARED 0 #define VMCS_LAUNCHED 1 - uint64_t vc_vmx_invept_op; /* SVM only (all requiring [v]) */ vaddr_t vc_svm_hsa_va; -- 2.20.1