From c0f44e363efa61ae9c08e033ec7078e1551dc4e0 Mon Sep 17 00:00:00 2001
From: dv
Date: Mon, 28 Mar 2022 00:22:20 +0000
Subject: [PATCH] vmm(4): add quiesce/wakeup hooks to sync vcpu state.

If a host suspends or hibernates, a task in the middle of using vcpu
state may be rescheduled to another cpu. This is primarily a problem
for Intel hosts, as vcpu state is kept local to the physical cpu and
must be flushed back to physical memory before another cpu can issue
certain vmx instructions.

This change ensures no tasks are actively using the vmm device,
flushes all vcpu state (if Intel hardware), and turns off
virtualization mode on the host cpus. Upon wakeup, we reverse the
process.

Reported on bugs@ by mpi@. OK mlarkin@
---
 sys/arch/amd64/amd64/vmm.c | 167 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 165 insertions(+), 2 deletions(-)

diff --git a/sys/arch/amd64/amd64/vmm.c b/sys/arch/amd64/amd64/vmm.c
index a195b5d247b..c6477d4cbf2 100644
--- a/sys/arch/amd64/amd64/vmm.c
+++ b/sys/arch/amd64/amd64/vmm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm.c,v 1.303 2022/01/29 19:23:02 guenther Exp $ */
+/* $OpenBSD: vmm.c,v 1.304 2022/03/28 00:22:20 dv Exp $ */
 /*
  * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <sys/refcnt.h>
 #include
 #include
 #include
@@ -88,6 +89,12 @@ SLIST_HEAD(vmlist_head, vm);
 struct vmm_softc {
 	struct device		sc_dev;
 
+	/* Suspend/Resume Synchronization */
+	struct refcnt		sc_refcnt;
+	volatile unsigned int	sc_status;
+#define VMM_SUSPENDED		(unsigned int) 0
+#define VMM_ACTIVE		(unsigned int) 1
+
 	/* Capabilities */
 	uint32_t		nr_vmx_cpus;
 	uint32_t		nr_svm_cpus;
@@ -115,9 +122,11 @@ void	vmx_dump_vmcs_field(uint16_t, const char *);
 int	vmm_enabled(void);
 int	vmm_probe(struct device *, void *, void *);
 void	vmm_attach(struct device *, struct device *, void *);
+int	vmm_activate(struct device *, int);
 int	vmmopen(dev_t, int, int, struct proc *);
 int	vmmioctl(dev_t, u_long, caddr_t, int, struct proc *);
 int	vmmclose(dev_t, int, int, struct proc *);
+int	vmm_quiesce_vmx(void);
 int	vmm_start(void);
 int	vmm_stop(void);
 size_t	vm_create_check_mem_ranges(struct vm_create_params *);
@@ -264,7 +273,7 @@ struct cfdriver vmm_cd = {
 };
 
 const struct cfattach vmm_ca = {
-	sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, NULL
+	sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, vmm_activate
 };
 
 /*
@@ -367,6 +376,11 @@ vmm_attach(struct device *parent, struct device *self, void *aux)
 	struct cpu_info *ci;
 	CPU_INFO_ITERATOR cii;
 
+	sc->sc_status = VMM_ACTIVE;
+
+	refcnt_init(&sc->sc_refcnt);
+	refcnt_rele(&sc->sc_refcnt);
+
 	sc->nr_vmx_cpus = 0;
 	sc->nr_svm_cpus = 0;
 	sc->nr_rvi_cpus = 0;
@@ -440,6 +454,137 @@ vmm_attach(struct device *parent, struct device *self, void *aux)
 	vmm_softc = sc;
 }
 
+/*
+ * vmm_quiesce_vmx
+ *
+ * Prepare the host for suspend by flushing all VMCS states.
+ */
+int
+vmm_quiesce_vmx(void)
+{
+	struct vm *vm;
+	struct vcpu *vcpu;
+	int err;
+
+	/*
+	 * We should only be called from a quiescing device state, so we
+	 * don't expect to sleep here. If we can't get all our locks,
+	 * something is wrong.
+	 */
+	if ((err = rw_enter(&vmm_softc->vm_lock, RW_WRITE | RW_NOSLEEP)))
+		return (err);
+
+	/* Iterate over each vm... */
+	SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+		if ((err = rw_enter(&vm->vm_vcpu_lock, RW_READ | RW_NOSLEEP)))
+			break;
+
+		/* Iterate over each vcpu... */
+		SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+			err = rw_enter(&vcpu->vc_lock, RW_WRITE | RW_NOSLEEP);
+			if (err)
+				break;
+
+			/* We can skip unlaunched VMCS. Nothing to flush. */
+			if (atomic_load_int(&vcpu->vc_vmx_vmcs_state)
+			    != VMCS_LAUNCHED) {
+				DPRINTF("%s: skipping vcpu %d for vm %d\n",
+				    __func__, vcpu->vc_id, vm->vm_id);
+				rw_exit_write(&vcpu->vc_lock);
+				continue;
+			}
+
+			if (vcpu->vc_last_pcpu != curcpu()) {
+				/* Remote cpu vmclear via ipi. */
+				err = vmx_remote_vmclear(vcpu->vc_last_pcpu,
+				    vcpu);
+				if (err)
+					printf("%s: failed to remote vmclear "
+					    "vcpu %d of vm %d\n", __func__,
+					    vcpu->vc_id, vm->vm_id);
+			} else {
+				/* Local cpu vmclear instruction. */
+				if ((err = vmclear(&vcpu->vc_control_pa)))
+					printf("%s: failed to locally vmclear "
+					    "vcpu %d of vm %d\n", __func__,
+					    vcpu->vc_id, vm->vm_id);
+				atomic_swap_uint(&vcpu->vc_vmx_vmcs_state,
+				    VMCS_CLEARED);
+			}
+
+			rw_exit_write(&vcpu->vc_lock);
+			if (err)
+				break;
+			DPRINTF("%s: cleared vcpu %d for vm %d\n", __func__,
+			    vcpu->vc_id, vm->vm_id);
+		}
+		rw_exit_read(&vm->vm_vcpu_lock);
+		if (err)
+			break;
+	}
+	rw_exit_write(&vmm_softc->vm_lock);
+
+	if (err)
+		return (err);
+	return (0);
+}
+
+/*
+ * vmm_activate
+ */
+int
+vmm_activate(struct device *self, int act)
+{
+	struct cpu_info *ci = curcpu();
+	unsigned int old_state;
+
+	switch (act) {
+	case DVACT_QUIESCE:
+		/* Block device users as we're suspending operation. */
+		old_state = atomic_cas_uint(&vmm_softc->sc_status, VMM_ACTIVE,
+		    VMM_SUSPENDED);
+		if (old_state != VMM_ACTIVE)
+			DPRINTF("%s: invalid device state on quiesce (%d)\n",
+			    __func__, old_state);
+
+		/* Wait for any device users to finish. */
+		while (refcnt_read(&vmm_softc->sc_refcnt) > 0)
+			tsleep_nsec(&vmm_softc, PPAUSE, "vmm", MSEC_TO_NSEC(1));
+
+		/* If we're not in vmm mode, nothing to do. */
+		if ((ci->ci_flags & CPUF_VMM) == 0)
+			break;
+
+		/* Intel systems need extra steps to sync vcpu state. */
+		if (vmm_softc->mode == VMM_MODE_EPT ||
+		    vmm_softc->mode == VMM_MODE_VMX)
+			if (vmm_quiesce_vmx())
+				DPRINTF("%s: vmx quiesce failed\n", __func__);
+
+		/* Stop virtualization mode on all cpus. */
+		vmm_stop();
+		break;
+
+	case DVACT_WAKEUP:
+		/* Restart virtualization mode on all cpus. */
+		if (vmm_softc->vm_ct > 0)
+			vmm_start();
+
+		/* Set the device back to active. */
+		old_state = atomic_cas_uint(&vmm_softc->sc_status,
+		    VMM_SUSPENDED, VMM_ACTIVE);
+		if (old_state != VMM_SUSPENDED)
+			DPRINTF("%s: invalid device state on wakeup (%d)\n",
+			    __func__, old_state);
+
+		/* Notify any waiting device users. */
+		wakeup(&vmm_softc);
+		break;
+	}
+
+	return (0);
+}
+
 /*
  * vmmopen
  *
@@ -480,6 +625,22 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
 
 	KERNEL_UNLOCK();
 
+	refcnt_take(&vmm_softc->sc_refcnt);
+	while (atomic_load_int(&vmm_softc->sc_status) != VMM_ACTIVE) {
+		refcnt_rele(&vmm_softc->sc_refcnt);
+		/* Wait for the signal that we're running again. */
+		ret = tsleep_nsec(&vmm_softc, PWAIT | PCATCH, "vmm",
+		    MSEC_TO_NSEC(1));
+		if (ret != ERESTART && ret != EINTR && ret != EWOULDBLOCK
+		    && ret != 0) {
+			printf("%s: unhandled wakeup (%d) for device\n",
+			    __func__, ret);
+			ret = EBUSY;
+			goto out;
+		}
+		refcnt_take(&vmm_softc->sc_refcnt);
+	}
+
 	switch (cmd) {
 	case VMM_IOC_CREATE:
 		if ((ret = vmm_start()) != 0) {
@@ -524,6 +685,8 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
 		ret = ENOTTY;
 	}
 
+	refcnt_rele(&vmm_softc->sc_refcnt);
+out:
 	KERNEL_LOCK();
 
 	return (ret);
-- 
2.20.1
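
For readers tracing the synchronization logic above: the patch builds a
suspend gate out of two pieces, a status word (sc_status) that turns new
vmmioctl() callers away once quiesce begins, and a reference count
(sc_refcnt) that lets vmm_activate() wait until in-flight ioctls drain.
Below is a minimal, self-contained userspace sketch of the same gate
pattern. All names are hypothetical, and pthreads primitives stand in for
the kernel's atomic_cas_uint()/refcnt API and the tsleep_nsec()/wakeup()
pair; it illustrates the idea and is not the kernel code.

/*
 * Hypothetical userspace sketch of the vmm suspend gate (not kernel code).
 * A mutex/condvar pair stands in for tsleep_nsec()/wakeup() on &vmm_softc.
 */
#include <pthread.h>

static pthread_mutex_t	gate_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	gate_cv = PTHREAD_COND_INITIALIZER;
static unsigned int	gate_refcnt;			/* like sc_refcnt */
static enum { SUSPENDED, ACTIVE } gate_status = ACTIVE;	/* like sc_status */

/* Analogue of the gate added at the top of vmmioctl(). */
void
device_enter(void)
{
	pthread_mutex_lock(&gate_mtx);
	while (gate_status != ACTIVE)
		pthread_cond_wait(&gate_cv, &gate_mtx);	/* "tsleep" */
	gate_refcnt++;					/* refcnt_take() */
	pthread_mutex_unlock(&gate_mtx);
}

/* Analogue of the refcnt_rele() before the "out:" label in vmmioctl(). */
void
device_exit(void)
{
	pthread_mutex_lock(&gate_mtx);
	gate_refcnt--;				/* refcnt_rele() */
	pthread_cond_broadcast(&gate_cv);	/* let the quiescer re-check */
	pthread_mutex_unlock(&gate_mtx);
}

/* Analogue of DVACT_QUIESCE: turn away new users, drain existing ones. */
void
device_quiesce(void)
{
	pthread_mutex_lock(&gate_mtx);
	gate_status = SUSPENDED;	/* atomic_cas_uint() in the patch */
	while (gate_refcnt > 0)		/* refcnt_read() > 0 in the patch */
		pthread_cond_wait(&gate_cv, &gate_mtx);
	pthread_mutex_unlock(&gate_mtx);
	/* ...now safe to flush vcpu state and call vmm_stop()... */
}

/* Analogue of DVACT_WAKEUP: reactivate, then wake any blocked users. */
void
device_wakeup(void)
{
	pthread_mutex_lock(&gate_mtx);
	gate_status = ACTIVE;
	pthread_cond_broadcast(&gate_cv);	/* wakeup(&vmm_softc) */
	pthread_mutex_unlock(&gate_mtx);
}

One deliberate difference is worth noting: the kernel side has no condition
variable here, so both vmmioctl() and the quiesce path poll with a 1ms
tsleep_nsec() timeout. A timeout returns EWOULDBLOCK, which is why the
ioctl loop accepts EWOULDBLOCK (along with 0, EINTR, and ERESTART) as an
ordinary reason to re-check sc_status rather than as an error.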