From 175f768bfbe5f3b8e0ee49f9b8da968dab1667cb Mon Sep 17 00:00:00 2001 From: pd Date: Wed, 29 Aug 2018 04:51:12 +0000 Subject: [PATCH] First pass in bringing i386 in sync with amd64. This does not yet work, but is being committed now so we can work on the rest in-tree. ok mlarkin@ --- sys/arch/i386/i386/vmm.c | 1056 ++++++++++++++++++++++++++++---- sys/arch/i386/include/vmmvar.h | 44 +- 2 files changed, 995 insertions(+), 105 deletions(-) diff --git a/sys/arch/i386/i386/vmm.c b/sys/arch/i386/i386/vmm.c index 6552d7e3043..82988f37eb0 100644 --- a/sys/arch/i386/i386/vmm.c +++ b/sys/arch/i386/i386/vmm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.c,v 1.41 2018/07/12 15:48:02 mlarkin Exp $ */ +/* $OpenBSD: vmm.c,v 1.42 2018/08/29 04:51:12 pd Exp $ */ /* * Copyright (c) 2014 Mike Larkin * @@ -97,6 +97,10 @@ struct vmm_softc { struct rwlock vm_lock; size_t vm_ct; /* number of in-memory VMs */ size_t vm_idx; /* next unique VM index */ + + struct rwlock vpid_lock; + uint16_t max_vpid; + uint8_t vpids[512]; /* bitmap of used VPID/ASIDs */ }; int vmm_enabled(void); @@ -143,13 +147,19 @@ void vm_teardown(struct vm *); int vcpu_vmx_check_cap(struct vcpu *, uint32_t, uint32_t, int); int vcpu_vmx_compute_ctrl(uint64_t, uint16_t, uint32_t, uint32_t, uint32_t *); int vmx_get_exit_info(uint32_t *, uint32_t *); +int vmx_load_pdptes(struct vcpu *); int vmx_handle_exit(struct vcpu *); int vmm_handle_cpuid(struct vcpu *); int vmx_handle_rdmsr(struct vcpu *); int vmx_handle_wrmsr(struct vcpu *); +int vmx_handle_cr0_write(struct vcpu *, uint32_t); +int vmx_handle_cr4_write(struct vcpu *, uint32_t); int vmx_handle_cr(struct vcpu *); int vmx_handle_inout(struct vcpu *); int vmx_handle_hlt(struct vcpu *); +int vmm_inject_ud(struct vcpu *); +int vmm_inject_gp(struct vcpu *); +int vmm_inject_db(struct vcpu *); void vmx_handle_intr(struct vcpu *); void vmx_handle_intwin(struct vcpu *); int vmm_get_guest_memtype(struct vm *, paddr_t); @@ -159,6 +169,8 @@ int svm_get_guest_faulttype(void); int vmx_get_exit_qualification(uint32_t *); int vmx_fault_page(struct vcpu *, paddr_t); int vmx_handle_np_fault(struct vcpu *); +int vmm_alloc_vpid(uint16_t *); +void vmm_free_vpid(uint16_t); const char *vcpu_state_decode(u_int); const char *vmx_exit_reason_decode(uint32_t); const char *vmx_instruction_error_decode(uint32_t); @@ -364,6 +376,15 @@ vmm_attach(struct device *parent, struct device *self, void *aux) sc->mode = VMM_MODE_UNKNOWN; } + if (sc->mode == VMM_MODE_SVM || sc->mode == VMM_MODE_RVI) { + /* XXX SVM not supported */ + } else { + sc->max_vpid = 0xFFF; + } + + bzero(&sc->vpids, sizeof(sc->vpids)); + rw_init(&sc->vpid_lock, "vpidlock"); + pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK, "vmpool", NULL); pool_init(&vcpu_pool, sizeof(struct vcpu), 0, IPL_NONE, PR_WAITOK, @@ -621,13 +642,14 @@ vm_intr_pending(struct vm_intr_params *vip) * * Parameters: * vrwp: Describes the VM and VCPU to get/set the registers from. The - * register values are returned here as well. + * register values are returned here as well. * dir: 0 for reading, 1 for writing * * Return values: * 0: if successful * ENOENT: if the VM/VCPU defined by 'vgp' cannot be found * EINVAL: if an error occured reading the registers of the guest + * EPERM: if the vm cannot be accessed from the calling process */ int vm_rwregs(struct vm_rwregs_params *vrwp, int dir) @@ -668,8 +690,10 @@ vm_rwregs(struct vm_rwregs_params *vrwp, int dir) return (dir == 0) ? 
vcpu_readregs_svm(vcpu, vrwp->vrwp_mask, vrs) : vcpu_writeregs_svm(vcpu, vrwp->vrwp_mask, vrs); - else - panic("%s: unknown vmm mode: %d", __func__, vmm_softc->mode); + else { + DPRINTF("%s: unknown vmm mode", __func__); + return (EINVAL); + } } /* @@ -912,7 +936,7 @@ stop_vmm_on_cpu(struct cpu_info *ci) } /* - * vm_create_check_mem_ranges: + * vm_create_check_mem_ranges * * Make sure that the guest physical memory ranges given by the user process * do not overlap and are in ascending order. @@ -927,7 +951,6 @@ stop_vmm_on_cpu(struct cpu_info *ci) size_t vm_create_check_mem_ranges(struct vm_create_params *vcp) { - int disjunct_range; size_t i, memsize = 0; struct vm_mem_range *vmr, *pvmr; const paddr_t maxgpa = (uint32_t)VMM_MAX_VM_MEM_SIZE * 1024 * 1024; @@ -960,10 +983,21 @@ vm_create_check_mem_ranges(struct vm_create_params *vcp) vmr->vmr_size >= VM_MAXUSER_ADDRESS - vmr->vmr_va) return (0); - /* Specifying ranges within the PCI MMIO space is forbidden */ - disjunct_range = (vmr->vmr_gpa > VMM_PCI_MMIO_BAR_END) || - (vmr->vmr_gpa + vmr->vmr_size <= VMM_PCI_MMIO_BAR_BASE); - if (!disjunct_range) + /* + * Specifying ranges within the PCI MMIO space is forbidden. + * Disallow ranges that start inside the MMIO space: + * [VMM_PCI_MMIO_BAR_BASE .. VMM_PCI_MMIO_BAR_END] + */ + if (vmr->vmr_gpa >= VMM_PCI_MMIO_BAR_BASE && + vmr->vmr_gpa <= VMM_PCI_MMIO_BAR_END) + return (0); + + /* + * ... and disallow ranges that end inside the MMIO space: + * (VMM_PCI_MMIO_BAR_BASE .. VMM_PCI_MMIO_BAR_END] + */ + if (vmr->vmr_gpa + vmr->vmr_size > VMM_PCI_MMIO_BAR_BASE && + vmr->vmr_gpa + vmr->vmr_size <= VMM_PCI_MMIO_BAR_END) return (0); /* @@ -1225,6 +1259,14 @@ vm_impl_init_svm(struct vm *vm, struct proc *p) * vm_impl_init * * Calls the architecture-specific VM init routine + * + * Parameters: + * vm: the VM being initialized + * p: vmd process owning the VM + * + * Return values (from architecture-specific init routines): + * 0: the initialization was successful + * ENOMEM: the initialization failed (lack of resources) */ int vm_impl_init(struct vm *vm, struct proc *p) @@ -1367,7 +1409,7 @@ vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask, if (vmread(vmm_vmx_sreg_vmcs_fields[i].arid, &ar)) goto errout; if (vmread(vmm_vmx_sreg_vmcs_fields[i].baseid, - &sregs[i].vsi_base)) + &sregs[i].vsi_base)) goto errout; sregs[i].vsi_sel = sel; @@ -1397,6 +1439,14 @@ vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask, goto errout; if (vmread(VMCS_GUEST_IA32_CR4, &crs[VCPU_REGS_CR4])) goto errout; + if (vmread(VMCS_GUEST_PDPTE0, &crs[VCPU_REGS_PDPTE0])) + goto errout; + if (vmread(VMCS_GUEST_PDPTE1, &crs[VCPU_REGS_PDPTE1])) + goto errout; + if (vmread(VMCS_GUEST_PDPTE2, &crs[VCPU_REGS_PDPTE2])) + goto errout; + if (vmread(VMCS_GUEST_PDPTE3, &crs[VCPU_REGS_PDPTE3])) + goto errout; } msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; @@ -1432,7 +1482,7 @@ vcpu_readregs_svm(struct vcpu *vcpu, uint64_t regmask, /* * vcpu_writeregs_vmx * - * Writes 'vcpu's registers + * Writes VCPU registers * * Parameters: * vcpu: the vcpu that has to get its registers written to @@ -1515,6 +1565,14 @@ vcpu_writeregs_vmx(struct vcpu *vcpu, uint64_t regmask, int loadvmcs, goto errout; if (vmwrite(VMCS_GUEST_IA32_CR4, crs[VCPU_REGS_CR4])) goto errout; + if (vmwrite(VMCS_GUEST_PDPTE0, crs[VCPU_REGS_PDPTE0])) + goto errout; + if (vmwrite(VMCS_GUEST_PDPTE1, crs[VCPU_REGS_PDPTE1])) + goto errout; + if (vmwrite(VMCS_GUEST_PDPTE2, crs[VCPU_REGS_PDPTE2])) + goto errout; + if (vmwrite(VMCS_GUEST_PDPTE3, crs[VCPU_REGS_PDPTE3])) 
+ goto errout; } msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; @@ -1926,14 +1984,18 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) uint32_t pinbased, procbased, procbased2, exit, entry; uint32_t want1, want0; uint64_t msr, ctrlval, eptp, cr3; - uint16_t ctrl; + uint16_t ctrl, vpid; struct vmx_msr_store *msr_store; ret = 0; ug = 0; - if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa)) + cr0 = vrs->vrs_crs[VCPU_REGS_CR0]; + + if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa)) { + DPRINTF("%s: error reloading VMCS\n", __func__); return (EINVAL); + } /* Compute Basic Entry / Exit Controls */ vcpu->vc_vmx_basic = rdmsr(IA32_VMX_BASIC); @@ -1977,11 +2039,13 @@ } if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &pinbased)) { + DPRINTF("%s: error computing pinbased controls\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_PINBASED_CTLS, pinbased)) { + DPRINTF("%s: error setting pinbased controls\n", __func__); ret = EINVAL; goto exit; } @@ -1997,6 +2061,7 @@ * IA32_VMX_CR8_LOAD_EXITING - guest TPR access * IA32_VMX_CR8_STORE_EXITING - guest TPR access * IA32_VMX_USE_TPR_SHADOW - guest TPR access (shadow) + * IA32_VMX_MONITOR_EXITING - exit on MONITOR instruction * * If we have EPT, we must be able to clear the following * IA32_VMX_CR3_LOAD_EXITING - don't care about guest CR3 accesses @@ -2008,6 +2073,7 @@ IA32_VMX_USE_MSR_BITMAPS | IA32_VMX_CR8_LOAD_EXITING | IA32_VMX_CR8_STORE_EXITING | + IA32_VMX_MONITOR_EXITING | IA32_VMX_USE_TPR_SHADOW; want0 = 0; @@ -2026,11 +2092,13 @@ } if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased)) { + DPRINTF("%s: error computing procbased controls\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) { + DPRINTF("%s: error setting procbased controls\n", __func__); ret = EINVAL; goto exit; } @@ -2046,7 +2114,8 @@ * * If we have unrestricted guest capability, we must be able to set * the following: - * IA32_VMX_UNRESTRICTED_GUEST - enable unrestricted guest + * IA32_VMX_UNRESTRICTED_GUEST - enable unrestricted guest (if the caller + * did not specify CR0_PG | CR0_PE in %cr0 in the 'vrs' parameter) */ want1 = 0; @@ -2065,8 +2134,10 @@ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, IA32_VMX_UNRESTRICTED_GUEST, 1)) { - want1 |= IA32_VMX_UNRESTRICTED_GUEST; - ug = 1; + if ((cr0 & (CR0_PE | CR0_PG)) == 0) { + want1 |= IA32_VMX_UNRESTRICTED_GUEST; + ug = 1; + } } } @@ -2075,11 +2146,15 @@ ctrl = IA32_VMX_PROCBASED2_CTLS; if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased2)) { + DPRINTF("%s: error computing secondary procbased controls\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_PROCBASED2_CTLS, procbased2)) { + DPRINTF("%s: error setting secondary procbased controls\n", + __func__); ret = EINVAL; goto exit; } @@ -2103,11 +2178,13 @@ } if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &exit)) { + DPRINTF("%s: error computing exit controls\n", __func__); ret = EINVAL;
goto exit; } if (vmwrite(VMCS_EXIT_CTLS, exit)) { + DPRINTF("%s: error setting exit controls\n", __func__); ret = EINVAL; goto exit; } @@ -2115,6 +2192,8 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) /* * Entry ctrls * + * We must be able to set the following: + * IA32_VMX_IA32E_MODE_GUEST (if no unrestricted guest) * We must be able to clear the following: * IA32_VMX_ENTRY_TO_SMM - enter to SMM * IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT @@ -2177,14 +2256,25 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS, IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, - IA32_VMX_ENABLE_VPID, 1)) - if (vmwrite(VMCS_GUEST_VPID, - (uint16_t)vcpu->vc_parent->vm_id)) { + IA32_VMX_ENABLE_VPID, 1)) { + if (vmm_alloc_vpid(&vpid)) { + DPRINTF("%s: could not allocate VPID\n", + __func__); + ret = EINVAL; + goto exit; + } + if (vmwrite(VMCS_GUEST_VPID, vpid)) { + DPRINTF("%s: error setting guest VPID\n", + __func__); ret = EINVAL; goto exit; } + + vcpu->vc_vpid = vpid; + } } + /* * Determine which bits in CR0 have to be set to a fixed * value as per Intel SDM A.7. @@ -2203,12 +2293,9 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) * any value for CR0_PG and CR0_PE in vrs->vrs_crs[VCPU_REGS_CR0] if * the CPU has the unrestricted guest capability. */ - cr0 = vrs->vrs_crs[VCPU_REGS_CR0]; - if (ug) { want1 &= ~(CR0_PG | CR0_PE); want0 &= ~(CR0_PG | CR0_PE); - cr0 &= ~(CR0_PG | CR0_PE); } /* @@ -2227,23 +2314,60 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) goto exit; } - if (ug) - cr3 = 0; - else - cr3 = vrs->vrs_crs[VCPU_REGS_CR3]; - /* - * Determine default CR4 as per Intel SDM A.8 - * All flexible bits are set to 0 + * Determine which bits in CR4 have to be set to a fixed + * value as per Intel SDM A.8. + * CR4 bits in the vrs parameter must match these, except + * CR4_VMXE - we add that here since it must always be set. 
*/ - cr4 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) & + want1 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) & (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1); + want0 = ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) & + ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1); - if (ug == 0) - cr4 |= CR4_PSE; + cr4 = vrs->vrs_crs[VCPU_REGS_CR4] | CR4_VMXE; + + if ((cr4 & want1) != want1) { + ret = EINVAL; + goto exit; + } + + if ((~cr4 & want0) != want0) { + ret = EINVAL; + goto exit; + } + + cr3 = vrs->vrs_crs[VCPU_REGS_CR3]; + + /* Restore PDPTEs if 32-bit PAE paging is being used */ + if (cr3 && (cr4 & CR4_PAE) && + !(vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA)) { + if (vmwrite(VMCS_GUEST_PDPTE0, + vrs->vrs_crs[VCPU_REGS_PDPTE0])) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_PDPTE1, + vrs->vrs_crs[VCPU_REGS_PDPTE1])) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_PDPTE2, + vrs->vrs_crs[VCPU_REGS_PDPTE2])) { + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_PDPTE3, + vrs->vrs_crs[VCPU_REGS_PDPTE3])) { + ret = EINVAL; + goto exit; + } + } vrs->vrs_crs[VCPU_REGS_CR0] = cr0; - vrs->vrs_crs[VCPU_REGS_CR3] = cr3; vrs->vrs_crs[VCPU_REGS_CR4] = cr4; /* @@ -2252,73 +2376,137 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_load_va; msr_store[0].vms_index = MSR_EFER; msr_store[0].vms_data = rdmsr(MSR_EFER); + msr_store[1].vms_index = MSR_STAR; + msr_store[1].vms_data = rdmsr(MSR_STAR); + msr_store[2].vms_index = MSR_LSTAR; + msr_store[2].vms_data = rdmsr(MSR_LSTAR); + msr_store[3].vms_index = MSR_CSTAR; + msr_store[3].vms_data = rdmsr(MSR_CSTAR); + msr_store[4].vms_index = MSR_SFMASK; + msr_store[4].vms_data = rdmsr(MSR_SFMASK); + msr_store[5].vms_index = MSR_KERNELGSBASE; + msr_store[5].vms_data = rdmsr(MSR_KERNELGSBASE); + msr_store[6].vms_index = MSR_MISC_ENABLE; + msr_store[6].vms_data = rdmsr(MSR_MISC_ENABLE); /* * Select guest MSRs to be loaded on entry / saved on exit */ msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; - msr_store[0].vms_index = MSR_EFER; + msr_store[VCPU_REGS_EFER].vms_index = MSR_EFER; + msr_store[VCPU_REGS_STAR].vms_index = MSR_STAR; + msr_store[VCPU_REGS_LSTAR].vms_index = MSR_LSTAR; + msr_store[VCPU_REGS_CSTAR].vms_index = MSR_CSTAR; + msr_store[VCPU_REGS_SFMASK].vms_index = MSR_SFMASK; + msr_store[VCPU_REGS_KGSBASE].vms_index = MSR_KERNELGSBASE; + msr_store[VCPU_REGS_MISC_ENABLE].vms_index = MSR_MISC_ENABLE; + + /* + * Initialize MSR_MISC_ENABLE as it can't be read and populated from vmd + * and some of the content is based on the host. + */ + msr_store[VCPU_REGS_MISC_ENABLE].vms_data = rdmsr(MSR_MISC_ENABLE); + msr_store[VCPU_REGS_MISC_ENABLE].vms_data &= + ~(MISC_ENABLE_TCC | MISC_ENABLE_PERF_MON_AVAILABLE | + MISC_ENABLE_EIST_ENABLED | MISC_ENABLE_ENABLE_MONITOR_FSM | + MISC_ENABLE_xTPR_MESSAGE_DISABLE); + msr_store[VCPU_REGS_MISC_ENABLE].vms_data |= + MISC_ENABLE_BTS_UNAVAILABLE | MISC_ENABLE_PEBS_UNAVAILABLE; /* * Currently we have the same count of entry/exit MSRs loads/stores * but this is not an architectural requirement. 
*/ if (vmwrite(VMCS_EXIT_MSR_STORE_COUNT, VMX_NUM_MSR_STORE)) { + DPRINTF("%s: error setting guest MSR exit store count\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) { + DPRINTF("%s: error setting guest MSR exit load count\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) { + DPRINTF("%s: error setting guest MSR entry load count\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS, vcpu->vc_vmx_msr_exit_save_pa)) { + DPRINTF("%s: error setting guest MSR exit store address\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS_HI, 0)) { + DPRINTF("%s: error setting guest MSR exit store address HI\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS, vcpu->vc_vmx_msr_exit_load_pa)) { + DPRINTF("%s: error setting guest MSR exit load address\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS_HI, 0)) { + DPRINTF("%s: error setting guest MSR exit load address HI\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS, vcpu->vc_vmx_msr_exit_save_pa)) { + DPRINTF("%s: error setting guest MSR entry load address\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS_HI, 0)) { + DPRINTF("%s: error setting guest MSR entry load address HI\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_MSR_BITMAP_ADDRESS, vcpu->vc_msr_bitmap_pa)) { + DPRINTF("%s: error setting guest MSR bitmap address\n", + __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_MSR_BITMAP_ADDRESS_HI, 0)) { + DPRINTF("%s: error setting guest MSR bitmap address HI\n", + __func__); + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_CR4_MASK, CR4_VMXE)) { + DPRINTF("%s: error setting guest CR4 mask\n", __func__); + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_CR0_MASK, CR0_NE)) { + DPRINTF("%s: error setting guest CR0 mask\n", __func__); ret = EINVAL; goto exit; } @@ -2346,12 +2534,14 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs) vmx_setmsrbrw(vcpu, MSR_FSBASE); vmx_setmsrbrw(vcpu, MSR_GSBASE); vmx_setmsrbrw(vcpu, MSR_KERNELGSBASE); + vmx_setmsrbr(vcpu, MSR_MISC_ENABLE); /* XXX CR0 shadow */ /* XXX CR4 shadow */ /* Flush the VMCS */ if (vmclear(&vcpu->vc_control_pa)) { + DPRINTF("%s: vmclear failed\n", __func__); ret = EINVAL; goto exit; } @@ -2475,8 +2665,9 @@ vcpu_init_vmx(struct vcpu *vcpu) } /* Host CR0 */ - cr0 = rcr0(); + cr0 = rcr0() & ~CR0_TS; if (vmwrite(VMCS_HOST_IA32_CR0, cr0)) { + DPRINTF("%s: error writing host CR0\n", __func__); ret = EINVAL; goto exit; } @@ -2484,59 +2675,70 @@ vcpu_init_vmx(struct vcpu *vcpu) /* Host CR4 */ cr4 = rcr4(); if (vmwrite(VMCS_HOST_IA32_CR4, cr4)) { + DPRINTF("%s: error writing host CR4\n", __func__); ret = EINVAL; goto exit; } /* Host Segment Selectors */ if (vmwrite(VMCS_HOST_IA32_CS_SEL, GSEL(GCODE_SEL, SEL_KPL))) { + DPRINTF("%s: error writing host CS selector\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_HOST_IA32_DS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + DPRINTF("%s: error writing host DS selector\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_HOST_IA32_ES_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + DPRINTF("%s: error writing host ES selector\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_HOST_IA32_FS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + DPRINTF("%s: error writing host FS selector\n", __func__); ret = EINVAL; goto exit; } if 
(vmwrite(VMCS_HOST_IA32_GS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + DPRINTF("%s: error writing host GS selector\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_HOST_IA32_SS_SEL, GSEL(GDATA_SEL, SEL_KPL))) { + DPRINTF("%s: error writing host SS selector\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_HOST_IA32_TR_SEL, GSEL(GTSS_SEL, SEL_KPL))) { + DPRINTF("%s: error writing host TR selector\n", __func__); ret = EINVAL; goto exit; } /* Host IDTR base */ if (vmwrite(VMCS_HOST_IA32_IDTR_BASE, (uint32_t)idt)) { + DPRINTF("%s: error writing host IDTR base\n", __func__); ret = EINVAL; goto exit; } /* VMCS link */ if (vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFF)) { + DPRINTF("%s: error writing VMCS link pointer\n", __func__); ret = EINVAL; goto exit; } if (vmwrite(VMCS_LINK_POINTER_HI, 0xFFFFFFFF)) { + DPRINTF("%s: error writing VMCS link pointer HI\n", __func__); ret = EINVAL; goto exit; } @@ -2559,8 +2761,10 @@ exit: km_free((void *)vcpu->vc_vmx_msr_entry_load_va, PAGE_SIZE, &kv_page, &kp_zero); } else { - if (vmclear(&vcpu->vc_control_pa)) + if (vmclear(&vcpu->vc_control_pa)) { + DPRINTF("%s: vmclear failed\n", __func__); ret = EINVAL; + } } return (ret); @@ -2730,6 +2934,7 @@ vcpu_init(struct vcpu *vcpu) vcpu->vc_virt_mode = vmm_softc->mode; vcpu->vc_state = VCPU_STATE_STOPPED; + vcpu->vc_vpid = 0; if (vmm_softc->mode == VMM_MODE_VMX || vmm_softc->mode == VMM_MODE_EPT) ret = vcpu_init_vmx(vcpu); @@ -2765,6 +2970,9 @@ vcpu_deinit_vmx(struct vcpu *vcpu) if (vcpu->vc_vmx_msr_entry_load_va) km_free((void *)vcpu->vc_vmx_msr_entry_load_va, PAGE_SIZE, &kv_page, &kp_zero); + + if (vcpu->vc_vmx_vpid_enabled) + vmm_free_vpid(vcpu->vc_vpid); } /* @@ -3095,7 +3303,20 @@ vcpu_vmx_compute_ctrl(uint64_t ctrlval, uint16_t ctrl, uint32_t want1, /* * vm_get_info * - * Returns information about the VM indicated by 'vip'. + * Returns information about the VM indicated by 'vip'. The 'vip_size' field + * in the 'vip' parameter is used to indicate the size of the caller's buffer. + * If insufficient space exists in that buffer, the required size is + * returned in vip_size and the number of VM information structures returned + * in vip_info_count is set to 0. The caller should then try the ioctl again + * after allocating a sufficiently large buffer. + * + * Parameters: + * vip: information structure identifying the VM to query + * + * Return values: + * 0: the operation succeeded + * ENOMEM: memory allocation error during processing + * EFAULT: error copying data to user process */ int vm_get_info(struct vm_info_params *vip) @@ -3159,6 +3380,13 @@ vm_get_info(struct vm_info_params *vip) * vm_terminate * * Terminates the VM indicated by 'vtp'. + * + * Parameters: + * vtp: structure defining the VM to terminate + * + * Return values: + * 0: the VM was terminated + * !0: the VM could not be located */ int vm_terminate(struct vm_terminate_params *vtp) @@ -3171,7 +3399,7 @@ vm_terminate(struct vm_terminate_params *vtp) /* * Find desired VM */ - rw_enter_read(&vmm_softc->vm_lock); + rw_enter_write(&vmm_softc->vm_lock); error = vm_find(vtp->vtp_vm_id, &vm); if (error == 0) { @@ -3189,19 +3417,17 @@ vm_terminate(struct vm_terminate_params *vtp) old, next)); } rw_exit_read(&vm->vm_vcpu_lock); - } - rw_exit_read(&vmm_softc->vm_lock); - - if (error != 0) + } else { + rw_exit_write(&vmm_softc->vm_lock); return (error); + } - /* XXX possible race here two threads terminating the same vm?
*/ - rw_enter_write(&vmm_softc->vm_lock); SLIST_REMOVE(&vmm_softc->vm_list, vm, vm, vm_link); - rw_exit_write(&vmm_softc->vm_lock); if (vm->vm_vcpus_running == 0) vm_teardown(vm); + rw_exit_write(&vmm_softc->vm_lock); + return (0); } @@ -3209,6 +3435,18 @@ * vm_run * * Run the vm / vcpu specified by 'vrp' + * + * Parameters: + * vrp: structure defining the VM to run + * + * Return value: + * ENOENT: the VM defined in 'vrp' could not be located + * EBUSY: the VM defined in 'vrp' is already running + * EFAULT: error copying data from userspace (vmd) on return from previous + * exit. + * EAGAIN: help is needed from vmd(8) (device I/O, or an exit that vmm(4) + * cannot handle in-kernel) + * 0: the run loop exited and no help is needed from vmd(8) */ int vm_run(struct vm_run_params *vrp) @@ -3359,7 +3597,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *vrp) uint32_t insn_error, exit_reason; struct schedstate_percpu *spc; struct vmx_invvpid_descriptor vid; - uint32_t eii, procbased; + uint32_t eii, procbased, int_st; uint16_t irq; resume = 0; @@ -3374,8 +3612,9 @@ if (vrp->vrp_continue) { switch (vcpu->vc_gueststate.vg_exit_reason) { case VMX_EXIT_IO: - vcpu->vc_gueststate.vg_eax = - vcpu->vc_exit.vei.vei_data; + if (vcpu->vc_exit.vei.vei_dir == VEI_DIR_IN) + vcpu->vc_gueststate.vg_eax = + vcpu->vc_exit.vei.vei_data; break; case VMX_EXIT_HLT: break; @@ -3385,6 +3624,10 @@ break; case VMX_EXIT_EPT_VIOLATION: break; + case VMX_EXIT_CPUID: + break; + case VMX_EXIT_XSETBV: + break; #ifdef VMM_DEBUG case VMX_EXIT_TRIPLE_FAULT: DPRINTF("%s: vm %d vcpu %d triple fault\n", @@ -3462,24 +3705,27 @@ /* Handle vmd(8) injected interrupts */ /* Is there an interrupt pending injection?
*/ if (irq != 0xFFFF) { - if (!vcpu->vc_irqready) { - printf("%s: error - irq injected while not " - "ready\n", __func__); - ret = EINVAL; - break; - } - - eii = (irq & 0xFF); - eii |= (1ULL << 31); /* Valid */ - eii |= (0ULL << 8); /* Hardware Interrupt */ - if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) { - printf("%s: can't vector interrupt to guest\n", + if (vmread(VMCS_GUEST_INTERRUPTIBILITY_ST, &int_st)) { + printf("%s: can't get interruptibility state\n", __func__); ret = EINVAL; break; } - irq = 0xFFFF; + /* Interruptibility state 0x3 covers NMIs and STI */ + if (!(int_st & 0x3) && vcpu->vc_irqready) { + eii = (irq & 0xFF); + eii |= (1ULL << 31); /* Valid */ + eii |= (0ULL << 8); /* Hardware Interrupt */ + if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) { + printf("vcpu_run_vmx: can't vector " + "interrupt to guest\n"); + ret = EINVAL; + break; + } + + irq = 0xFFFF; + } } else if (!vcpu->vc_intr) { /* * Disable window exiting @@ -3500,13 +3746,52 @@ } } - /* Invalidate old TLB mappings */ - vid.vid_vpid = (uint64_t)vcpu->vc_parent->vm_id; - vid.vid_addr = 0ULL; - invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, &vid); + /* Inject event if present */ + if (vcpu->vc_event != 0) { + eii = (vcpu->vc_event & 0xFF); + eii |= (1ULL << 31); /* Valid */ + + /* Set the "Send error code" flag for certain vectors */ + switch (vcpu->vc_event & 0xFF) { + case VMM_EX_DF: + case VMM_EX_TS: + case VMM_EX_NP: + case VMM_EX_SS: + case VMM_EX_GP: + case VMM_EX_PF: + case VMM_EX_AC: + eii |= (1ULL << 11); + } + + eii |= (3ULL << 8); /* Hardware Exception */ + if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) { + printf("%s: can't vector event to guest\n", + __func__); + ret = EINVAL; + break; + } + + if (vmwrite(VMCS_ENTRY_EXCEPTION_ERROR_CODE, 0)) { + printf("%s: can't write error code to guest\n", + __func__); + ret = EINVAL; + break; + } + + vcpu->vc_event = 0; + } + + if (vcpu->vc_vmx_vpid_enabled) { + /* Invalidate old TLB mappings */ + vid.vid_vpid = vcpu->vc_parent->vm_id; + vid.vid_addr = 0; + invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, &vid); + } /* Start / resume the VCPU */ +#ifdef VMM_DEBUG KERNEL_ASSERT_LOCKED(); +#endif /* VMM_DEBUG */ KERNEL_UNLOCK(); ret = vmx_enter_guest(&vcpu->vc_control_pa, &vcpu->vc_gueststate, resume, gdt.rd_base); @@ -3540,12 +3825,16 @@ resume = 1; if (!(exitinfo & VMX_EXIT_INFO_HAVE_RIP)) { printf("%s: cannot read guest rip\n", __func__); + if (!locked) + KERNEL_LOCK(); ret = EINVAL; break; } if (!(exitinfo & VMX_EXIT_INFO_HAVE_REASON)) { printf("%s: cant read exit reason\n", __func__); + if (!locked) + KERNEL_LOCK(); ret = EINVAL; break; } @@ -3659,6 +3948,10 @@ } } + /* Copy the VCPU register state to the exit structure */ + if (vcpu_readregs_vmx(vcpu, VM_RWREGS_ALL, &vcpu->vc_exit.vrs)) + ret = EINVAL; + /* * We are heading back to userspace (vmd), either because we need help * handling an exit, a guest interrupt is pending, or we failed in some @@ -3672,6 +3965,9 @@ } else ret = EINVAL; +#ifdef VMM_DEBUG + KERNEL_ASSERT_LOCKED(); +#endif /* VMM_DEBUG */ return (ret); } @@ -3706,24 +4002,47 @@ /* * vmx_handle_hlt * - * Handle HLT exits + * Handle HLT exits. HLTing the CPU with interrupts disabled will terminate + * the guest (no NMIs handled) by returning EIO to vmd.
+ * + * Parameters: + * vcpu: The VCPU that executed the HLT instruction + * + * Return Values: + * EINVAL: An error occurred extracting information from the VMCS, or an + * invalid HLT instruction was encountered + * EIO: The guest halted with interrupts disabled + * EAGAIN: Normal return to vmd - vmd should halt scheduling this VCPU + * until a virtual interrupt is ready to inject + * */ int vmx_handle_hlt(struct vcpu *vcpu) { - uint32_t insn_length; + uint32_t insn_length, eflags; if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { printf("%s: can't obtain instruction length\n", __func__); return (EINVAL); } + if (vmread(VMCS_GUEST_IA32_RFLAGS, &eflags)) { + printf("%s: can't obtain guest eflags\n", __func__); + return (EINVAL); + } + if (insn_length != 1) { DPRINTF("%s: HLT with instruction length %d not supported\n", __func__, insn_length); return (EINVAL); } + if (!(eflags & PSL_I)) { + DPRINTF("%s: guest halted with interrupts disabled\n", + __func__); + return (EIO); + } + vcpu->vc_gueststate.vg_eip += insn_length; return (EAGAIN); } @@ -3758,7 +4077,7 @@ int vmx_handle_exit(struct vcpu *vcpu) { uint64_t exit_reason; - uint32_t eflags; + uint32_t eflags, istate; int update_rip, ret = 0; update_rip = 0; @@ -3808,6 +4127,24 @@ vmx_handle_exit(struct vcpu *vcpu) ret = vmx_handle_wrmsr(vcpu); update_rip = 1; break; + case VMX_EXIT_MWAIT: + case VMX_EXIT_MONITOR: + case VMX_EXIT_VMXON: + case VMX_EXIT_VMWRITE: + case VMX_EXIT_VMREAD: + case VMX_EXIT_VMLAUNCH: + case VMX_EXIT_VMRESUME: + case VMX_EXIT_VMPTRLD: + case VMX_EXIT_VMPTRST: + case VMX_EXIT_VMCLEAR: + case VMX_EXIT_VMCALL: + case VMX_EXIT_VMFUNC: + case VMX_EXIT_VMXOFF: + case VMX_EXIT_INVVPID: + case VMX_EXIT_INVEPT: + ret = vmm_inject_ud(vcpu); + update_rip = 0; + break; case VMX_EXIT_TRIPLE_FAULT: #ifdef VMM_DEBUG DPRINTF("%s: vm %d vcpu %d triple fault\n", __func__, @@ -3831,11 +4168,99 @@ printf("%s: can't advance rip\n", __func__); return (EINVAL); } + + if (vmread(VMCS_GUEST_INTERRUPTIBILITY_ST, + &istate)) { + printf("%s: can't read interruptibility state\n", + __func__); + return (EINVAL); + } + + /* Interruptibility state 0x3 covers NMIs and STI */ + istate &= ~0x3; + + if (vmwrite(VMCS_GUEST_INTERRUPTIBILITY_ST, + istate)) { + printf("%s: can't write interruptibility state\n", + __func__); + return (EINVAL); + } + + if (eflags & PSL_T) { + if (vmm_inject_db(vcpu)) { + printf("%s: can't inject #DB exception to " + "guest", __func__); + return (EINVAL); + } + } } return (ret); } +/* + * vmm_inject_gp + * + * Injects a #GP exception into the guest VCPU. + * + * Parameters: + * vcpu: vcpu to inject into + * + * Return values: + * Always 0 + */ +int +vmm_inject_gp(struct vcpu *vcpu) +{ + DPRINTF("%s: injecting #GP at guest %%eip 0x%x\n", __func__, + vcpu->vc_gueststate.vg_eip); + vcpu->vc_event = VMM_EX_GP; + + return (0); +} + +/* + * vmm_inject_ud + * + * Injects a #UD exception into the guest VCPU. + * + * Parameters: + * vcpu: vcpu to inject into + * + * Return values: + * Always 0 + */ +int +vmm_inject_ud(struct vcpu *vcpu) +{ + DPRINTF("%s: injecting #UD at guest %%eip 0x%x\n", __func__, + vcpu->vc_gueststate.vg_eip); + vcpu->vc_event = VMM_EX_UD; + + return (0); +} + +/* + * vmm_inject_db + * + * Injects a #DB exception into the guest VCPU.
+ * + * Parameters: + * vcpu: vcpu to inject into + * + * Return values: + * Always 0 + */ +int +vmm_inject_db(struct vcpu *vcpu) +{ + DPRINTF("%s: injecting #DB at guest %%eip 0x%x\n", __func__, + vcpu->vc_gueststate.vg_eip); + vcpu->vc_event = VMM_EX_DB; + + return (0); +} + /* * vmm_get_guest_memtype * @@ -3961,8 +4386,10 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa) ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, fault_type, PROT_READ | PROT_WRITE | PROT_EXEC); + if (ret) - printf("%s: uvm_fault returns %d\n", __func__, ret); + printf("%s: uvm_fault returns %d, GPA=0x%llx, eip=0x%x\n", + __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_eip); return (ret); } @@ -4061,24 +4488,283 @@ vmx_handle_inout(struct vcpu *vcpu) switch (vcpu->vc_exit.vei.vei_port) { case IO_ICU1 ... IO_ICU1 + 1: case 0x40 ... 0x43: + case PCKBC_AUX: case IO_RTC ... IO_RTC + 1: case IO_ICU2 ... IO_ICU2 + 1: case 0x3f8 ... 0x3ff: + case ELCR0 ... ELCR1: case 0xcf8: - case 0xcfc: + case 0xcfc ... 0xcff: + case 0x500 ... 0x50f: case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: ret = EAGAIN; break; default: /* Read from unsupported ports returns FFs */ - if (vcpu->vc_exit.vei.vei_dir == 1) - vcpu->vc_gueststate.vg_eax = 0xFFFFFFFF; + if (vcpu->vc_exit.vei.vei_dir == VEI_DIR_IN) { + if (vcpu->vc_exit.vei.vei_size == 4) + vcpu->vc_gueststate.vg_eax |= 0xFFFFFFFF; + else if (vcpu->vc_exit.vei.vei_size == 2) + vcpu->vc_gueststate.vg_eax |= 0xFFFF; + else if (vcpu->vc_exit.vei.vei_size == 1) + vcpu->vc_gueststate.vg_eax |= 0xFF; + } ret = 0; } return (ret); } +/* + * vmx_load_pdptes + * + * Update the PDPTEs in the VMCS with the values currently indicated by the + * guest CR3. This is used for 32-bit PAE guests when enabling paging. + * + * Parameters + * vcpu: The vcpu whose PDPTEs should be loaded + * + * Return values: + * 0: if successful + * EINVAL: if the PDPTEs could not be loaded + * ENOMEM: memory allocation failure + */ +int +vmx_load_pdptes(struct vcpu *vcpu) +{ + uint32_t cr3, cr3_host_phys; + vaddr_t cr3_host_virt; + pd_entry_t *pdptes; + int ret; + + if (vmread(VMCS_GUEST_IA32_CR3, &cr3)) { + printf("%s: can't read guest cr3\n", __func__); + return (EINVAL); + } + + if (!pmap_extract(vcpu->vc_parent->vm_map->pmap, (vaddr_t)cr3, + (paddr_t *)&cr3_host_phys)) { + DPRINTF("%s: nonmapped guest CR3, setting PDPTEs to 0\n", + __func__); + if (vmwrite(VMCS_GUEST_PDPTE0, 0)) { + printf("%s: can't write guest PDPTE0\n", __func__); + return (EINVAL); + } + + if (vmwrite(VMCS_GUEST_PDPTE1, 0)) { + printf("%s: can't write guest PDPTE1\n", __func__); + return (EINVAL); + } + + if (vmwrite(VMCS_GUEST_PDPTE2, 0)) { + printf("%s: can't write guest PDPTE2\n", __func__); + return (EINVAL); + } + + if (vmwrite(VMCS_GUEST_PDPTE3, 0)) { + printf("%s: can't write guest PDPTE3\n", __func__); + return (EINVAL); + } + return (0); + } + + ret = 0; + + cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_waitok); + if (!cr3_host_virt) { + printf("%s: can't allocate address for guest CR3 mapping\n", + __func__); + return (ENOMEM); + } + + pmap_kenter_pa(cr3_host_virt, cr3_host_phys, PROT_READ); + + pdptes = (pd_entry_t *)cr3_host_virt; + if (vmwrite(VMCS_GUEST_PDPTE0, pdptes[0])) { + printf("%s: can't write guest PDPTE0\n", __func__); + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_PDPTE1, pdptes[1])) { + printf("%s: can't write guest PDPTE1\n", __func__); + ret = EINVAL; + goto exit; + } + + if (vmwrite(VMCS_GUEST_PDPTE2, pdptes[2])) { + printf("%s: can't write guest PDPTE2\n", __func__); + ret = EINVAL; 
+ goto exit; + } + + if (vmwrite(VMCS_GUEST_PDPTE3, pdptes[3])) { + printf("%s: can't write guest PDPTE3\n", __func__); + ret = EINVAL; + goto exit; + } + +exit: + pmap_kremove(cr3_host_virt, PAGE_SIZE); + km_free((void *)cr3_host_virt, PAGE_SIZE, &kv_any, &kp_none); + return (ret); +} + +/* + * vmx_handle_cr0_write + * + * Write handler for CR0. This function ensures valid values are written into + * CR0 for the cpu/vmm mode in use (cr0 must-be-0 and must-be-1 bits, etc). + * + * Parameters + * vcpu: The vcpu taking the cr0 write exit + * r: The guest's desired (incoming) cr0 value + * + * Return values: + * 0: if successful + * EINVAL: if an error occurred + */ +int +vmx_handle_cr0_write(struct vcpu *vcpu, uint32_t r) +{ + struct vmx_msr_store *msr_store; + uint32_t ectls, oldcr0, cr4, mask; + int ret; + + /* Check must-be-0 bits */ + mask = ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1); + if (r & mask) { + /* Inject #GP, let the guest handle it */ + DPRINTF("%s: guest set invalid bits in %%cr0. Zeros " + "mask=0x%llx, data=0x%x\n", __func__, + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1, + r); + vmm_inject_gp(vcpu); + return (0); + } + + /* Check must-be-1 bits */ + mask = curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0; + if ((r & mask) != mask) { + /* Inject #GP, let the guest handle it */ + DPRINTF("%s: guest set invalid bits in %%cr0. Ones " + "mask=0x%llx, data=0x%x\n", __func__, + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0, + r); + vmm_inject_gp(vcpu); + return (0); + } + + if (vmread(VMCS_GUEST_IA32_CR0, &oldcr0)) { + printf("%s: can't read guest cr0\n", __func__); + return (EINVAL); + } + + if (vmread(VMCS_GUEST_IA32_CR4, &cr4)) { + printf("%s: can't read guest cr4\n", __func__); + return (EINVAL); + } + + /* CR0 must always have NE set */ + r |= CR0_NE; + + if (vmwrite(VMCS_GUEST_IA32_CR0, r)) { + printf("%s: can't write guest cr0\n", __func__); + return (EINVAL); + } + + /* If the guest hasn't enabled paging, nothing more to do. */ + if (!(r & CR0_PG)) + return (0); + + /* + * Since the guest has enabled paging, the IA32_VMX_IA32E_MODE_GUEST + * control must be set to the same value as EFER_LME. + */ + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; + + if (vmread(VMCS_ENTRY_CTLS, &ectls)) { + printf("%s: can't read entry controls", __func__); + return (EINVAL); + } + + if (msr_store[VCPU_REGS_EFER].vms_data & EFER_LME) + ectls |= IA32_VMX_IA32E_MODE_GUEST; + else + ectls &= ~IA32_VMX_IA32E_MODE_GUEST; + + if (vmwrite(VMCS_ENTRY_CTLS, ectls)) { + printf("%s: can't write entry controls", __func__); + return (EINVAL); + } + + /* Load PDPTEs if PAE guest enabling paging */ + if (!(oldcr0 & CR0_PG) && (r & CR0_PG) && (cr4 & CR4_PAE)) { + ret = vmx_load_pdptes(vcpu); + + if (ret) { + printf("%s: updating PDPTEs failed\n", __func__); + return (ret); + } + } + + return (0); +} + +/* + * vmx_handle_cr4_write + * + * Write handler for CR4. This function ensures valid values are written into + * CR4 for the cpu/vmm mode in use (cr4 must-be-0 and must-be-1 bits, etc). + * + * Parameters + * vcpu: The vcpu taking the cr4 write exit + * r: The guest's desired (incoming) cr4 value + * + * Return values: + * 0: if successful + * EINVAL: if an error occurred + */ +int +vmx_handle_cr4_write(struct vcpu *vcpu, uint32_t r) +{ + uint64_t mask; + + /* Check must-be-0 bits */ + mask = ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1); + if (r & mask) { + /* Inject #GP, let the guest handle it */ + DPRINTF("%s: guest set invalid bits in %%cr4.
Zeros " + "mask=0x%llx, data=0x%x\n", __func__, + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1, + r); + vmm_inject_gp(vcpu); + return (0); + } + + /* Check must-be-1 bits */ + mask = curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0; + if ((r & mask) != mask) { + /* Inject #GP, let the guest handle it */ + DPRINTF("%s: guest set invalid bits in %%cr4. Ones " + "mask=0x%llx, data=0x%x\n", __func__, + curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0, + r); + vmm_inject_gp(vcpu); + return (0); + } + + /* CR4_VMXE must always be enabled */ + r |= CR4_VMXE; + + if (vmwrite(VMCS_GUEST_IA32_CR4, r)) { + printf("%s: can't write guest cr4\n", __func__); + return (EINVAL); + } + + return (0); +} + /* * vmx_handle_cr * @@ -4087,8 +4773,8 @@ vmx_handle_inout(struct vcpu *vcpu) int vmx_handle_cr(struct vcpu *vcpu) { - uint32_t insn_length, exit_qual; - uint8_t crnum, dir; + uint32_t insn_length, exit_qual, r; + uint8_t crnum, dir, reg; if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { printf("%s: can't obtain instruction length\n", __func__); return (EINVAL); } @@ -4103,12 +4789,43 @@ /* Low 4 bits of exit_qual represent the CR number */ crnum = exit_qual & 0xf; + /* + * Bits 5:4 indicate the direction of operation (or special CR-modifying + * instruction) + */ dir = (exit_qual & 0x30) >> 4; + /* Bits 11:8 encode the source/target register */ + reg = (exit_qual & 0xf00) >> 8; + switch (dir) { case CR_WRITE: - DPRINTF("%s: mov to cr%d @ %x\n", __func__, - crnum, vcpu->vc_gueststate.vg_eip); + if (crnum == 0 || crnum == 4) { + switch (reg) { + case 0: r = vcpu->vc_gueststate.vg_eax; break; + case 1: r = vcpu->vc_gueststate.vg_ecx; break; + case 2: r = vcpu->vc_gueststate.vg_edx; break; + case 3: r = vcpu->vc_gueststate.vg_ebx; break; + case 4: if (vmread(VMCS_GUEST_IA32_RSP, &r)) { + printf("%s: unable to read guest " + "RSP\n", __func__); + return (EINVAL); + } + break; + case 5: r = vcpu->vc_gueststate.vg_ebp; break; + case 6: r = vcpu->vc_gueststate.vg_esi; break; + case 7: r = vcpu->vc_gueststate.vg_edi; break; + } + DPRINTF("%s: mov to cr%d @ %x, data=0x%x\n", + __func__, crnum, vcpu->vc_gueststate.vg_eip, r); + } + + if (crnum == 0) + vmx_handle_cr0_write(vcpu, r); + + if (crnum == 4) + vmx_handle_cr4_write(vcpu, r); + break; case CR_READ: DPRINTF("%s: mov from cr%d @ %x\n", __func__, @@ -4138,21 +4855,24 @@ * Handler for rdmsr instructions. Bitmap MSRs are allowed implicit access * and won't end up here. This handler is primarily intended to catch otherwise * unknown MSR access for possible later inclusion in the bitmap list. For - * each MSR access that ends up here, we log the access.
+ * each MSR access that ends up here, we log the access (when VMM_DEBUG is + * enabled) * * Parameters: * vcpu: vcpu structure containing instruction info causing the exit * * Return value: * 0: The operation was successful - * 1: An error occurred + * EINVAL: An error occurred */ int vmx_handle_rdmsr(struct vcpu *vcpu) { uint32_t insn_length; - uint64_t msr; - uint32_t *eax, *ecx, *edx; + uint32_t *eax, *edx; +#ifdef VMM_DEBUG + uint32_t *ecx; +#endif /* VMM_DEBUG */ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) { printf("%s: can't obtain instruction length\n", __func__); @@ -4166,28 +4886,57 @@ vmx_handle_rdmsr(struct vcpu *vcpu) } eax = &vcpu->vc_gueststate.vg_eax; - ecx = &vcpu->vc_gueststate.vg_ecx; edx = &vcpu->vc_gueststate.vg_edx; - msr = rdmsr(*ecx); - *eax = msr & 0xFFFFFFFFULL; - *edx = msr >> 32; + *eax = 0; + *edx = 0; - /* XXX log the access for now, to be able to identify unknown MSRs */ - printf("%s: rdmsr exit, msr=0x%x, data returned to " +#ifdef VMM_DEBUG + /* Log the access, to be able to identify unknown MSRs */ + ecx = &vcpu->vc_gueststate.vg_ecx; + DPRINTF("%s: rdmsr exit, msr=0x%x, data returned to " "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax); +#endif /* VMM_DEBUG */ vcpu->vc_gueststate.vg_eip += insn_length; return (0); } +/* + * vmx_handle_misc_enable_msr + * + * Handler for writes to the MSR_MISC_ENABLE (0x1a0) MSR on Intel CPUs. We + * limit what the guest can write to this MSR (certain hardware-related + * settings like speedstep, etc). + * + * Parameters: + * vcpu: vcpu structure containing information about the wrmsr causing this + * exit + */ +void +vmx_handle_misc_enable_msr(struct vcpu *vcpu) +{ + uint32_t *eax, *edx; + struct vmx_msr_store *msr_store; + + eax = &vcpu->vc_gueststate.vg_eax; + edx = &vcpu->vc_gueststate.vg_edx; + msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; + + /* Filter out guest writes to TCC, EIST, and xTPR */ + *eax &= ~(MISC_ENABLE_TCC | MISC_ENABLE_EIST_ENABLED | + MISC_ENABLE_xTPR_MESSAGE_DISABLE); + + msr_store[VCPU_REGS_MISC_ENABLE].vms_data = *eax | (((uint64_t) *edx << 32)); +} + /* * vmx_handle_wrmsr * * Handler for wrmsr instructions. This handler logs the access, and discards - * the written data. Any valid wrmsr will not end up here (it will be - * whitelisted in the MSR bitmap). + * the written data (when VMM_DEBUG is enabled). Any valid wrmsr will not end + * up here (it will be whitelisted in the MSR bitmap). 
* * Parameters: * vcpu: vcpu structure containing instruction info causing the exit @@ -4217,9 +4966,20 @@ vmx_handle_wrmsr(struct vcpu *vcpu) ecx = &vcpu->vc_gueststate.vg_ecx; edx = &vcpu->vc_gueststate.vg_edx; - /* XXX log the access for now, to be able to identify unknown MSRs */ - printf("%s: wrmsr exit, msr=0x%x, discarding data written from " - "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax); + switch (*ecx) { + case MSR_MISC_ENABLE: + vmx_handle_misc_enable_msr(vcpu); + break; +#ifdef VMM_DEBUG + default: + /* + * Log the access, to be able to identify unknown MSRs + */ + DPRINTF("%s: wrmsr exit, msr=0x%x, discarding data " + "written from guest=0x%x:0x%x\n", __func__, + *ecx, *edx, *eax); +#endif /* VMM_DEBUG */ + } vcpu->vc_gueststate.vg_eip += insn_length; @@ -4242,7 +5002,9 @@ int vmm_handle_cpuid(struct vcpu *vcpu) { uint32_t insn_length; - uint32_t *eax, *ebx, *ecx, *edx; + uint32_t *eax, *ebx, *ecx, *edx, cpuid_limit; + uint32_t new_eax, new_ebx, new_ecx, new_edx; + struct vmx_msr_store *msr_store; if (vmm_softc->mode == VMM_MODE_VMX || vmm_softc->mode == VMM_MODE_EPT) { @@ -4252,11 +5014,11 @@ vmm_handle_cpuid(struct vcpu *vcpu) return (EINVAL); } - if (insn_length != 2) { - DPRINTF("%s: CPUID with instruction length %d not " - "supported\n", __func__, insn_length); - return (EINVAL); - } + eax = &vcpu->vc_gueststate.vg_eax; + msr_store = + (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va; + cpuid_limit = msr_store[VCPU_REGS_MISC_ENABLE].vms_data & + MISC_ENABLE_LIMIT_CPUID_MAXVAL; } eax = &vcpu->vc_gueststate.vg_eax; @@ -4264,6 +5026,19 @@ vmm_handle_cpuid(struct vcpu *vcpu) ecx = &vcpu->vc_gueststate.vg_ecx; edx = &vcpu->vc_gueststate.vg_edx; + /* + * "CPUID leaves above 02H and below 80000000H are only visible when + * IA32_MISC_ENABLE MSR has bit 22 set to its default value 0" + */ + if ((vmm_softc->mode == VMM_MODE_VMX || vmm_softc->mode == VMM_MODE_EPT) + && cpuid_limit && (*eax > 0x02 && *eax < 0x80000000)) { + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + return (0); + } + switch (*eax) { case 0x00: /* Max level and vendor ID */ *eax = 0x07; /* cpuid_level */ @@ -4298,12 +5073,19 @@ vmm_handle_cpuid(struct vcpu *vcpu) *edx = 0; break; case 0x04: - DPRINTF("%s: function 0x04 (deterministic cache info) not " - "supported\n", __func__); - *eax = 0; - *ebx = 0; - *ecx = 0; - *edx = 0; + if (*ecx == 0) { + CPUID_LEAF(*eax, 0, new_eax, new_ebx, new_ecx, new_edx); + *eax = new_eax & VMM_CPUID4_CACHE_TOPOLOGY_MASK; + *ebx = new_ebx; + *ecx = new_ecx; + *edx = new_edx; + } else { + CPUID_LEAF(*eax, *ecx, new_eax, new_ebx, new_ecx, new_edx); + *eax = new_eax & VMM_CPUID4_CACHE_TOPOLOGY_MASK; + *ebx = new_ebx; + *ecx = new_ecx; + *edx = new_edx; + } break; case 0x05: /* MONITOR/MWAIT (not supported) */ DPRINTF("%s: function 0x05 (monitor/mwait) not supported\n", @@ -4489,6 +5271,72 @@ vcpu_run_svm(struct vcpu *vcpu, struct vm_run_params *vrp) return (0); } +/* + * vmm_alloc_vpid + * + * Sets the memory location pointed to by "vpid" to the next available VPID + * or ASID. + * + * Parameters: + * vpid: Pointer to location to receive the next VPID/ASID + * + * Return Values: + * 0: The operation completed successfully + * ENOMEM: No VPIDs/ASIDs were available. Content of 'vpid' is unchanged. 
+ */ +int +vmm_alloc_vpid(uint16_t *vpid) +{ + uint16_t i; + uint8_t idx, bit; + struct vmm_softc *sc = vmm_softc; + + rw_enter_write(&vmm_softc->vpid_lock); + for (i = 1; i <= sc->max_vpid; i++) { + idx = i / 8; + bit = i - (idx * 8); + + if (!(sc->vpids[idx] & (1 << bit))) { + sc->vpids[idx] |= (1 << bit); + *vpid = i; + DPRINTF("%s: allocated VPID/ASID %d\n", __func__, + i); + rw_exit_write(&vmm_softc->vpid_lock); + return 0; + } + } + + printf("%s: no available %ss\n", __func__, + (sc->mode == VMM_MODE_EPT || sc->mode == VMM_MODE_VMX) ? "VPID" : + "ASID"); + + rw_exit_write(&vmm_softc->vpid_lock); + return ENOMEM; +} + +/* + * vmm_free_vpid + * + * Frees the VPID/ASID id supplied in "vpid". + * + * Parameters: + * vpid: VPID/ASID to free. + */ +void +vmm_free_vpid(uint16_t vpid) +{ + uint8_t idx, bit; + struct vmm_softc *sc = vmm_softc; + + rw_enter_write(&vmm_softc->vpid_lock); + idx = vpid / 8; + bit = vpid - (idx * 8); + sc->vpids[idx] &= ~(1 << bit); + + DPRINTF("%s: freed VPID/ASID %d\n", __func__, vpid); + rw_exit_write(&vmm_softc->vpid_lock); +} + /* * vmx_exit_reason_decode * diff --git a/sys/arch/i386/include/vmmvar.h b/sys/arch/i386/include/vmmvar.h index 8ffbd2ee4c3..51c03e53c37 100644 --- a/sys/arch/i386/include/vmmvar.h +++ b/sys/arch/i386/include/vmmvar.h @@ -256,6 +256,31 @@ #define SVM_VMEXIT_NPF 0x400 #define SVM_VMEXIT_INVALID -1 +/* + * Exception injection vectors (these correspond to the CPU exception types + * defined in the SDM.) + */ +#define VMM_EX_DE 0 /* Divide Error #DE */ +#define VMM_EX_DB 1 /* Debug Exception #DB */ +#define VMM_EX_NMI 2 /* NMI */ +#define VMM_EX_BP 3 /* Breakpoint #BP */ +#define VMM_EX_OF 4 /* Overflow #OF */ +#define VMM_EX_BR 5 /* Bound range exceeded #BR */ +#define VMM_EX_UD 6 /* Undefined opcode #UD */ +#define VMM_EX_NM 7 /* Device not available #NM */ +#define VMM_EX_DF 8 /* Double fault #DF */ +#define VMM_EX_CP 9 /* Coprocessor segment overrun (unused) */ +#define VMM_EX_TS 10 /* Invalid TSS #TS */ +#define VMM_EX_NP 11 /* Segment not present #NP */ +#define VMM_EX_SS 12 /* Stack segment fault #SS */ +#define VMM_EX_GP 13 /* General protection #GP */ +#define VMM_EX_PF 14 /* Page fault #PF */ +#define VMM_EX_MF 16 /* x87 FPU floating point error #MF */ +#define VMM_EX_AC 17 /* Alignment check #AC */ +#define VMM_EX_MC 18 /* Machine check #MC */ +#define VMM_EX_XM 19 /* SIMD floating point exception #XM */ +#define VMM_EX_VE 20 /* Virtualization exception #VE */ + /* * VCPU state values. 
Note that there is a conversion function in vmm.c * (vcpu_state_decode) that converts these to human readable strings, @@ -340,7 +365,13 @@ struct vcpu_segment_info { #define VCPU_REGS_NSREGS (VCPU_REGS_TR + 1) #define VCPU_REGS_EFER 0 -#define VCPU_REGS_NMSRS (VCPU_REGS_EFER + 1) +#define VCPU_REGS_STAR 1 +#define VCPU_REGS_LSTAR 2 +#define VCPU_REGS_CSTAR 3 +#define VCPU_REGS_SFMASK 4 +#define VCPU_REGS_KGSBASE 5 +#define VCPU_REGS_MISC_ENABLE 6 +#define VCPU_REGS_NMSRS (VCPU_REGS_MISC_ENABLE + 1) struct vcpu_reg_state { uint32_t vrs_gprs[VCPU_REGS_NGPRS]; @@ -518,6 +549,11 @@ struct vm_rwregs_params { SEFF0EBX_PCOMMIT | SEFF0EBX_PT) #define VMM_SEFF0ECX_MASK 0xFFFFFFFF +/* + * CPUID[0x4] deterministic cache info + */ +#define VMM_CPUID4_CACHE_TOPOLOGY_MASK 0x3FF + #ifdef _KERNEL #define VMX_FAIL_LAUNCH_UNKNOWN 1 @@ -726,6 +762,7 @@ struct vcpu { struct vm *vc_parent; uint32_t vc_id; + uint16_t vc_vpid; u_int vc_state; SLIST_ENTRY(vcpu) vc_vcpu_link; @@ -737,6 +774,8 @@ struct vcpu { uint16_t vc_intr; uint8_t vc_irqready; + uint8_t vc_event; + /* VMX only */ uint64_t vc_vmx_basic; uint64_t vc_vmx_entry_ctls; @@ -755,6 +794,7 @@ struct vcpu { paddr_t vc_vmx_msr_exit_load_pa; vaddr_t vc_vmx_msr_entry_load_va; paddr_t vc_vmx_msr_entry_load_pa; + uint8_t vc_vmx_vpid_enabled; /* SVM only */ vaddr_t vc_svm_hsa_va; @@ -779,6 +819,8 @@ int vmx_enter_guest(uint64_t *, struct vmx_gueststate *, int, vaddr_t); void start_vmm_on_cpu(struct cpu_info *); void stop_vmm_on_cpu(struct cpu_info *); +typedef u_int64_t pd_entry_t; + #endif /* _KERNEL */ #endif /* ! _MACHINE_VMMVAR_H_ */ -- 2.20.1
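A short illustration to accompany the diff above: the new vmm_alloc_vpid()/vmm_free_vpid() pair replaces the old scheme of reusing the raw vm_id as the VPID with a bitmap allocator over the VPID/ASID space. The following self-contained userland C sketch shows the same technique in isolation. It is illustrative only, not part of the patch: the names (alloc_id, free_id, MAX_ID, id_bitmap) are hypothetical, the kernel's vpid_lock serialization is omitted, and wider index types are used for the bitmap arithmetic. ID 0 is kept reserved, as in the kernel code, since VPID 0 denotes the host context on VMX.

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_ID	0xFFF			/* like sc->max_vpid in VMX mode */

	static uint8_t id_bitmap[512];		/* 512 bytes * 8 bits = 4096 IDs */

	/* Allocate the lowest free ID, starting at 1 (ID 0 stays reserved). */
	int
	alloc_id(uint16_t *id)
	{
		uint16_t i, idx, bit;

		for (i = 1; i <= MAX_ID; i++) {
			idx = i / 8;
			bit = i % 8;
			if (!(id_bitmap[idx] & (1 << bit))) {
				id_bitmap[idx] |= (1 << bit);
				*id = i;
				return (0);
			}
		}
		return (-1);			/* pool exhausted */
	}

	/* Return an ID to the pool by clearing its bit. */
	void
	free_id(uint16_t id)
	{
		id_bitmap[id / 8] &= ~(1 << (id % 8));
	}

	int
	main(void)
	{
		uint16_t a, b;

		if (alloc_id(&a) == 0 && alloc_id(&b) == 0)
			printf("allocated %u and %u\n", a, b);	/* 1 and 2 */
		free_id(a);
		if (alloc_id(&a) == 0)
			printf("reallocated %u\n", a);		/* 1 again */
		return (0);
	}

As in the kernel version, freeing an ID makes the lowest free ID available for reuse first, and allocation cost is a linear scan bounded by max_vpid, which is acceptable given how rarely VCPUs are created and torn down.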