From c4fd4c5b29fc2f24970f3ce1ba4877296028afcf Mon Sep 17 00:00:00 2001 From: dv Date: Wed, 10 Jul 2024 09:27:32 +0000 Subject: [PATCH] Split vmd into mi/md parts. Makes as much of the core of vmd mi, pushing x86-isms into separate compilation units. Adds build logic for arm64, but no emulation yet. (You can build vmd, but it won't have a vmm device to connect to.) Some more cleanup probably needed around interrupt controller abstraction, but that can come as we implement more than the i8259. ok mlarkin@ --- sys/arch/amd64/include/vmmvar.h | 23 +- usr.sbin/vmctl/Makefile | 4 +- usr.sbin/vmd/Makefile | 23 +- usr.sbin/vmd/i8253.c | 5 +- usr.sbin/vmd/mc146818.c | 5 +- usr.sbin/vmd/mmio.c | 1046 -------------------------- usr.sbin/vmd/ns8250.c | 7 +- usr.sbin/vmd/pci.c | 16 +- usr.sbin/vmd/pci.h | 11 +- usr.sbin/vmd/vioblk.c | 6 +- usr.sbin/vmd/virtio.c | 23 +- usr.sbin/vmd/virtio.h | 6 +- usr.sbin/vmd/vm.c | 1247 ++----------------------------- usr.sbin/vmd/vmd.c | 131 +--- usr.sbin/vmd/vmd.h | 47 +- usr.sbin/vmd/vmm.c | 6 +- 16 files changed, 155 insertions(+), 2451 deletions(-) diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h index 50c1f15b0a3..b67d3f0a7ba 100644 --- a/sys/arch/amd64/include/vmmvar.h +++ b/sys/arch/amd64/include/vmmvar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmmvar.h,v 1.102 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: vmmvar.h,v 1.103 2024/07/10 09:27:32 dv Exp $ */ /* * Copyright (c) 2014 Mike Larkin * @@ -88,15 +88,15 @@ #define VMX_EXIT_XSAVES 63 #define VMX_EXIT_XRSTORS 64 +#define VM_EXIT_TERMINATED 0xFFFE +#define VM_EXIT_NONE 0xFFFF + /* * VMX: Misc defines */ #define VMX_MAX_CR3_TARGETS 256 #define VMX_VMCS_PA_CLEAR 0xFFFFFFFFFFFFFFFFUL -#define VM_EXIT_TERMINATED 0xFFFE -#define VM_EXIT_NONE 0xFFFF - /* * SVM: Intercept codes (exit reasons) */ @@ -473,20 +473,6 @@ struct vm_intr_params { uint16_t vip_intr; }; -#define VM_RWVMPARAMS_PVCLOCK_SYSTEM_GPA 0x1 /* read/write pvclock gpa */ -#define 
VM_RWVMPARAMS_PVCLOCK_VERSION 0x2 /* read/write pvclock version */ -#define VM_RWVMPARAMS_ALL (VM_RWVMPARAMS_PVCLOCK_SYSTEM_GPA | \ - VM_RWVMPARAMS_PVCLOCK_VERSION) - -struct vm_rwvmparams_params { - /* Input parameters to VMM_IOC_READVMPARAMS/VMM_IOC_WRITEVMPARAMS */ - uint32_t vpp_vm_id; - uint32_t vpp_vcpu_id; - uint32_t vpp_mask; - paddr_t vpp_pvclock_system_gpa; - uint32_t vpp_pvclock_version; -}; - #define VM_RWREGS_GPRS 0x1 /* read/write GPRs */ #define VM_RWREGS_SREGS 0x2 /* read/write segment registers */ #define VM_RWREGS_CRS 0x4 /* read/write CRs */ @@ -936,7 +922,6 @@ int vm_impl_init(struct vm *, struct proc *); void vm_impl_deinit(struct vm *); int vcpu_init(struct vcpu *); void vcpu_deinit(struct vcpu *); -int vm_rwvmparams(struct vm_rwvmparams_params *, int); int vm_rwregs(struct vm_rwregs_params *, int); int vcpu_reset_regs(struct vcpu *, struct vcpu_reg_state *); diff --git a/usr.sbin/vmctl/Makefile b/usr.sbin/vmctl/Makefile index 4dab88fba7b..05359639a3b 100644 --- a/usr.sbin/vmctl/Makefile +++ b/usr.sbin/vmctl/Makefile @@ -1,6 +1,6 @@ -# $OpenBSD: Makefile,v 1.6 2019/01/18 01:24:07 pd Exp $ +# $OpenBSD: Makefile,v 1.7 2024/07/10 09:27:33 dv Exp $ -.if ${MACHINE} == "amd64" +.if ${MACHINE} == "amd64" || ${MACHINE} == "arm64" .PATH: ${.CURDIR}/../vmd diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile index 3fbb9d086b1..22c1e887823 100644 --- a/usr.sbin/vmd/Makefile +++ b/usr.sbin/vmd/Makefile @@ -1,13 +1,20 @@ -# $OpenBSD: Makefile,v 1.29 2023/04/27 22:47:27 dv Exp $ +# $OpenBSD: Makefile,v 1.30 2024/07/10 09:27:33 dv Exp $ -.if ${MACHINE} == "amd64" +.if ${MACHINE} == "amd64" || ${MACHINE} == "arm64" PROG= vmd -SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c -SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c -SRCS+= ns8250.c i8253.c dhcp.c packet.c mmio.c -SRCS+= parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c fw_cfg.c -SRCS+= vm_agentx.c vioblk.c vionet.c +SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c vm.c 
+SRCS+= pci.c virtio.c dhcp.c packet.c parse.y atomicio.c +SRCS+= vioscsi.c vioraw.c vioqcow2.c vm_agentx.c vioblk.c +SRCS+= vionet.c + +.if ${MACHINE} == "amd64" +SRCS+= i8253.c i8259.c fw_cfg.c loadfile_elf.c mc146818.c ns8250.c +SRCS+= x86_vm.c x86_mmio.c +.endif # amd64 +.if ${MACHINE} == "arm64" +SRCS+= arm64_vm.c +.endif # arm64 CFLAGS+= -Wall -I${.CURDIR} CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes @@ -24,7 +31,7 @@ YFLAGS= NOPROG= yes -.endif +.endif # amd64 or arm64 MAN= vmd.8 vm.conf.5 diff --git a/usr.sbin/vmd/i8253.c b/usr.sbin/vmd/i8253.c index ac9855e38be..7cea3fa3869 100644 --- a/usr.sbin/vmd/i8253.c +++ b/usr.sbin/vmd/i8253.c @@ -1,4 +1,4 @@ -/* $OpenBSD: i8253.c,v 1.40 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: i8253.c,v 1.41 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2016 Mike Larkin * @@ -29,7 +29,6 @@ #include "i8253.h" #include "vmd.h" -#include "vmm.h" #include "atomicio.h" extern char *__progname; @@ -369,7 +368,7 @@ i8253_fire(int fd, short type, void *arg) struct timeval tv; struct i8253_channel *ctr = (struct i8253_channel *)arg; - vcpu_assert_pic_irq(ctr->vm_id, 0, 0); + vcpu_assert_irq(ctr->vm_id, 0, 0); if (ctr->mode != TIMER_INTTC) { timerclear(&tv); diff --git a/usr.sbin/vmd/mc146818.c b/usr.sbin/vmd/mc146818.c index 660c625ebeb..62fc6459a8f 100644 --- a/usr.sbin/vmd/mc146818.c +++ b/usr.sbin/vmd/mc146818.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mc146818.c,v 1.28 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: mc146818.c,v 1.29 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2016 Mike Larkin * @@ -31,7 +31,6 @@ #include "mc146818.h" #include "virtio.h" #include "vmd.h" -#include "vmm.h" #define MC_RATE_MASK 0xf @@ -148,7 +147,7 @@ rtc_fireper(int fd, short type, void *arg) { rtc.regs[MC_REGC] |= MC_REGC_PF; - vcpu_assert_pic_irq((ptrdiff_t)arg, 0, 8); + vcpu_assert_irq((ptrdiff_t)arg, 0, 8); evtimer_add(&rtc.per, &rtc.per_tv); } diff --git a/usr.sbin/vmd/mmio.c b/usr.sbin/vmd/mmio.c index c5a189d5b85..e69de29bb2d 100644 --- 
a/usr.sbin/vmd/mmio.c +++ b/usr.sbin/vmd/mmio.c @@ -1,1046 +0,0 @@ -/* $OpenBSD: mmio.c,v 1.3 2024/02/10 12:31:16 dv Exp $ */ - -/* - * Copyright (c) 2022 Dave Voutila - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include - -#include -#include - -#include "vmd.h" -#include "mmio.h" - -#define MMIO_DEBUG 0 - -extern char* __progname; - -struct x86_decode_state { - uint8_t s_bytes[15]; - size_t s_len; - size_t s_idx; -}; - -enum decode_result { - DECODE_ERROR = 0, /* Something went wrong. */ - DECODE_DONE, /* Decode success and no more work needed. */ - DECODE_MORE, /* Decode success and more work required. 
*/ -}; - -static const char *str_cpu_mode(int); -static const char *str_decode_res(enum decode_result); -static const char *str_opcode(struct x86_opcode *); -static const char *str_operand_enc(struct x86_opcode *); -static const char *str_reg(int); -static const char *str_sreg(int); -static int detect_cpu_mode(struct vcpu_reg_state *); - -static enum decode_result decode_prefix(struct x86_decode_state *, - struct x86_insn *); -static enum decode_result decode_opcode(struct x86_decode_state *, - struct x86_insn *); -static enum decode_result decode_modrm(struct x86_decode_state *, - struct x86_insn *); -static int get_modrm_reg(struct x86_insn *); -static int get_modrm_addr(struct x86_insn *, struct vcpu_reg_state *vrs); -static enum decode_result decode_disp(struct x86_decode_state *, - struct x86_insn *); -static enum decode_result decode_sib(struct x86_decode_state *, - struct x86_insn *); -static enum decode_result decode_imm(struct x86_decode_state *, - struct x86_insn *); - -static enum decode_result peek_byte(struct x86_decode_state *, uint8_t *); -static enum decode_result next_byte(struct x86_decode_state *, uint8_t *); -static enum decode_result next_value(struct x86_decode_state *, size_t, - uint64_t *); -static int is_valid_state(struct x86_decode_state *, const char *); - -static int emulate_mov(struct x86_insn *, struct vm_exit *); -static int emulate_movzx(struct x86_insn *, struct vm_exit *); - -/* Lookup table for 1-byte opcodes, in opcode alphabetical order. */ -const enum x86_opcode_type x86_1byte_opcode_tbl[255] = { - /* MOV */ - [0x88] = OP_MOV, - [0x89] = OP_MOV, - [0x8A] = OP_MOV, - [0x8B] = OP_MOV, - [0x8C] = OP_MOV, - [0xA0] = OP_MOV, - [0xA1] = OP_MOV, - [0xA2] = OP_MOV, - [0xA3] = OP_MOV, - - /* MOVS */ - [0xA4] = OP_UNSUPPORTED, - [0xA5] = OP_UNSUPPORTED, - - [ESCAPE] = OP_TWO_BYTE, -}; - -/* Lookup table for 1-byte operand encodings, in opcode alphabetical order. 
*/ -const enum x86_operand_enc x86_1byte_operand_enc_tbl[255] = { - /* MOV */ - [0x88] = OP_ENC_MR, - [0x89] = OP_ENC_MR, - [0x8A] = OP_ENC_RM, - [0x8B] = OP_ENC_RM, - [0x8C] = OP_ENC_MR, - [0xA0] = OP_ENC_FD, - [0xA1] = OP_ENC_FD, - [0xA2] = OP_ENC_TD, - [0xA3] = OP_ENC_TD, - - /* MOVS */ - [0xA4] = OP_ENC_ZO, - [0xA5] = OP_ENC_ZO, -}; - -const enum x86_opcode_type x86_2byte_opcode_tbl[255] = { - /* MOVZX */ - [0xB6] = OP_MOVZX, - [0xB7] = OP_MOVZX, -}; - -const enum x86_operand_enc x86_2byte_operand_enc_table[255] = { - /* MOVZX */ - [0xB6] = OP_ENC_RM, - [0xB7] = OP_ENC_RM, -}; - -/* - * peek_byte - * - * Fetch the next byte fron the instruction bytes without advancing the - * position in the stream. - * - * Return values: - * DECODE_DONE: byte was found and is the last in the stream - * DECODE_MORE: byte was found and there are more remaining to be read - * DECODE_ERROR: state is invalid and not byte was found, *byte left unchanged - */ -static enum decode_result -peek_byte(struct x86_decode_state *state, uint8_t *byte) -{ - enum decode_result res; - - if (state == NULL) - return (DECODE_ERROR); - - if (state->s_idx == state->s_len) - return (DECODE_ERROR); - - if (state->s_idx + 1 == state->s_len) - res = DECODE_DONE; - else - res = DECODE_MORE; - - if (byte != NULL) - *byte = state->s_bytes[state->s_idx]; - return (res); -} - -/* - * next_byte - * - * Fetch the next byte fron the instruction bytes, advancing the position in the - * stream and mutating decode state. - * - * Return values: - * DECODE_DONE: byte was found and is the last in the stream - * DECODE_MORE: byte was found and there are more remaining to be read - * DECODE_ERROR: state is invalid and not byte was found, *byte left unchanged - */ -static enum decode_result -next_byte(struct x86_decode_state *state, uint8_t *byte) -{ - uint8_t next; - - /* Cheat and see if we're going to fail. 
*/ - if (peek_byte(state, &next) == DECODE_ERROR) - return (DECODE_ERROR); - - if (byte != NULL) - *byte = next; - state->s_idx++; - - return (state->s_idx < state->s_len ? DECODE_MORE : DECODE_DONE); -} - -/* - * Fetch the next `n' bytes as a single uint64_t value. - */ -static enum decode_result -next_value(struct x86_decode_state *state, size_t n, uint64_t *value) -{ - uint8_t bytes[8]; - size_t i; - enum decode_result res; - - if (value == NULL) - return (DECODE_ERROR); - - if (n == 0 || n > sizeof(bytes)) - return (DECODE_ERROR); - - memset(bytes, 0, sizeof(bytes)); - for (i = 0; i < n; i++) - if ((res = next_byte(state, &bytes[i])) == DECODE_ERROR) - return (DECODE_ERROR); - - *value = *((uint64_t*)bytes); - - return (res); -} - -/* - * is_valid_state - * - * Validate the decode state looks viable. - * - * Returns: - * 1: if state is valid - * 0: if an invariant is detected - */ -static int -is_valid_state(struct x86_decode_state *state, const char *fn_name) -{ - const char *s = (fn_name != NULL) ? 
fn_name : __func__; - - if (state == NULL) { - log_warnx("%s: null state", s); - return (0); - } - if (state->s_len > sizeof(state->s_bytes)) { - log_warnx("%s: invalid length", s); - return (0); - } - if (state->s_idx + 1 > state->s_len) { - log_warnx("%s: invalid index", s); - return (0); - } - - return (1); -} - -#ifdef MMIO_DEBUG -static void -dump_regs(struct vcpu_reg_state *vrs) -{ - size_t i; - struct vcpu_segment_info *vsi; - - for (i = 0; i < VCPU_REGS_NGPRS; i++) - log_info("%s: %s 0x%llx", __progname, str_reg(i), - vrs->vrs_gprs[i]); - - for (i = 0; i < VCPU_REGS_NSREGS; i++) { - vsi = &vrs->vrs_sregs[i]; - log_info("%s: %s { sel: 0x%04x, lim: 0x%08x, ar: 0x%08x, " - "base: 0x%llx }", __progname, str_sreg(i), - vsi->vsi_sel, vsi->vsi_limit, vsi->vsi_ar, vsi->vsi_base); - } -} - -static void -dump_insn(struct x86_insn *insn) -{ - log_info("instruction { %s, enc=%s, len=%d, mod=0x%02x, (" - "reg=%s, addr=0x%lx) sib=0x%02x }", - str_opcode(&insn->insn_opcode), - str_operand_enc(&insn->insn_opcode), insn->insn_bytes_len, - insn->insn_modrm, str_reg(insn->insn_reg), - insn->insn_gva, insn->insn_sib); -} -#endif /* MMIO_DEBUG */ - -static const char * -str_cpu_mode(int mode) -{ - switch (mode) { - case VMM_CPU_MODE_REAL: return "REAL"; - case VMM_CPU_MODE_PROT: return "PROT"; - case VMM_CPU_MODE_PROT32: return "PROT32"; - case VMM_CPU_MODE_COMPAT: return "COMPAT"; - case VMM_CPU_MODE_LONG: return "LONG"; - default: return "UKNOWN"; - } -} - -__unused static const char * -str_decode_res(enum decode_result res) { - switch (res) { - case DECODE_DONE: return "DONE"; - case DECODE_MORE: return "MORE"; - case DECODE_ERROR: return "ERROR"; - default: return "UNKNOWN"; - } -} - -static const char * -str_opcode(struct x86_opcode *opcode) -{ - switch (opcode->op_type) { - case OP_IN: return "IN"; - case OP_INS: return "INS"; - case OP_MOV: return "MOV"; - case OP_MOVZX: return "MOVZX"; - case OP_OUT: return "OUT"; - case OP_OUTS: return "OUTS"; - case OP_UNSUPPORTED: 
return "UNSUPPORTED"; - default: return "UNKNOWN"; - } -} - -static const char * -str_operand_enc(struct x86_opcode *opcode) -{ - switch (opcode->op_encoding) { - case OP_ENC_I: return "I"; - case OP_ENC_MI: return "MI"; - case OP_ENC_MR: return "MR"; - case OP_ENC_RM: return "RM"; - case OP_ENC_FD: return "FD"; - case OP_ENC_TD: return "TD"; - case OP_ENC_OI: return "OI"; - case OP_ENC_ZO: return "ZO"; - default: return "UNKNOWN"; - } -} - -static const char * -str_reg(int reg) { - switch (reg) { - case VCPU_REGS_RAX: return "RAX"; - case VCPU_REGS_RCX: return "RCX"; - case VCPU_REGS_RDX: return "RDX"; - case VCPU_REGS_RBX: return "RBX"; - case VCPU_REGS_RSI: return "RSI"; - case VCPU_REGS_RDI: return "RDI"; - case VCPU_REGS_R8: return " R8"; - case VCPU_REGS_R9: return " R9"; - case VCPU_REGS_R10: return "R10"; - case VCPU_REGS_R11: return "R11"; - case VCPU_REGS_R12: return "R12"; - case VCPU_REGS_R13: return "R13"; - case VCPU_REGS_R14: return "R14"; - case VCPU_REGS_R15: return "R15"; - case VCPU_REGS_RSP: return "RSP"; - case VCPU_REGS_RBP: return "RBP"; - case VCPU_REGS_RIP: return "RIP"; - case VCPU_REGS_RFLAGS: return "RFLAGS"; - default: return "UNKNOWN"; - } -} - -static const char * -str_sreg(int sreg) { - switch (sreg) { - case VCPU_REGS_CS: return "CS"; - case VCPU_REGS_DS: return "DS"; - case VCPU_REGS_ES: return "ES"; - case VCPU_REGS_FS: return "FS"; - case VCPU_REGS_GS: return "GS"; - case VCPU_REGS_SS: return "GS"; - case VCPU_REGS_LDTR: return "LDTR"; - case VCPU_REGS_TR: return "TR"; - default: return "UKNOWN"; - } -} - -static int -detect_cpu_mode(struct vcpu_reg_state *vrs) -{ - uint64_t cr0, cr4, cs, efer, rflags; - - /* Is protected mode enabled? */ - cr0 = vrs->vrs_crs[VCPU_REGS_CR0]; - if (!(cr0 & CR0_PE)) - return (VMM_CPU_MODE_REAL); - - cr4 = vrs->vrs_crs[VCPU_REGS_CR4]; - cs = vrs->vrs_sregs[VCPU_REGS_CS].vsi_ar; - efer = vrs->vrs_msrs[VCPU_REGS_EFER]; - rflags = vrs->vrs_gprs[VCPU_REGS_RFLAGS]; - - /* Check for Long modes. 
*/ - if ((efer & EFER_LME) && (cr4 & CR4_PAE) && (cr0 & CR0_PG)) { - if (cs & CS_L) { - /* Long Modes */ - if (!(cs & CS_D)) - return (VMM_CPU_MODE_LONG); - log_warnx("%s: invalid cpu mode", __progname); - return (VMM_CPU_MODE_UNKNOWN); - } else { - /* Compatibility Modes */ - if (cs & CS_D) /* XXX Add Compat32 mode */ - return (VMM_CPU_MODE_UNKNOWN); - return (VMM_CPU_MODE_COMPAT); - } - } - - /* Check for 32-bit Protected Mode. */ - if (cs & CS_D) - return (VMM_CPU_MODE_PROT32); - - /* Check for virtual 8086 mode. */ - if (rflags & EFLAGS_VM) { - /* XXX add Virtual8086 mode */ - log_warnx("%s: Virtual 8086 mode", __progname); - return (VMM_CPU_MODE_UNKNOWN); - } - - /* Can't determine mode. */ - log_warnx("%s: invalid cpu mode", __progname); - return (VMM_CPU_MODE_UNKNOWN); -} - -static enum decode_result -decode_prefix(struct x86_decode_state *state, struct x86_insn *insn) -{ - enum decode_result res = DECODE_ERROR; - struct x86_prefix *prefix; - uint8_t byte; - - if (!is_valid_state(state, __func__) || insn == NULL) - return (-1); - - prefix = &insn->insn_prefix; - memset(prefix, 0, sizeof(*prefix)); - - /* - * Decode prefixes. The last of its kind wins. The behavior is undefined - * in the Intel SDM (see Vol 2, 2.1.1 Instruction Prefixes.) 
- */ - while ((res = peek_byte(state, &byte)) != DECODE_ERROR) { - switch (byte) { - case LEG_1_LOCK: - case LEG_1_REPNE: - case LEG_1_REP: - prefix->pfx_group1 = byte; - break; - case LEG_2_CS: - case LEG_2_SS: - case LEG_2_DS: - case LEG_2_ES: - case LEG_2_FS: - case LEG_2_GS: - prefix->pfx_group2 = byte; - break; - case LEG_3_OPSZ: - prefix->pfx_group3 = byte; - break; - case LEG_4_ADDRSZ: - prefix->pfx_group4 = byte; - break; - case REX_BASE...REX_BASE + 0x0F: - if (insn->insn_cpu_mode == VMM_CPU_MODE_LONG) - prefix->pfx_rex = byte; - else /* INC encountered */ - return (DECODE_ERROR); - break; - case VEX_2_BYTE: - case VEX_3_BYTE: - log_warnx("%s: VEX not supported", __func__); - return (DECODE_ERROR); - default: - /* Something other than a valid prefix. */ - return (DECODE_MORE); - } - /* Advance our position. */ - next_byte(state, NULL); - } - - return (res); -} - -static enum decode_result -decode_modrm(struct x86_decode_state *state, struct x86_insn *insn) -{ - enum decode_result res; - uint8_t byte = 0; - - if (!is_valid_state(state, __func__) || insn == NULL) - return (DECODE_ERROR); - - insn->insn_modrm_valid = 0; - - /* Check the operand encoding to see if we fetch a byte or abort. */ - switch (insn->insn_opcode.op_encoding) { - case OP_ENC_MR: - case OP_ENC_RM: - case OP_ENC_MI: - res = next_byte(state, &byte); - if (res == DECODE_ERROR) { - log_warnx("%s: failed to get modrm byte", __func__); - break; - } - insn->insn_modrm = byte; - insn->insn_modrm_valid = 1; - break; - - case OP_ENC_I: - case OP_ENC_OI: - log_warnx("%s: instruction does not need memory assist", - __func__); - res = DECODE_ERROR; - break; - - default: - /* Peek to see if we're done decode. 
*/ - res = peek_byte(state, NULL); - } - - return (res); -} - -static int -get_modrm_reg(struct x86_insn *insn) -{ - if (insn == NULL) - return (-1); - - if (insn->insn_modrm_valid) { - switch (MODRM_REGOP(insn->insn_modrm)) { - case 0: - insn->insn_reg = VCPU_REGS_RAX; - break; - case 1: - insn->insn_reg = VCPU_REGS_RCX; - break; - case 2: - insn->insn_reg = VCPU_REGS_RDX; - break; - case 3: - insn->insn_reg = VCPU_REGS_RBX; - break; - case 4: - insn->insn_reg = VCPU_REGS_RSP; - break; - case 5: - insn->insn_reg = VCPU_REGS_RBP; - break; - case 6: - insn->insn_reg = VCPU_REGS_RSI; - break; - case 7: - insn->insn_reg = VCPU_REGS_RDI; - break; - } - } - - /* REX R bit selects extended registers in LONG mode. */ - if (insn->insn_prefix.pfx_rex & REX_R) - insn->insn_reg += 8; - - return (0); -} - -static int -get_modrm_addr(struct x86_insn *insn, struct vcpu_reg_state *vrs) -{ - uint8_t mod, rm; - vaddr_t addr = 0x0UL; - - if (insn == NULL || vrs == NULL) - return (-1); - - if (insn->insn_modrm_valid) { - rm = MODRM_RM(insn->insn_modrm); - mod = MODRM_MOD(insn->insn_modrm); - - switch (rm) { - case 0b000: - addr = vrs->vrs_gprs[VCPU_REGS_RAX]; - break; - case 0b001: - addr = vrs->vrs_gprs[VCPU_REGS_RCX]; - break; - case 0b010: - addr = vrs->vrs_gprs[VCPU_REGS_RDX]; - break; - case 0b011: - addr = vrs->vrs_gprs[VCPU_REGS_RBX]; - break; - case 0b100: - if (mod == 0b11) - addr = vrs->vrs_gprs[VCPU_REGS_RSP]; - break; - case 0b101: - if (mod != 0b00) - addr = vrs->vrs_gprs[VCPU_REGS_RBP]; - break; - case 0b110: - addr = vrs->vrs_gprs[VCPU_REGS_RSI]; - break; - case 0b111: - addr = vrs->vrs_gprs[VCPU_REGS_RDI]; - break; - } - - insn->insn_gva = addr; - } - - return (0); -} - -static enum decode_result -decode_disp(struct x86_decode_state *state, struct x86_insn *insn) -{ - enum decode_result res = DECODE_ERROR; - uint64_t disp = 0; - - if (!is_valid_state(state, __func__) || insn == NULL) - return (DECODE_ERROR); - - if (!insn->insn_modrm_valid) - return (DECODE_ERROR); - 
- switch (MODRM_MOD(insn->insn_modrm)) { - case 0x00: - insn->insn_disp_type = DISP_0; - res = DECODE_MORE; - break; - case 0x01: - insn->insn_disp_type = DISP_1; - res = next_value(state, 1, &disp); - if (res == DECODE_ERROR) - return (res); - insn->insn_disp = disp; - break; - case 0x02: - if (insn->insn_prefix.pfx_group4 == LEG_4_ADDRSZ) { - insn->insn_disp_type = DISP_2; - res = next_value(state, 2, &disp); - } else { - insn->insn_disp_type = DISP_4; - res = next_value(state, 4, &disp); - } - if (res == DECODE_ERROR) - return (res); - insn->insn_disp = disp; - break; - default: - insn->insn_disp_type = DISP_NONE; - res = DECODE_MORE; - } - - return (res); -} - -static enum decode_result -decode_opcode(struct x86_decode_state *state, struct x86_insn *insn) -{ - enum decode_result res; - enum x86_opcode_type type; - enum x86_operand_enc enc; - struct x86_opcode *opcode = &insn->insn_opcode; - uint8_t byte, byte2; - - if (!is_valid_state(state, __func__) || insn == NULL) - return (-1); - - memset(opcode, 0, sizeof(*opcode)); - - res = next_byte(state, &byte); - if (res == DECODE_ERROR) - return (res); - - type = x86_1byte_opcode_tbl[byte]; - switch(type) { - case OP_UNKNOWN: - case OP_UNSUPPORTED: - log_warnx("%s: unsupported opcode", __func__); - return (DECODE_ERROR); - - case OP_TWO_BYTE: - res = next_byte(state, &byte2); - if (res == DECODE_ERROR) - return (res); - - type = x86_2byte_opcode_tbl[byte2]; - if (type == OP_UNKNOWN || type == OP_UNSUPPORTED) { - log_warnx("%s: unsupported 2-byte opcode", __func__); - return (DECODE_ERROR); - } - - opcode->op_bytes[0] = byte; - opcode->op_bytes[1] = byte2; - opcode->op_bytes_len = 2; - enc = x86_2byte_operand_enc_table[byte2]; - break; - - default: - /* We've potentially got a known 1-byte opcode. 
*/ - opcode->op_bytes[0] = byte; - opcode->op_bytes_len = 1; - enc = x86_1byte_operand_enc_tbl[byte]; - } - - if (enc == OP_ENC_UNKNOWN) - return (DECODE_ERROR); - - opcode->op_type = type; - opcode->op_encoding = enc; - - return (res); -} - -static enum decode_result -decode_sib(struct x86_decode_state *state, struct x86_insn *insn) -{ - enum decode_result res; - uint8_t byte; - - if (!is_valid_state(state, __func__) || insn == NULL) - return (-1); - - /* SIB is optional, so assume we will be continuing. */ - res = DECODE_MORE; - - insn->insn_sib_valid = 0; - if (!insn->insn_modrm_valid) - return (res); - - /* XXX is SIB valid in all cpu modes? */ - if (MODRM_RM(insn->insn_modrm) == 0b100) { - res = next_byte(state, &byte); - if (res != DECODE_ERROR) { - insn->insn_sib_valid = 1; - insn->insn_sib = byte; - } - } - - return (res); -} - -static enum decode_result -decode_imm(struct x86_decode_state *state, struct x86_insn *insn) -{ - enum decode_result res; - size_t num_bytes; - uint64_t value; - - if (!is_valid_state(state, __func__) || insn == NULL) - return (DECODE_ERROR); - - /* Only handle MI encoded instructions. Others shouldn't need assist. */ - if (insn->insn_opcode.op_encoding != OP_ENC_MI) - return (DECODE_DONE); - - /* Exceptions related to MOV instructions. */ - if (insn->insn_opcode.op_type == OP_MOV) { - switch (insn->insn_opcode.op_bytes[0]) { - case 0xC6: - num_bytes = 1; - break; - case 0xC7: - if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL) - num_bytes = 2; - else - num_bytes = 4; - break; - default: - log_warnx("%s: cannot decode immediate bytes for MOV", - __func__); - return (DECODE_ERROR); - } - } else { - /* Fallback to interpreting based on cpu mode and REX. 
*/ - if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL) - num_bytes = 2; - else if (insn->insn_prefix.pfx_rex == REX_NONE) - num_bytes = 4; - else - num_bytes = 8; - } - - res = next_value(state, num_bytes, &value); - if (res != DECODE_ERROR) { - insn->insn_immediate = value; - insn->insn_immediate_len = num_bytes; - } - - return (res); -} - - -/* - * insn_decode - * - * Decode an x86 instruction from the provided instruction bytes. - * - * Return values: - * 0: successful decode - * Non-zero: an exception occurred during decode - */ -int -insn_decode(struct vm_exit *exit, struct x86_insn *insn) -{ - enum decode_result res; - struct vcpu_reg_state *vrs = &exit->vrs; - struct x86_decode_state state; - uint8_t *bytes, len; - int mode; - - if (exit == NULL || insn == NULL) { - log_warnx("%s: invalid input", __func__); - return (DECODE_ERROR); - } - - bytes = exit->vee.vee_insn_bytes; - len = exit->vee.vee_insn_len; - - /* 0. Initialize state and instruction objects. */ - memset(insn, 0, sizeof(*insn)); - memset(&state, 0, sizeof(state)); - state.s_len = len; - memcpy(&state.s_bytes, bytes, len); - - /* 1. Detect CPU mode. */ - mode = detect_cpu_mode(vrs); - if (mode == VMM_CPU_MODE_UNKNOWN) { - log_warnx("%s: failed to identify cpu mode", __func__); -#ifdef MMIO_DEBUG - dump_regs(vrs); -#endif - return (-1); - } - insn->insn_cpu_mode = mode; - -#ifdef MMIO_DEBUG - log_info("%s: cpu mode %s detected", __progname, str_cpu_mode(mode)); - printf("%s: got bytes: [ ", __progname); - for (int i = 0; i < len; i++) { - printf("%02x ", bytes[i]); - } - printf("]\n"); -#endif - /* 2. Decode prefixes. 
*/ - res = decode_prefix(&state, insn); - if (res == DECODE_ERROR) { - log_warnx("%s: error decoding prefixes", __func__); - goto err; - } else if (res == DECODE_DONE) - goto done; - -#ifdef MMIO_DEBUG - log_info("%s: prefixes {g1: 0x%02x, g2: 0x%02x, g3: 0x%02x, g4: 0x%02x," - " rex: 0x%02x }", __progname, insn->insn_prefix.pfx_group1, - insn->insn_prefix.pfx_group2, insn->insn_prefix.pfx_group3, - insn->insn_prefix.pfx_group4, insn->insn_prefix.pfx_rex); -#endif - - /* 3. Pick apart opcode. Here we can start short-circuiting. */ - res = decode_opcode(&state, insn); - if (res == DECODE_ERROR) { - log_warnx("%s: error decoding opcode", __func__); - goto err; - } else if (res == DECODE_DONE) - goto done; - -#ifdef MMIO_DEBUG - log_info("%s: found opcode %s (operand encoding %s) (%s)", __progname, - str_opcode(&insn->insn_opcode), str_operand_enc(&insn->insn_opcode), - str_decode_res(res)); -#endif - - /* Process optional ModR/M byte. */ - res = decode_modrm(&state, insn); - if (res == DECODE_ERROR) { - log_warnx("%s: error decoding modrm", __func__); - goto err; - } - if (get_modrm_addr(insn, vrs) != 0) - goto err; - if (get_modrm_reg(insn) != 0) - goto err; - if (res == DECODE_DONE) - goto done; - -#ifdef MMIO_DEBUG - if (insn->insn_modrm_valid) - log_info("%s: found ModRM 0x%02x (%s)", __progname, - insn->insn_modrm, str_decode_res(res)); -#endif - - /* Process optional SIB byte. */ - res = decode_sib(&state, insn); - if (res == DECODE_ERROR) { - log_warnx("%s: error decoding sib", __func__); - goto err; - } else if (res == DECODE_DONE) - goto done; - -#ifdef MMIO_DEBUG - if (insn->insn_sib_valid) - log_info("%s: found SIB 0x%02x (%s)", __progname, - insn->insn_sib, str_decode_res(res)); -#endif - - /* Process any Displacement bytes. */ - res = decode_disp(&state, insn); - if (res == DECODE_ERROR) { - log_warnx("%s: error decoding displacement", __func__); - goto err; - } else if (res == DECODE_DONE) - goto done; - - /* Process any Immediate data bytes. 
*/ - res = decode_imm(&state, insn); - if (res == DECODE_ERROR) { - log_warnx("%s: error decoding immediate bytes", __func__); - goto err; - } - -done: - insn->insn_bytes_len = state.s_idx; - -#ifdef MMIO_DEBUG - log_info("%s: final instruction length is %u", __func__, - insn->insn_bytes_len); - dump_insn(insn); - log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__, - MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm), - MODRM_RM(insn->insn_modrm)); - dump_regs(vrs); -#endif /* MMIO_DEBUG */ - return (0); - -err: -#ifdef MMIO_DEBUG - dump_insn(insn); - log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__, - MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm), - MODRM_RM(insn->insn_modrm)); - dump_regs(vrs); -#endif /* MMIO_DEBUG */ - return (-1); -} - -static int -emulate_mov(struct x86_insn *insn, struct vm_exit *exit) -{ - /* XXX Only supports read to register for now */ - if (insn->insn_opcode.op_encoding != OP_ENC_RM) - return (-1); - - /* XXX No device emulation yet. Fill with 0xFFs. */ - exit->vrs.vrs_gprs[insn->insn_reg] = 0xFFFFFFFFFFFFFFFF; - - return (0); -} - -static int -emulate_movzx(struct x86_insn *insn, struct vm_exit *exit) -{ - uint8_t byte, len, src = 1, dst = 2; - uint64_t value = 0; - - /* Only RM is valid for MOVZX. 
*/ - if (insn->insn_opcode.op_encoding != OP_ENC_RM) { - log_warnx("invalid op encoding for MOVZX: %d", - insn->insn_opcode.op_encoding); - return (-1); - } - - len = insn->insn_opcode.op_bytes_len; - if (len < 1 || len > sizeof(insn->insn_opcode.op_bytes)) { - log_warnx("invalid opcode byte length: %d", len); - return (-1); - } - - byte = insn->insn_opcode.op_bytes[len - 1]; - switch (byte) { - case 0xB6: - src = 1; - if (insn->insn_cpu_mode == VMM_CPU_MODE_PROT - || insn->insn_cpu_mode == VMM_CPU_MODE_REAL) - dst = 2; - else if (insn->insn_prefix.pfx_rex == REX_NONE) - dst = 4; - else // XXX validate CPU mode - dst = 8; - break; - case 0xB7: - src = 2; - if (insn->insn_prefix.pfx_rex == REX_NONE) - dst = 4; - else // XXX validate CPU mode - dst = 8; - break; - default: - log_warnx("invalid byte in MOVZX opcode: %x", byte); - return (-1); - } - - if (dst == 4) - exit->vrs.vrs_gprs[insn->insn_reg] &= 0xFFFFFFFF00000000; - else - exit->vrs.vrs_gprs[insn->insn_reg] = 0x0UL; - - /* XXX No device emulation yet. Fill with 0xFFs. 
*/ - switch (src) { - case 1: value = 0xFF; break; - case 2: value = 0xFFFF; break; - case 4: value = 0xFFFFFFFF; break; - case 8: value = 0xFFFFFFFFFFFFFFFF; break; - default: - log_warnx("invalid source size: %d", src); - return (-1); - } - - exit->vrs.vrs_gprs[insn->insn_reg] |= value; - - return (0); -} - -/* - * insn_emulate - * - * Returns: - * 0: success - * EINVAL: exception occurred - * EFAULT: page fault occurred, requires retry - * ENOTSUP: an unsupported instruction was provided - */ -int -insn_emulate(struct vm_exit *exit, struct x86_insn *insn) -{ - int res; - - switch (insn->insn_opcode.op_type) { - case OP_MOV: - res = emulate_mov(insn, exit); - break; - - case OP_MOVZX: - res = emulate_movzx(insn, exit); - break; - - default: - log_warnx("%s: emulation not defined for %s", __func__, - str_opcode(&insn->insn_opcode)); - res = ENOTSUP; - } - - if (res == 0) - exit->vrs.vrs_gprs[VCPU_REGS_RIP] += insn->insn_bytes_len; - - return (res); -} diff --git a/usr.sbin/vmd/ns8250.c b/usr.sbin/vmd/ns8250.c index bcb48ef95d8..17cc8bfe525 100644 --- a/usr.sbin/vmd/ns8250.c +++ b/usr.sbin/vmd/ns8250.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ns8250.c,v 1.39 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: ns8250.c,v 1.40 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2016 Mike Larkin * @@ -30,7 +30,6 @@ #include "atomicio.h" #include "ns8250.h" #include "vmd.h" -#include "vmm.h" extern char *__progname; struct ns8250_dev com1_dev; @@ -80,7 +79,7 @@ ratelimit(int fd, short type, void *arg) com1_dev.regs.iir |= IIR_TXRDY; com1_dev.regs.iir &= ~IIR_NOPEND; - vcpu_assert_pic_irq(com1_dev.vmid, 0, com1_dev.irq); + vcpu_assert_irq(com1_dev.vmid, 0, com1_dev.irq); mutex_unlock(&com1_dev.mutex); } @@ -157,7 +156,7 @@ com_rcv_event(int fd, short kind, void *arg) /* If pending interrupt, inject */ if ((com1_dev.regs.iir & IIR_NOPEND) == 0) { /* XXX: vcpu_id */ - vcpu_assert_pic_irq((uintptr_t)arg, 0, com1_dev.irq); + vcpu_assert_irq((uintptr_t)arg, 0, com1_dev.irq); } 
mutex_unlock(&com1_dev.mutex); diff --git a/usr.sbin/vmd/pci.c b/usr.sbin/vmd/pci.c index 1722baa9ea1..0dbe846fd01 100644 --- a/usr.sbin/vmd/pci.c +++ b/usr.sbin/vmd/pci.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pci.c,v 1.32 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: pci.c,v 1.33 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -28,12 +28,12 @@ #include "vmd.h" #include "pci.h" -#include "vmm.h" #include "i8259.h" #include "atomicio.h" struct pci pci; +extern struct vmd_vm current_vm; extern char *__progname; /* PIC IRQs, assigned to devices in order */ @@ -86,7 +86,9 @@ pci_add_bar(uint8_t id, uint32_t type, void *barfn, void *cookie) pci.pci_devices[id].pd_bartype[bar_ct] = PCI_BAR_TYPE_MMIO; pci.pci_devices[id].pd_barsize[bar_ct] = VM_PCI_MMIO_BAR_SIZE; pci.pci_devices[id].pd_bar_ct++; - } else if (type == PCI_MAPREG_TYPE_IO) { + } +#ifdef __amd64__ + else if (type == PCI_MAPREG_TYPE_IO) { if (pci.pci_next_io_bar >= VM_PCI_IO_BAR_END) return (1); @@ -102,6 +104,7 @@ pci_add_bar(uint8_t id, uint32_t type, void *barfn, void *cookie) pci.pci_devices[id].pd_barsize[bar_ct] = VM_PCI_IO_BAR_SIZE; pci.pci_devices[id].pd_bar_ct++; } +#endif /* __amd64__ */ return (0); } @@ -195,7 +198,7 @@ pci_add_device(uint8_t *id, uint16_t vid, uint16_t pid, uint8_t class, pci.pci_next_pic_irq++; DPRINTF("assigned irq %d to pci dev %d", pci.pci_devices[*id].pd_irq, *id); - pic_set_elcr(pci.pci_devices[*id].pd_irq, 1); + intr_toggle_el(&current_vm, pci.pci_devices[*id].pd_irq, 1); } pci.pci_dev_ct ++; @@ -216,7 +219,10 @@ pci_init(void) memset(&pci, 0, sizeof(pci)); pci.pci_next_mmio_bar = VMM_PCI_MMIO_BAR_BASE; + +#ifdef __amd64__ pci.pci_next_io_bar = VM_PCI_IO_BAR_BASE; +#endif /* __amd64__ */ if (pci_add_device(&id, PCI_VENDOR_OPENBSD, PCI_PRODUCT_OPENBSD_PCHB, PCI_CLASS_BRIDGE, PCI_SUBCLASS_BRIDGE_HOST, @@ -226,6 +232,7 @@ pci_init(void) } } +#ifdef __amd64__ void pci_handle_address_reg(struct vm_run_params *vrp) { @@ -415,6 +422,7 @@ pci_handle_data_reg(struct 
vm_run_params *vrp) } } } +#endif /* __amd64__ */ int pci_dump(int fd) diff --git a/usr.sbin/vmd/pci.h b/usr.sbin/vmd/pci.h index 73b54437bed..0b05a9298d1 100644 --- a/usr.sbin/vmd/pci.h +++ b/usr.sbin/vmd/pci.h @@ -1,4 +1,4 @@ -/* $OpenBSD: pci.h,v 1.10 2023/02/06 20:33:34 dv Exp $ */ +/* $OpenBSD: pci.h,v 1.11 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -93,9 +93,6 @@ struct pci { }; int pci_find_first_device(uint16_t); -void pci_handle_address_reg(struct vm_run_params *); -void pci_handle_data_reg(struct vm_run_params *); -uint8_t pci_handle_io(struct vm_run_params *); void pci_init(void); int pci_add_device(uint8_t *, uint16_t, uint16_t, uint8_t, uint8_t, uint16_t, uint16_t, uint8_t, pci_cs_fn_t); @@ -105,4 +102,10 @@ uint8_t pci_get_dev_irq(uint8_t); int pci_dump(int); int pci_restore(int); +#ifdef __amd64__ +void pci_handle_address_reg(struct vm_run_params *); +void pci_handle_data_reg(struct vm_run_params *); +uint8_t pci_handle_io(struct vm_run_params *); +#endif /* __amd64__ */ + #endif /* _PCI_H_ */ diff --git a/usr.sbin/vmd/vioblk.c b/usr.sbin/vmd/vioblk.c index 6e3e3147536..cef10e32cf4 100644 --- a/usr.sbin/vmd/vioblk.c +++ b/usr.sbin/vmd/vioblk.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vioblk.c,v 1.13 2024/02/20 21:40:37 dv Exp $ */ +/* $OpenBSD: vioblk.c,v 1.14 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2023 Dave Voutila @@ -555,7 +555,7 @@ handle_sync_io(int fd, short event, void *arg) case VIODEV_MSG_IO_WRITE: /* Write IO: no reply needed */ if (handle_io_write(&msg, dev) == 1) - virtio_assert_pic_irq(dev, 0); + virtio_assert_irq(dev, 0); break; case VIODEV_MSG_SHUTDOWN: event_del(&dev->sync_iev.ev); @@ -614,7 +614,7 @@ handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev) vioblk->cfg.isr_status = 0; vioblk->vq[0].last_avail = 0; vioblk->vq[0].notified_avail = 0; - virtio_deassert_pic_irq(dev, msg->vcpu); + virtio_deassert_irq(dev, msg->vcpu); } break; default: diff --git a/usr.sbin/vmd/virtio.c 
b/usr.sbin/vmd/virtio.c index 80d035ef60b..f203f822adc 100644 --- a/usr.sbin/vmd/virtio.c +++ b/usr.sbin/vmd/virtio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: virtio.c,v 1.114 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: virtio.c,v 1.115 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -47,7 +47,6 @@ #include "vioscsi.h" #include "virtio.h" #include "vmd.h" -#include "vmm.h" extern struct vmd *env; extern char *__progname; @@ -274,7 +273,7 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, case VIRTIO_CONFIG_ISR_STATUS: *data = viornd.cfg.isr_status; viornd.cfg.isr_status = 0; - vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq); + vcpu_deassert_irq(viornd.vm_id, 0, viornd.irq); break; } } @@ -310,7 +309,7 @@ vmmci_ctl(unsigned int cmd) /* Trigger interrupt */ vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE; - vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq); + vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq); /* Add ACK timeout */ tv.tv_sec = VMMCI_TIMEOUT; @@ -322,7 +321,7 @@ vmmci_ctl(unsigned int cmd) vmmci.cmd = cmd; vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE; - vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq); + vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq); } else { log_debug("%s: RTC sync skipped (guest does not " "support RTC sync)\n", __func__); @@ -468,7 +467,7 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, case VIRTIO_CONFIG_ISR_STATUS: *data = vmmci.cfg.isr_status; vmmci.cfg.isr_status = 0; - vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq); + vcpu_deassert_irq(vmmci.vm_id, 0, vmmci.irq); break; } } @@ -1586,9 +1585,9 @@ handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev) switch (msg->type) { case VIODEV_MSG_KICK: if (msg->state == INTR_STATE_ASSERT) - vcpu_assert_pic_irq(vm_id, msg->vcpu, irq); + vcpu_assert_irq(vm_id, msg->vcpu, irq); else if (msg->state == INTR_STATE_DEASSERT) - vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq); + vcpu_deassert_irq(vm_id, msg->vcpu, irq); break; case 
VIODEV_MSG_READY: log_debug("%s: device reports ready", __func__); @@ -1702,9 +1701,9 @@ virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, * device performs a register read. */ if (msg.state == INTR_STATE_ASSERT) - vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq); + vcpu_assert_irq(dev->vm_id, msg.vcpu, msg.irq); else if (msg.state == INTR_STATE_DEASSERT) - vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq); + vcpu_deassert_irq(dev->vm_id, msg.vcpu, msg.irq); } else { log_warnx("%s: expected IO_READ, got %d", __func__, msg.type); @@ -1716,7 +1715,7 @@ virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, } void -virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu) +virtio_assert_irq(struct virtio_dev *dev, int vcpu) { struct viodev_msg msg; int ret; @@ -1734,7 +1733,7 @@ virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu) } void -virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu) +virtio_deassert_irq(struct virtio_dev *dev, int vcpu) { struct viodev_msg msg; int ret; diff --git a/usr.sbin/vmd/virtio.h b/usr.sbin/vmd/virtio.h index 58f2c216837..c293743050c 100644 --- a/usr.sbin/vmd/virtio.h +++ b/usr.sbin/vmd/virtio.h @@ -1,4 +1,4 @@ -/* $OpenBSD: virtio.h,v 1.51 2024/02/20 21:40:37 dv Exp $ */ +/* $OpenBSD: virtio.h,v 1.52 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -346,8 +346,8 @@ uint32_t vring_size(uint32_t); int vm_device_pipe(struct virtio_dev *, void (*)(int, short, void *), struct event_base *); int virtio_pci_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); -void virtio_assert_pic_irq(struct virtio_dev *, int); -void virtio_deassert_pic_irq(struct virtio_dev *, int); +void virtio_assert_irq(struct virtio_dev *, int); +void virtio_deassert_irq(struct virtio_dev *, int); int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int viornd_dump(int); diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c index 078e9b5172f..e8c73b0e053 100644 --- 
a/usr.sbin/vmd/vm.c +++ b/usr.sbin/vmd/vm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vm.c,v 1.103 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: vm.c,v 1.104 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -22,21 +22,14 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include -#include -#include -#include - #include #include @@ -55,57 +48,28 @@ #include #include "atomicio.h" -#include "fw_cfg.h" -#include "i8253.h" -#include "i8259.h" -#include "loadfile.h" -#include "mc146818.h" #include "mmio.h" -#include "ns8250.h" #include "pci.h" #include "virtio.h" #include "vmd.h" -#include "vmm.h" - -#define MB(x) (x * 1024UL * 1024UL) -#define GB(x) (x * 1024UL * 1024UL * 1024UL) #define MMIO_NOTYET 0 -io_fn_t ioports_map[MAX_PORTS]; - static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *); -void vm_dispatch_vmm(int, short, void *); -void *event_thread(void *); -void *vcpu_run_loop(void *); -int vcpu_exit(struct vm_run_params *); -int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); -void create_memory_map(struct vm_create_params *); +static void vm_dispatch_vmm(int, short, void *); +static void *event_thread(void *); +static void *vcpu_run_loop(void *); static int vmm_create_vm(struct vmd_vm *); -int alloc_guest_mem(struct vmd_vm *); -void init_emulated_hw(struct vmop_create_params *, int, - int[][VM_MAX_BASE_PER_DISK], int *); -void restore_emulated_hw(struct vm_create_params *, int, int *, - int[][VM_MAX_BASE_PER_DISK],int); -void vcpu_exit_inout(struct vm_run_params *); -int vcpu_exit_eptviolation(struct vm_run_params *); -uint8_t vcpu_exit_pci(struct vm_run_params *); -int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); -int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); +static int alloc_guest_mem(struct vmd_vm *); static int send_vm(int, struct vmd_vm *); -int dump_send_header(int); static int dump_vmr(int , struct vm_mem_range *); static int dump_mem(int, struct vmd_vm 
*); -void restore_vmr(int, struct vm_mem_range *); -void restore_mem(int, struct vm_create_params *); -int restore_vm_params(int, struct vm_create_params *); +static void restore_vmr(int, struct vm_mem_range *); +static void restore_mem(int, struct vm_create_params *); +static int restore_vm_params(int, struct vm_create_params *); static void pause_vm(struct vmd_vm *); static void unpause_vm(struct vmd_vm *); - -int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int); - -static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, - size_t); +static int start_vm(struct vmd_vm *, int); int con_fd; struct vmd_vm *current_vm; @@ -127,93 +91,6 @@ pthread_mutex_t vm_mtx; uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; -/* - * Represents a standard register set for an OS to be booted - * as a flat 64 bit address space. - * - * NOT set here are: - * RIP - * RSP - * GDTR BASE - * - * Specific bootloaders should clone this structure and override - * those fields as needed. - * - * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on - * features of the CPU in use. 
- */ -static const struct vcpu_reg_state vcpu_init_flat64 = { - .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, - .vrs_gprs[VCPU_REGS_RIP] = 0x0, - .vrs_gprs[VCPU_REGS_RSP] = 0x0, - .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG, - .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, - .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE, - .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL, - .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL, - .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL, - .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL, - .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, - .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, - .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, - .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, - .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, - .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, - .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA, - .vrs_drs[VCPU_REGS_DR0] = 0x0, - .vrs_drs[VCPU_REGS_DR1] = 0x0, - .vrs_drs[VCPU_REGS_DR2] = 0x0, - .vrs_drs[VCPU_REGS_DR3] = 0x0, - .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, - .vrs_drs[VCPU_REGS_DR7] = 0x400, - .vrs_msrs[VCPU_REGS_STAR] = 0ULL, - .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, - .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, - .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, - .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, - .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL, - .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 -}; - -/* - * Represents a standard register set for an BIOS to be booted - * as a flat 16 bit address space. 
- */ -static const struct vcpu_reg_state vcpu_init_flat16 = { - .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, - .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0, - .vrs_gprs[VCPU_REGS_RSP] = 0x0, - .vrs_crs[VCPU_REGS_CR0] = 0x60000010, - .vrs_crs[VCPU_REGS_CR3] = 0, - .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, - .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0}, - .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0}, - .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0}, - .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0}, - .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0}, - .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, - .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, - .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, - .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, - .vrs_msrs[VCPU_REGS_EFER] = 0ULL, - .vrs_drs[VCPU_REGS_DR0] = 0x0, - .vrs_drs[VCPU_REGS_DR1] = 0x0, - .vrs_drs[VCPU_REGS_DR2] = 0x0, - .vrs_drs[VCPU_REGS_DR3] = 0x0, - .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, - .vrs_drs[VCPU_REGS_DR7] = 0x400, - .vrs_msrs[VCPU_REGS_STAR] = 0ULL, - .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, - .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, - .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, - .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, - .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 -}; - /* * vm_main * @@ -290,58 +167,6 @@ vm_main(int fd, int fd_vmm) _exit(ret); } -/* - * loadfile_bios - * - * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image - * directly into memory. 
- * - * Parameters: - * fp: file of a kernel file to load - * size: uncompressed size of the image - * (out) vrs: register state to set on init for this kernel - * - * Return values: - * 0 if successful - * various error codes returned from read(2) or loadelf functions - */ -int -loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) -{ - off_t off; - - /* Set up a "flat 16 bit" register state for BIOS */ - memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); - - /* Seek to the beginning of the BIOS image */ - if (gzseek(fp, 0, SEEK_SET) == -1) - return (-1); - - /* The BIOS image must end at 1MB */ - if ((off = MB(1) - size) < 0) - return (-1); - - /* Read BIOS image into memory */ - if (mread(fp, off, size) != (size_t)size) { - errno = EIO; - return (-1); - } - - if (gzseek(fp, 0, SEEK_SET) == -1) - return (-1); - - /* Read a second BIOS copy into memory ending at 4GB */ - off = GB(4) - size; - if (mread(fp, off, size) != (size_t)size) { - errno = EIO; - return (-1); - } - - log_debug("%s: loaded BIOS image", __func__); - - return (0); -} - /* * start_vm * @@ -372,10 +197,8 @@ start_vm(struct vmd_vm *vm, int fd) struct vcpu_reg_state vrs; int nicfds[VM_MAX_NICS_PER_VM]; int ret; - gzFile fp; size_t i; struct vm_rwregs_params vrp; - struct stat sb; /* * We first try to initialize and allocate memory before bothering @@ -433,33 +256,8 @@ start_vm(struct vmd_vm *vm, int fd) if (ret != sizeof(vrp)) fatal("received incomplete vrp - exiting"); vrs = vrp.vrwp_regs; - } else { - /* - * Set up default "flat 64 bit" register state - RIP, - * RSP, and GDT info will be set in bootloader - */ - memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); - - /* Find and open kernel image */ - if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) - fatalx("failed to open kernel - exiting"); - - /* Load kernel image */ - ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice); - - /* - * Try BIOS as a fallback (only if it was provided as an image - * with vm->vm_kernel and the file is not 
compressed) - */ - if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && - gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) - ret = loadfile_bios(fp, sb.st_size, &vrs); - - if (ret) - fatal("failed to load kernel or BIOS - exiting"); - - gzclose(fp); - } + } else if (load_firmware(vm, &vrs)) + fatalx("failed to load kernel or firmware image"); if (vm->vm_kernel != -1) close_fd(vm->vm_kernel); @@ -721,15 +519,7 @@ send_vm(int fd, struct vmd_vm *vm) /* Dump memory before devices to aid in restoration. */ if ((ret = dump_mem(fd, vm))) goto err; - if ((ret = i8253_dump(fd))) - goto err; - if ((ret = i8259_dump(fd))) - goto err; - if ((ret = ns8250_dump(fd))) - goto err; - if ((ret = mc146818_dump(fd))) - goto err; - if ((ret = fw_cfg_dump(fd))) + if ((ret = dump_devs(fd))) goto err; if ((ret = pci_dump(fd))) goto err; @@ -764,46 +554,6 @@ err: return ret; } -int -dump_send_header(int fd) { - struct vm_dump_header vmh; - int i; - - memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE, - sizeof(vmh.vmh_signature)); - - vmh.vmh_cpuids[0].code = 0x00; - vmh.vmh_cpuids[0].leaf = 0x00; - - vmh.vmh_cpuids[1].code = 0x01; - vmh.vmh_cpuids[1].leaf = 0x00; - - vmh.vmh_cpuids[2].code = 0x07; - vmh.vmh_cpuids[2].leaf = 0x00; - - vmh.vmh_cpuids[3].code = 0x0d; - vmh.vmh_cpuids[3].leaf = 0x00; - - vmh.vmh_cpuids[4].code = 0x80000001; - vmh.vmh_cpuids[4].leaf = 0x00; - - vmh.vmh_version = VM_DUMP_VERSION; - - for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { - CPUID_LEAF(vmh.vmh_cpuids[i].code, - vmh.vmh_cpuids[i].leaf, - vmh.vmh_cpuids[i].a, - vmh.vmh_cpuids[i].b, - vmh.vmh_cpuids[i].c, - vmh.vmh_cpuids[i].d); - } - - if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) - return (-1); - - return (0); -} - int dump_mem(int fd, struct vmd_vm *vm) { @@ -933,10 +683,7 @@ pause_vm(struct vmd_vm *vm) return; } - i8253_stop(); - mc146818_stop(); - ns8250_stop(); - virtio_stop(vm); + pause_vm_md(vm); } static void @@ -962,10 +709,7 @@ unpause_vm(struct vmd_vm *vm) } } - i8253_start(); 
- mc146818_start(); - ns8250_start(); - virtio_start(vm); + unpause_vm_md(vm); } /* @@ -1002,99 +746,6 @@ vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) return (0); } -/* - * create_memory_map - * - * Sets up the guest physical memory ranges that the VM can access. - * - * Parameters: - * vcp: VM create parameters describing the VM whose memory map - * is being created - * - * Return values: - * nothing - */ -void -create_memory_map(struct vm_create_params *vcp) -{ - size_t len, mem_bytes; - size_t above_1m = 0, above_4g = 0; - - mem_bytes = vcp->vcp_memranges[0].vmr_size; - vcp->vcp_nmemranges = 0; - if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE) - return; - - /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ - len = LOWMEM_KB * 1024; - vcp->vcp_memranges[0].vmr_gpa = 0x0; - vcp->vcp_memranges[0].vmr_size = len; - vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM; - mem_bytes -= len; - - /* - * Second memory region: LOWMEM_KB - 1MB. - * - * N.B. - Normally ROMs or parts of video RAM are mapped here. - * We have to add this region, because some systems - * unconditionally write to 0xb8000 (VGA RAM), and - * we need to make sure that vmm(4) permits accesses - * to it. So allocate guest memory for it. - */ - len = MB(1) - (LOWMEM_KB * 1024); - vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; - vcp->vcp_memranges[1].vmr_size = len; - vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED; - mem_bytes -= len; - - /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ - if (mem_bytes <= MB(2)) { - vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; - vcp->vcp_memranges[2].vmr_size = MB(2); - vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED; - vcp->vcp_nmemranges = 3; - return; - } - - /* - * Calculate the how to split any remaining memory across the 4GB - * boundary while making sure we do not place physical memory into - * MMIO ranges. 
- */ - if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) { - above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1); - above_4g = mem_bytes - above_1m; - } else { - above_1m = mem_bytes; - above_4g = 0; - } - - /* Third memory region: area above 1MB to MMIO region */ - vcp->vcp_memranges[2].vmr_gpa = MB(1); - vcp->vcp_memranges[2].vmr_size = above_1m; - vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM; - - /* Fourth region: PCI MMIO range */ - vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE; - vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END - - VMM_PCI_MMIO_BAR_BASE + 1; - vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO; - - /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */ - vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; - vcp->vcp_memranges[4].vmr_size = MB(2); - vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED; - - /* Sixth region: any remainder above 4GB */ - if (above_4g > 0) { - vcp->vcp_memranges[5].vmr_gpa = GB(4); - vcp->vcp_memranges[5].vmr_size = above_4g; - vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM; - vcp->vcp_nmemranges = 6; - } else - vcp->vcp_nmemranges = 5; -} - /* * alloc_guest_mem * @@ -1190,142 +841,8 @@ vmm_create_vm(struct vmd_vm *vm) return (0); } -/* - * init_emulated_hw - * - * Initializes the userspace hardware emulation - */ -void -init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, - int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) -{ - struct vm_create_params *vcp = &vmc->vmc_params; - size_t i; - uint64_t memlo, memhi; - - /* Calculate memory size for NVRAM registers */ - memlo = memhi = 0; - for (i = 0; i < vcp->vcp_nmemranges; i++) { - if (vcp->vcp_memranges[i].vmr_gpa == MB(1) && - vcp->vcp_memranges[i].vmr_size > (15 * MB(1))) - memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1)); - else if (vcp->vcp_memranges[i].vmr_gpa == GB(4)) - memhi = vcp->vcp_memranges[i].vmr_size; - } - - /* Reset the IO port map */ - memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); - - /* Init i8253 PIT */ - 
i8253_init(vcp->vcp_id); - ioports_map[TIMER_CTRL] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; - ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc; - - /* Init mc146818 RTC */ - mc146818_init(vcp->vcp_id, memlo, memhi); - ioports_map[IO_RTC] = vcpu_exit_mc146818; - ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; - - /* Init master and slave PICs */ - i8259_init(); - ioports_map[IO_ICU1] = vcpu_exit_i8259; - ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; - ioports_map[IO_ICU2] = vcpu_exit_i8259; - ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; - ioports_map[ELCR0] = vcpu_exit_elcr; - ioports_map[ELCR1] = vcpu_exit_elcr; - - /* Init ns8250 UART */ - ns8250_init(con_fd, vcp->vcp_id); - for (i = COM1_DATA; i <= COM1_SCR; i++) - ioports_map[i] = vcpu_exit_com; - - /* Initialize PCI */ - for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) - ioports_map[i] = vcpu_exit_pci; - - ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; - pci_init(); - - /* Initialize virtio devices */ - virtio_init(current_vm, child_cdrom, child_disks, child_taps); /* - * Init QEMU fw_cfg interface. Must be done last for pci hardware - * detection. 
- */ - fw_cfg_init(vmc); - ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; - ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; - ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; - ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; -} - -/* - * restore_emulated_hw - * - * Restores the userspace hardware emulation from fd - */ -void -restore_emulated_hw(struct vm_create_params *vcp, int fd, - int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) -{ - /* struct vm_create_params *vcp = &vmc->vmc_params; */ - int i; - memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); - - /* Init i8253 PIT */ - i8253_restore(fd, vcp->vcp_id); - ioports_map[TIMER_CTRL] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; - ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; - - /* Init master and slave PICs */ - i8259_restore(fd); - ioports_map[IO_ICU1] = vcpu_exit_i8259; - ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; - ioports_map[IO_ICU2] = vcpu_exit_i8259; - ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; - - /* Init ns8250 UART */ - ns8250_restore(fd, con_fd, vcp->vcp_id); - for (i = COM1_DATA; i <= COM1_SCR; i++) - ioports_map[i] = vcpu_exit_com; - - /* Init mc146818 RTC */ - mc146818_restore(fd, vcp->vcp_id); - ioports_map[IO_RTC] = vcpu_exit_mc146818; - ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; - - /* Init QEMU fw_cfg interface */ - fw_cfg_restore(fd); - ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; - ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; - ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; - ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; - - /* Initialize PCI */ - for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) - ioports_map[i] = vcpu_exit_pci; - - ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 
- ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; - ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; - pci_restore(fd); - virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); -} - -/* * run_vm * * Runs the VM whose creation parameters are specified in vcp @@ -1525,7 +1042,7 @@ run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs) return (ret); } -void * +static void * event_thread(void *arg) { uint8_t *donep = arg; @@ -1555,7 +1072,7 @@ event_thread(void *arg) * NULL: the VCPU shutdown properly * !NULL: error processing VCPU run, or the VCPU shutdown abnormally */ -void * +static void * vcpu_run_loop(void *arg) { struct vm_run_params *vrp = (struct vm_run_params *)arg; @@ -1593,7 +1110,7 @@ vcpu_run_loop(void *arg) return ((void *)ret); } - /* i8259 may be firing as we pause, release run mtx. */ + /* Interrupt may be firing, release run mtx. */ mutex_unlock(&vcpu_run_mtx[n]); ret = pthread_cond_wait(&vcpu_unpause_cond[n], &vcpu_unpause_mtx[n]); @@ -1636,14 +1153,14 @@ vcpu_run_loop(void *arg) break; } - if (vrp->vrp_irqready && i8259_is_pending()) { - vrp->vrp_inject.vie_vector = i8259_ack(); + if (vrp->vrp_irqready && intr_pending(current_vm)) { + vrp->vrp_inject.vie_vector = intr_ack(current_vm); vrp->vrp_inject.vie_type = VCPU_INJECT_INTR; } else vrp->vrp_inject.vie_type = VCPU_INJECT_NONE; /* Still more interrupts pending? */ - vrp->vrp_intr_pending = i8259_is_pending(); + vrp->vrp_intr_pending = intr_pending(current_vm); if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) { /* If run ioctl failed, exit */ @@ -1682,7 +1199,7 @@ vcpu_run_loop(void *arg) } int -vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) +vcpu_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) { struct vm_intr_params vip; @@ -1698,503 +1215,6 @@ vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) return (0); } -/* - * vcpu_exit_pci - * - * Handle all I/O to the emulated PCI subsystem. 
- * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - * - * Return value: - * Interrupt to inject to the guest VM, or 0xFF if no interrupt should - * be injected. - */ -uint8_t -vcpu_exit_pci(struct vm_run_params *vrp) -{ - struct vm_exit *vei = vrp->vrp_exit; - uint8_t intr; - - intr = 0xFF; - - switch (vei->vei.vei_port) { - case PCI_MODE1_ADDRESS_REG: - pci_handle_address_reg(vrp); - break; - case PCI_MODE1_DATA_REG: - case PCI_MODE1_DATA_REG + 1: - case PCI_MODE1_DATA_REG + 2: - case PCI_MODE1_DATA_REG + 3: - pci_handle_data_reg(vrp); - break; - case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END: - intr = pci_handle_io(vrp); - break; - default: - log_warnx("%s: unknown PCI register 0x%llx", - __progname, (uint64_t)vei->vei.vei_port); - break; - } - - return (intr); -} - -/* - * vcpu_exit_inout - * - * Handle all I/O exits that need to be emulated in vmd. This includes the - * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. - * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - */ -void -vcpu_exit_inout(struct vm_run_params *vrp) -{ - struct vm_exit *vei = vrp->vrp_exit; - uint8_t intr = 0xFF; - - if (vei->vei.vei_rep || vei->vei.vei_string) { -#ifdef MMIO_DEBUG - log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x", - __func__, - vei->vei.vei_rep == 0 ? "" : "REP ", - vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT", - vei->vei.vei_string == 0 ? 
"" : "S", - vei->vei.vei_size, vei->vei.vei_encoding, - vei->vei.vei_data, vei->vei.vei_port); - log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx", - __func__, - vei->vrs.vrs_gprs[VCPU_REGS_RCX], - vei->vrs.vrs_gprs[VCPU_REGS_RDX], - vei->vrs.vrs_gprs[VCPU_REGS_RSI]); -#endif /* MMIO_DEBUG */ - fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)", - __func__); - } - - if (ioports_map[vei->vei.vei_port] != NULL) - intr = ioports_map[vei->vei.vei_port](vrp); - else if (vei->vei.vei_dir == VEI_DIR_IN) - set_return_data(vei, 0xFFFFFFFF); - - vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len; - - if (intr != 0xFF) - vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); -} - -/* - * vcpu_exit_eptviolation - * - * handle an EPT Violation - * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - * - * Return values: - * 0: no action required - * EFAULT: a protection fault occured, kill the vm. - */ -int -vcpu_exit_eptviolation(struct vm_run_params *vrp) -{ - struct vm_exit *ve = vrp->vrp_exit; - int ret = 0; -#if MMIO_NOTYET - struct x86_insn insn; - uint64_t va, pa; - size_t len = 15; /* Max instruction length in x86. */ -#endif /* MMIO_NOTYET */ - switch (ve->vee.vee_fault_type) { - case VEE_FAULT_HANDLED: - break; - -#if MMIO_NOTYET - case VEE_FAULT_MMIO_ASSIST: - /* Intel VMX might give us the length of the instruction. */ - if (ve->vee.vee_insn_info & VEE_LEN_VALID) - len = ve->vee.vee_insn_len; - - if (len > 15) - fatalx("%s: invalid instruction length %lu", __func__, - len); - - /* If we weren't given instruction bytes, we need to fetch. */ - if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) { - memset(ve->vee.vee_insn_bytes, 0, - sizeof(ve->vee.vee_insn_bytes)); - va = ve->vrs.vrs_gprs[VCPU_REGS_RIP]; - - /* XXX Only support instructions that fit on 1 page. 
*/ - if ((va & PAGE_MASK) + len > PAGE_SIZE) { - log_warnx("%s: instruction might cross page " - "boundary", __func__); - ret = EINVAL; - break; - } - - ret = translate_gva(ve, va, &pa, PROT_EXEC); - if (ret != 0) { - log_warnx("%s: failed gva translation", - __func__); - break; - } - - ret = read_mem(pa, ve->vee.vee_insn_bytes, len); - if (ret != 0) { - log_warnx("%s: failed to fetch instruction " - "bytes from 0x%llx", __func__, pa); - break; - } - } - - ret = insn_decode(ve, &insn); - if (ret == 0) - ret = insn_emulate(ve, &insn); - break; -#endif /* MMIO_NOTYET */ - - case VEE_FAULT_PROTECT: - log_debug("%s: EPT Violation: rip=0x%llx", __progname, - ve->vrs.vrs_gprs[VCPU_REGS_RIP]); - ret = EFAULT; - break; - - default: - fatalx("%s: invalid fault_type %d", __progname, - ve->vee.vee_fault_type); - /* UNREACHED */ - } - - return (ret); -} - -/* - * vcpu_exit - * - * Handle a vcpu exit. This function is called when it is determined that - * vmm(4) requires the assistance of vmd to support a particular guest - * exit type (eg, accessing an I/O port or device). Guest state is contained - * in 'vrp', and will be resent to vmm(4) on exit completion. - * - * Upon conclusion of handling the exit, the function determines if any - * interrupts should be injected into the guest, and asserts the proper - * IRQ line whose interrupt should be vectored. - * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - * - * Return values: - * 0: the exit was handled successfully - * 1: an error occurred (eg, unknown exit reason passed in 'vrp') - */ -int -vcpu_exit(struct vm_run_params *vrp) -{ - int ret; - - switch (vrp->vrp_exit_reason) { - case VMX_EXIT_INT_WINDOW: - case SVM_VMEXIT_VINTR: - case VMX_EXIT_CPUID: - case VMX_EXIT_EXTINT: - case SVM_VMEXIT_INTR: - case SVM_VMEXIT_MSR: - case SVM_VMEXIT_CPUID: - /* - * We may be exiting to vmd to handle a pending interrupt but - * at the same time the last exit type may have been one of - * these. 
In this case, there's nothing extra to be done - * here (and falling through to the default case below results - * in more vmd log spam). - */ - break; - case SVM_VMEXIT_NPF: - case VMX_EXIT_EPT_VIOLATION: - ret = vcpu_exit_eptviolation(vrp); - if (ret) - return (ret); - break; - case VMX_EXIT_IO: - case SVM_VMEXIT_IOIO: - vcpu_exit_inout(vrp); - break; - case VMX_EXIT_HLT: - case SVM_VMEXIT_HLT: - mutex_lock(&vm_mtx); - vcpu_hlt[vrp->vrp_vcpu_id] = 1; - mutex_unlock(&vm_mtx); - break; - case VMX_EXIT_TRIPLE_FAULT: - case SVM_VMEXIT_SHUTDOWN: - /* reset VM */ - return (EAGAIN); - default: - log_debug("%s: unknown exit reason 0x%x", - __progname, vrp->vrp_exit_reason); - } - - return (0); -} - -/* - * find_gpa_range - * - * Search for a contiguous guest physical mem range. - * - * Parameters: - * vcp: VM create parameters that contain the memory map to search in - * gpa: the starting guest physical address - * len: the length of the memory range - * - * Return values: - * NULL: on failure if there is no memory range as described by the parameters - * Pointer to vm_mem_range that contains the start of the range otherwise. - */ -static struct vm_mem_range * -find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) -{ - size_t i, n; - struct vm_mem_range *vmr; - - /* Find the first vm_mem_range that contains gpa */ - for (i = 0; i < vcp->vcp_nmemranges; i++) { - vmr = &vcp->vcp_memranges[i]; - if (gpa < vmr->vmr_gpa + vmr->vmr_size) - break; - } - - /* No range found. */ - if (i == vcp->vcp_nmemranges) - return (NULL); - - /* - * vmr may cover the range [gpa, gpa + len) only partly. Make - * sure that the following vm_mem_ranges are contiguous and - * cover the rest. 
- */ - n = vmr->vmr_size - (gpa - vmr->vmr_gpa); - if (len < n) - len = 0; - else - len -= n; - gpa = vmr->vmr_gpa + vmr->vmr_size; - for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { - vmr = &vcp->vcp_memranges[i]; - if (gpa != vmr->vmr_gpa) - return (NULL); - if (len <= vmr->vmr_size) - len = 0; - else - len -= vmr->vmr_size; - - gpa = vmr->vmr_gpa + vmr->vmr_size; - } - - if (len != 0) - return (NULL); - - return (vmr); -} - -/* - * write_mem - * - * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. - * - * Parameters: - * dst: the destination paddr_t in the guest VM - * buf: data to copy (or NULL to zero the data) - * len: number of bytes to copy - * - * Return values: - * 0: success - * EINVAL: if the guest physical memory range [dst, dst + len) does not - * exist in the guest. - */ -int -write_mem(paddr_t dst, const void *buf, size_t len) -{ - const char *from = buf; - char *to; - size_t n, off; - struct vm_mem_range *vmr; - - vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len); - if (vmr == NULL) { - errno = EINVAL; - log_warn("%s: failed - invalid memory range dst = 0x%lx, " - "len = 0x%zx", __func__, dst, len); - return (EINVAL); - } - - off = dst - vmr->vmr_gpa; - while (len != 0) { - n = vmr->vmr_size - off; - if (len < n) - n = len; - - to = (char *)vmr->vmr_va + off; - if (buf == NULL) - memset(to, 0, n); - else { - memcpy(to, from, n); - from += n; - } - len -= n; - off = 0; - vmr++; - } - - return (0); -} - -/* - * read_mem - * - * Reads memory at guest paddr 'src' into 'buf'. - * - * Parameters: - * src: the source paddr_t in the guest VM to read from. - * buf: destination (local) buffer - * len: number of bytes to read - * - * Return values: - * 0: success - * EINVAL: if the guest physical memory range [dst, dst + len) does not - * exist in the guest.
- */ -int -read_mem(paddr_t src, void *buf, size_t len) -{ - char *from, *to = buf; - size_t n, off; - struct vm_mem_range *vmr; - - vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len); - if (vmr == NULL) { - errno = EINVAL; - log_warn("%s: failed - invalid memory range src = 0x%lx, " - "len = 0x%zx", __func__, src, len); - return (EINVAL); - } - - off = src - vmr->vmr_gpa; - while (len != 0) { - n = vmr->vmr_size - off; - if (len < n) - n = len; - - from = (char *)vmr->vmr_va + off; - memcpy(to, from, n); - - to += n; - len -= n; - off = 0; - vmr++; - } - - return (0); -} - -/* - * hvaddr_mem - * - * Translate a guest physical address to a host virtual address, checking the - * provided memory range length to confirm it's contiguous within the same - * guest memory range (vm_mem_range). - * - * Parameters: - * gpa: guest physical address to translate - * len: number of bytes in the intended range - * - * Return values: - * void* to host virtual memory on success - * NULL on error, setting errno to: - * EFAULT: gpa falls outside guest memory ranges - * EINVAL: requested len extends beyond memory range - */ -void * -hvaddr_mem(paddr_t gpa, size_t len) -{ - struct vm_mem_range *vmr; - size_t off; - - vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len); - if (vmr == NULL) { - log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa); - errno = EFAULT; - return (NULL); - } - - off = gpa - vmr->vmr_gpa; - if (len > (vmr->vmr_size - off)) { - log_warnx("%s: failed - invalid memory range: gpa=0x%lx, " - "len=%zu", __func__, gpa, len); - errno = EINVAL; - return (NULL); - } - - return ((char *)vmr->vmr_va + off); -} - -/* - * vcpu_assert_pic_irq - * - * Injects the specified IRQ on the supplied vcpu/vm - * - * Parameters: - * vm_id: VM ID to inject to - * vcpu_id: VCPU ID to inject to - * irq: IRQ to inject - */ -void -vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) -{ - int ret; - - i8259_assert_irq(irq); - - if (i8259_is_pending()) { -
if (vcpu_pic_intr(vm_id, vcpu_id, 1)) - fatalx("%s: can't assert INTR", __func__); - - mutex_lock(&vm_mtx); - vcpu_hlt[vcpu_id] = 0; - mutex_unlock(&vm_mtx); - - mutex_lock(&vcpu_run_mtx[vcpu_id]); - ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); - if (ret) - fatalx("%s: can't signal (%d)", __func__, ret); - mutex_unlock(&vcpu_run_mtx[vcpu_id]); - } -} - -/* - * vcpu_deassert_pic_irq - * - * Clears the specified IRQ on the supplied vcpu/vm - * - * Parameters: - * vm_id: VM ID to clear in - * vcpu_id: VCPU ID to clear in - * irq: IRQ to clear - */ -void -vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) -{ - i8259_deassert_irq(irq); - - if (!i8259_is_pending()) { - if (vcpu_pic_intr(vm_id, vcpu_id, 0)) - fatalx("%s: can't deassert INTR for vm_id %d, " - "vcpu_id %d", __func__, vm_id, vcpu_id); - } -} - /* * fd_hasdata * @@ -2258,203 +1278,6 @@ mutex_unlock(pthread_mutex_t *m) } } -/* - * set_return_data - * - * Utility function for manipulating register data in vm exit info structs. This - * function ensures that the data is copied to the vei->vei.vei_data field with - * the proper size for the operation being performed. - * - * Parameters: - * vei: exit information - * data: return data - */ -void -set_return_data(struct vm_exit *vei, uint32_t data) -{ - switch (vei->vei.vei_size) { - case 1: - vei->vei.vei_data &= ~0xFF; - vei->vei.vei_data |= (uint8_t)data; - break; - case 2: - vei->vei.vei_data &= ~0xFFFF; - vei->vei.vei_data |= (uint16_t)data; - break; - case 4: - vei->vei.vei_data = data; - break; - } -} - -/* - * get_input_data - * - * Utility function for manipulating register data in vm exit info - * structs. This function ensures that the data is copied from the - * vei->vei.vei_data field with the proper size for the operation being - * performed. 
- * - * Parameters: - * vei: exit information - * data: location to store the result - */ -void -get_input_data(struct vm_exit *vei, uint32_t *data) -{ - switch (vei->vei.vei_size) { - case 1: - *data &= 0xFFFFFF00; - *data |= (uint8_t)vei->vei.vei_data; - break; - case 2: - *data &= 0xFFFF0000; - *data |= (uint16_t)vei->vei.vei_data; - break; - case 4: - *data = vei->vei.vei_data; - break; - default: - log_warnx("%s: invalid i/o size %d", __func__, - vei->vei.vei_size); - } - -} - -/* - * translate_gva - * - * Translates a guest virtual address to a guest physical address by walking - * the currently active page table (if needed). - * - * XXX ensure translate_gva updates the A bit in the PTE - * XXX ensure translate_gva respects segment base and limits in i386 mode - * XXX ensure translate_gva respects segment wraparound in i8086 mode - * XXX ensure translate_gva updates the A bit in the segment selector - * XXX ensure translate_gva respects CR4.LMSLE if available - * - * Parameters: - * exit: The VCPU this translation should be performed for (guest MMU settings - * are gathered from this VCPU) - * va: virtual address to translate - * pa: pointer to paddr_t variable that will receive the translated physical - * address. 'pa' is unchanged on error. - * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which - * the address should be translated - * - * Return values: - * 0: the address was successfully translated - 'pa' contains the physical - * address currently mapped by 'va'. - * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case - * and %cr2 set in the vcpu structure. 
- * EINVAL: an error occurred reading paging table structures - */ -int -translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode) -{ - int level, shift, pdidx; - uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask; - uint64_t shift_width, pte_size; - struct vcpu_reg_state *vrs; - - vrs = &exit->vrs; - - if (!pa) - return (EINVAL); - - if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) { - log_debug("%s: unpaged, va=pa=0x%llx", __func__, va); - *pa = va; - return (0); - } - - pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3]; - - log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__, - vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]); - - if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) { - if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) { - pte_size = sizeof(uint64_t); - shift_width = 9; - - if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) { - /* 4 level paging */ - level = 4; - mask = L4_MASK; - shift = L4_SHIFT; - } else { - /* 32 bit with PAE paging */ - level = 3; - mask = L3_MASK; - shift = L3_SHIFT; - } - } else { - /* 32 bit paging */ - level = 2; - shift_width = 10; - mask = 0xFFC00000; - shift = 22; - pte_size = sizeof(uint32_t); - } - } else - return (EINVAL); - - /* XXX: Check for R bit in segment selector and set A bit */ - - for (;level > 0; level--) { - pdidx = (va & mask) >> shift; - pte_paddr = (pt_paddr) + (pdidx * pte_size); - - log_debug("%s: read pte level %d @ GPA 0x%llx", __func__, - level, pte_paddr); - if (read_mem(pte_paddr, &pte, pte_size)) { - log_warn("%s: failed to read pte", __func__); - return (EFAULT); - } - - log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr, - pte); - - /* XXX: Set CR2 */ - if (!(pte & PG_V)) - return (EFAULT); - - /* XXX: Check for SMAP */ - if ((mode == PROT_WRITE) && !(pte & PG_RW)) - return (EPERM); - - if ((exit->cpl > 0) && !(pte & PG_u)) - return (EPERM); - - pte = pte | PG_U; - if (mode == PROT_WRITE) - pte = pte | PG_M; - if (write_mem(pte_paddr, &pte, pte_size)) { - log_warn("%s: failed to 
write back flags to pte", - __func__); - return (EIO); - } - - /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */ - if (pte & PG_PS) - break; - - if (level > 1) { - pt_paddr = pte & PG_FRAME; - shift -= shift_width; - mask = mask >> shift_width; - } - } - - low_mask = (1 << shift) - 1; - high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask; - *pa = (pte & high_mask) | (va & low_mask); - - log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa); - - return (0); -} void vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *)) @@ -2619,3 +1442,31 @@ remap_guest_mem(struct vmd_vm *vm, int vmm_fd) return (0); } + +void +vcpu_halt(uint32_t vcpu_id) +{ + mutex_lock(&vm_mtx); + vcpu_hlt[vcpu_id] = 1; + mutex_unlock(&vm_mtx); +} + +void +vcpu_unhalt(uint32_t vcpu_id) + { + mutex_lock(&vm_mtx); + vcpu_hlt[vcpu_id] = 0; + mutex_unlock(&vm_mtx); +} + +void +vcpu_signal_run(uint32_t vcpu_id) +{ + int ret; + + mutex_lock(&vcpu_run_mtx[vcpu_id]); + ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); + if (ret) + fatalx("%s: can't signal (%d)", __func__, ret); + mutex_unlock(&vcpu_run_mtx[vcpu_id]); +} diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c index 3c053ae08a2..232bc82d8d2 100644 --- a/usr.sbin/vmd/vmd.c +++ b/usr.sbin/vmd/vmd.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.c,v 1.158 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: vmd.c,v 1.159 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Reyk Floeter @@ -41,7 +41,6 @@ #include #include -#include #include "proc.h" #include "atomicio.h" @@ -613,134 +612,6 @@ vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg) return (0); } -int -vmd_check_vmh(struct vm_dump_header *vmh) -{ - int i; - unsigned int code, leaf; - unsigned int a, b, c, d; - - if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) { - log_warnx("%s: incompatible dump signature", __func__); - return (-1); - } - - if (vmh->vmh_version != VM_DUMP_VERSION) { - 
log_warnx("%s: incompatible dump version", __func__); - return (-1); - } - - for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { - code = vmh->vmh_cpuids[i].code; - leaf = vmh->vmh_cpuids[i].leaf; - if (leaf != 0x00) { - log_debug("%s: invalid leaf 0x%x for code 0x%x", - __func__, leaf, code); - return (-1); - } - - switch (code) { - case 0x00: - CPUID_LEAF(code, leaf, a, b, c, d); - if (vmh->vmh_cpuids[i].a > a) { - log_debug("%s: incompatible cpuid level", - __func__); - return (-1); - } - if (!(vmh->vmh_cpuids[i].b == b && - vmh->vmh_cpuids[i].c == c && - vmh->vmh_cpuids[i].d == d)) { - log_debug("%s: incompatible cpu brand", - __func__); - return (-1); - } - break; - - case 0x01: - CPUID_LEAF(code, leaf, a, b, c, d); - if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) != - (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) { - log_debug("%s: incompatible cpu features " - "code: 0x%x leaf: 0x%x reg: c", __func__, - code, leaf); - return (-1); - } - if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) != - (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) { - log_debug("%s: incompatible cpu features " - "code: 0x%x leaf: 0x%x reg: d", __func__, - code, leaf); - return (-1); - } - break; - - case 0x07: - CPUID_LEAF(code, leaf, a, b, c, d); - if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) != - (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) { - log_debug("%s: incompatible cpu features " - "code: 0x%x leaf: 0x%x reg: c", __func__, - code, leaf); - return (-1); - } - if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) != - (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) { - log_debug("%s: incompatible cpu features " - "code: 0x%x leaf: 0x%x reg: d", __func__, - code, leaf); - return (-1); - } - break; - - case 0x0d: - CPUID_LEAF(code, leaf, a, b, c, d); - if (vmh->vmh_cpuids[i].b > b) { - log_debug("%s: incompatible cpu: insufficient " - "max save area for enabled XCR0 features", - __func__); - return (-1); - } - if (vmh->vmh_cpuids[i].c > c) { - log_debug("%s: incompatible cpu: 
insufficient " - "max save area for supported XCR0 features", - __func__); - return (-1); - } - break; - - case 0x80000001: - CPUID_LEAF(code, leaf, a, b, c, d); - if ((vmh->vmh_cpuids[i].a & a) != - vmh->vmh_cpuids[i].a) { - log_debug("%s: incompatible cpu features " - "code: 0x%x leaf: 0x%x reg: a", __func__, - code, leaf); - return (-1); - } - if ((vmh->vmh_cpuids[i].c & c) != - vmh->vmh_cpuids[i].c) { - log_debug("%s: incompatible cpu features " - "code: 0x%x leaf: 0x%x reg: c", __func__, - code, leaf); - return (-1); - } - if ((vmh->vmh_cpuids[i].d & d) != - vmh->vmh_cpuids[i].d) { - log_debug("%s: incompatible cpu features " - "code: 0x%x leaf: 0x%x reg: d", __func__, - code, leaf); - return (-1); - } - break; - - default: - log_debug("%s: unknown code 0x%x", __func__, code); - return (-1); - } - } - - return (0); -} void vmd_sighdlr(int sig, short event, void *arg) diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h index 4f1b05e7058..2f2056541c8 100644 --- a/usr.sbin/vmd/vmd.h +++ b/usr.sbin/vmd/vmd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.h,v 1.126 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: vmd.h,v 1.127 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -43,6 +43,9 @@ #define nitems(_a) (sizeof((_a)) / sizeof((_a)[0])) +#define MB(x) (x * 1024UL * 1024UL) +#define GB(x) (x * 1024UL * 1024UL * 1024UL) + #define VMD_USER "_vmd" #define VMD_CONF "/etc/vm.conf" #define SOCKET_NAME "/var/run/vmd.sock" @@ -492,21 +495,51 @@ int opentap(char *); int fd_hasdata(int); int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); -/* vm.c */ +/* {mach}_vm.c (md interface) */ +void create_memory_map(struct vm_create_params *); +int load_firmware(struct vmd_vm *, struct vcpu_reg_state *); +void init_emulated_hw(struct vmop_create_params *, int, + int[][VM_MAX_BASE_PER_DISK], int *); +void restore_emulated_hw(struct vm_create_params *vcp, int, int *, + int[][VM_MAX_BASE_PER_DISK], int); +int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state 
*); +void pause_vm_md(struct vmd_vm *); +void unpause_vm_md(struct vmd_vm *); +int dump_devs(int); +int dump_send_header(int); +void *hvaddr_mem(paddr_t, size_t); +int write_mem(paddr_t, const void *, size_t); +int read_mem(paddr_t, void *, size_t); +int intr_ack(struct vmd_vm *); +int intr_pending(struct vmd_vm *); +void intr_toggle_el(struct vmd_vm *, int, int); +void vcpu_assert_irq(uint32_t, uint32_t, int); +void vcpu_deassert_irq(uint32_t, uint32_t, int); +int vcpu_exit(struct vm_run_params *); +uint8_t vcpu_exit_pci(struct vm_run_params *); + +#ifdef __amd64__ +/* x86 io functions in x86_vm.c */ +void set_return_data(struct vm_exit *, uint32_t); +void get_input_data(struct vm_exit *, uint32_t *); +#endif /* __amd64__ */ + +/* vm.c (mi functions) */ +void vcpu_halt(uint32_t); +void vcpu_unhalt(uint32_t); +void vcpu_signal_run(uint32_t); +int vcpu_intr(uint32_t, uint32_t, uint8_t); void vm_main(int, int); void mutex_lock(pthread_mutex_t *); void mutex_unlock(pthread_mutex_t *); -int read_mem(paddr_t, void *buf, size_t); -int start_vm(struct vmd_vm *, int); -__dead void vm_shutdown(unsigned int); +int vmd_check_vmh(struct vm_dump_header *); void vm_pipe_init(struct vm_dev_pipe *, void (*)(int, short, void *)); void vm_pipe_init2(struct vm_dev_pipe *, void (*)(int, short, void *), void *); void vm_pipe_send(struct vm_dev_pipe *, enum pipe_msg_type); enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *); -int write_mem(paddr_t, const void *buf, size_t); -void* hvaddr_mem(paddr_t, size_t); int remap_guest_mem(struct vmd_vm *, int); +__dead void vm_shutdown(unsigned int); /* config.c */ int config_init(struct vmd *); diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c index 70c94c0dff8..6a98e43f751 100644 --- a/usr.sbin/vmd/vmm.c +++ b/usr.sbin/vmd/vmm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.c,v 1.120 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: vmm.c,v 1.121 2024/07/10 09:27:33 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -30,9 +30,6 @@ #include #include
-#include -#include - #include #include @@ -50,7 +47,6 @@ #include #include "vmd.h" -#include "vmm.h" #include "atomicio.h" #include "proc.h" -- 2.20.1