From: dv Date: Wed, 10 Jul 2024 10:41:19 +0000 (+0000) Subject: Missed some files in previous commit to split vmd into mi/md. X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=234ee546287049cd0c06003ce677a6226d75654b;p=openbsd Missed some files in previous commit to split vmd into mi/md. Forgot `cvs add` and sys/dev/vmm/vmm.h changes. --- diff --git a/sys/arch/arm64/include/vmmvar.h b/sys/arch/arm64/include/vmmvar.h new file mode 100644 index 00000000000..76afc5cd864 --- /dev/null +++ b/sys/arch/arm64/include/vmmvar.h @@ -0,0 +1,91 @@ +/* $OpenBSD: vmmvar.h,v 1.1 2024/07/10 10:41:19 dv Exp $ */ +/* + * Copyright (c) 2014 Mike Larkin + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * CPU capabilities for VMM operation + */ +#ifndef _MACHINE_VMMVAR_H_ +#define _MACHINE_VMMVAR_H_ + +#define VMM_HV_SIGNATURE "OpenBSDVMM58" + +#define VMM_PCI_MMIO_BAR_BASE 0xF0000000ULL +#define VMM_PCI_MMIO_BAR_END 0xFFDFFFFFULL /* 2 MiB below 4 GiB */ + +/* Exit Reasons */ +#define VM_EXIT_TERMINATED 0xFFFE +#define VM_EXIT_NONE 0xFFFF + +struct vmm_softc_md { + /* Capabilities */ + uint32_t nr_cpus; /* [I] */ +}; + +/* + * struct vcpu_inject_event : describes an exception or interrupt to inject. + */ +struct vcpu_inject_event { + uint8_t vie_vector; /* Exception or interrupt vector. */ + uint32_t vie_errorcode; /* Optional error code. */ + uint8_t vie_type; +#define VCPU_INJECT_NONE 0 +#define VCPU_INJECT_INTR 1 /* External hardware interrupt. */ +#define VCPU_INJECT_EX 2 /* HW or SW Exception */ +#define VCPU_INJECT_NMI 3 /* Non-maskable Interrupt */ +}; + +#define VCPU_REGS_NGPRS 31 + +struct vcpu_reg_state { + uint64_t vrs_gprs[VCPU_REGS_NGPRS]; +}; + +/* + * struct vm_exit + * + * Contains VM exit information communicated to vmd(8). This information is + * gathered by vmm(4) from the CPU on each exit that requires help from vmd. + */ +struct vm_exit { + struct vcpu_reg_state vrs; +}; + +struct vm_intr_params { + /* Input parameters to VMM_IOC_INTR */ + uint32_t vip_vm_id; + uint32_t vip_vcpu_id; + uint16_t vip_intr; +}; + +#define VM_RWREGS_GPRS 0x1 /* read/write GPRs */ +#define VM_RWREGS_ALL (VM_RWREGS_GPRS) + +struct vm_rwregs_params { + /* + * Input/output parameters to VMM_IOC_READREGS / + * VMM_IOC_WRITEREGS + */ + uint32_t vrwp_vm_id; + uint32_t vrwp_vcpu_id; + uint64_t vrwp_mask; + struct vcpu_reg_state vrwp_regs; +}; + +/* IOCTL definitions */ +#define VMM_IOC_INTR _IOW('V', 6, struct vm_intr_params) /* Intr pending */ + +#endif /* ! 
_MACHINE_VMMVAR_H_ */ diff --git a/sys/dev/vmm/vmm.h b/sys/dev/vmm/vmm.h index 3ec2d20fe93..ac682bc2c45 100644 --- a/sys/dev/vmm/vmm.h +++ b/sys/dev/vmm/vmm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.h,v 1.5 2024/07/09 09:31:37 dv Exp $ */ +/* $OpenBSD: vmm.h,v 1.6 2024/07/10 10:41:19 dv Exp $ */ /* * Copyright (c) 2014-2023 Mike Larkin * @@ -108,6 +108,20 @@ struct vm_run_params { uint8_t vrp_irqready; /* ready for IRQ on entry */ }; +#define VM_RWVMPARAMS_PVCLOCK_SYSTEM_GPA 0x1 /* read/write pvclock gpa */ +#define VM_RWVMPARAMS_PVCLOCK_VERSION 0x2 /* read/write pvclock version */ +#define VM_RWVMPARAMS_ALL (VM_RWVMPARAMS_PVCLOCK_SYSTEM_GPA | \ + VM_RWVMPARAMS_PVCLOCK_VERSION) + +struct vm_rwvmparams_params { + /* Input parameters to VMM_IOC_READVMPARAMS/VMM_IOC_WRITEVMPARAMS */ + uint32_t vpp_vm_id; + uint32_t vpp_vcpu_id; + uint32_t vpp_mask; + paddr_t vpp_pvclock_system_gpa; + uint32_t vpp_pvclock_version; +}; + /* IOCTL definitions */ #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */ #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */ @@ -225,6 +239,7 @@ void vm_teardown(struct vm **); int vm_get_info(struct vm_info_params *); int vm_terminate(struct vm_terminate_params *); int vm_resetcpu(struct vm_resetcpu_params *); +int vm_rwvmparams(struct vm_rwvmparams_params *, int); int vcpu_must_stop(struct vcpu *); int vm_share_mem(struct vm_sharemem_params *, struct proc *); int vm_run(struct vm_run_params *); diff --git a/usr.sbin/vmd/arm64_vm.c b/usr.sbin/vmd/arm64_vm.c new file mode 100644 index 00000000000..282dbcb4985 --- /dev/null +++ b/usr.sbin/vmd/arm64_vm.c @@ -0,0 +1,162 @@ +/* $OpenBSD: arm64_vm.c,v 1.1 2024/07/10 10:41:19 dv Exp $ */ +/* + * Copyright (c) 2024 Dave Voutila + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#include + +#include "vmd.h" + +void +create_memory_map(struct vm_create_params *vcp) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ +} + +int +load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +void +init_emulated_hw(struct vmop_create_params *vcp, int child_cdrom, + int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ +} + +void +restore_emulated_hw(struct vm_create_params *vcp, int fd, int *child_taps, + int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ +} + +void +pause_vm_md(struct vmd_vm *vm) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ +} + +void +unpause_vm_md(struct vmd_vm *vm) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ +} + +int +dump_devs(int fd) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +int +dump_send_header(int fd) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +void * +hvaddr_mem(paddr_t gpa, size_t len) +{ fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (NULL); +} + +int +write_mem(paddr_t dst, const void *buf, size_t len) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +int +read_mem(paddr_t src, void *buf, size_t len) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +int +intr_pending(struct vmd_vm *vm) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +void +intr_toggle_el(struct vmd_vm *vm, int irq, int val) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ +} + +int +intr_ack(struct vmd_vm *vm) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +void +vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) +{ + fatalx("%s: unimplemented", __func__); +} + +void +vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) +{ + fatalx("%s: unimplemented", __func__); +} + +int +vmd_check_vmh(struct vm_dump_header *vmh) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +int +vcpu_exit(struct vm_run_params *vrp) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (-1); +} + +uint8_t +vcpu_exit_pci(struct vm_run_params *vrp) +{ + fatalx("%s: unimplemented", __func__); + /* NOTREACHED */ + return (0xff); +} diff --git a/usr.sbin/vmd/x86_mmio.c b/usr.sbin/vmd/x86_mmio.c new file mode 100644 index 00000000000..381df30e471 --- /dev/null +++ b/usr.sbin/vmd/x86_mmio.c @@ -0,0 +1,1045 @@ +/* $OpenBSD: x86_mmio.c,v 1.1 2024/07/10 10:41:19 dv Exp $ */ +/* + * Copyright (c) 2022 Dave Voutila + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include +#include + +#include "vmd.h" +#include "mmio.h" + +#define MMIO_DEBUG 0 + +extern char* __progname; + +struct x86_decode_state { + uint8_t s_bytes[15]; + size_t s_len; + size_t s_idx; +}; + +enum decode_result { + DECODE_ERROR = 0, /* Something went wrong. */ + DECODE_DONE, /* Decode success and no more work needed. */ + DECODE_MORE, /* Decode success and more work required. */ +}; + +static const char *str_cpu_mode(int); +static const char *str_decode_res(enum decode_result); +static const char *str_opcode(struct x86_opcode *); +static const char *str_operand_enc(struct x86_opcode *); +static const char *str_reg(int); +static const char *str_sreg(int); +static int detect_cpu_mode(struct vcpu_reg_state *); + +static enum decode_result decode_prefix(struct x86_decode_state *, + struct x86_insn *); +static enum decode_result decode_opcode(struct x86_decode_state *, + struct x86_insn *); +static enum decode_result decode_modrm(struct x86_decode_state *, + struct x86_insn *); +static int get_modrm_reg(struct x86_insn *); +static int get_modrm_addr(struct x86_insn *, struct vcpu_reg_state *vrs); +static enum decode_result decode_disp(struct x86_decode_state *, + struct x86_insn *); +static enum decode_result decode_sib(struct x86_decode_state *, + struct x86_insn *); +static enum decode_result decode_imm(struct x86_decode_state *, + struct x86_insn *); + +static enum decode_result peek_byte(struct x86_decode_state *, uint8_t *); +static enum decode_result next_byte(struct x86_decode_state *, uint8_t *); +static enum decode_result next_value(struct x86_decode_state *, size_t, + uint64_t *); +static int is_valid_state(struct x86_decode_state *, const char *); + +static int emulate_mov(struct x86_insn *, struct vm_exit *); +static int emulate_movzx(struct x86_insn *, struct vm_exit *); + +/* Lookup table for 1-byte opcodes, in opcode alphabetical order. */ +const enum x86_opcode_type x86_1byte_opcode_tbl[255] = { + /* MOV */ + [0x88] = OP_MOV, + [0x89] = OP_MOV, + [0x8A] = OP_MOV, + [0x8B] = OP_MOV, + [0x8C] = OP_MOV, + [0xA0] = OP_MOV, + [0xA1] = OP_MOV, + [0xA2] = OP_MOV, + [0xA3] = OP_MOV, + + /* MOVS */ + [0xA4] = OP_UNSUPPORTED, + [0xA5] = OP_UNSUPPORTED, + + [ESCAPE] = OP_TWO_BYTE, +}; + +/* Lookup table for 1-byte operand encodings, in opcode alphabetical order. */ +const enum x86_operand_enc x86_1byte_operand_enc_tbl[255] = { + /* MOV */ + [0x88] = OP_ENC_MR, + [0x89] = OP_ENC_MR, + [0x8A] = OP_ENC_RM, + [0x8B] = OP_ENC_RM, + [0x8C] = OP_ENC_MR, + [0xA0] = OP_ENC_FD, + [0xA1] = OP_ENC_FD, + [0xA2] = OP_ENC_TD, + [0xA3] = OP_ENC_TD, + + /* MOVS */ + [0xA4] = OP_ENC_ZO, + [0xA5] = OP_ENC_ZO, +}; + +const enum x86_opcode_type x86_2byte_opcode_tbl[255] = { + /* MOVZX */ + [0xB6] = OP_MOVZX, + [0xB7] = OP_MOVZX, +}; + +const enum x86_operand_enc x86_2byte_operand_enc_table[255] = { + /* MOVZX */ + [0xB6] = OP_ENC_RM, + [0xB7] = OP_ENC_RM, +}; + +/* + * peek_byte + * + * Fetch the next byte fron the instruction bytes without advancing the + * position in the stream. 
+ *
+ * Return values:
+ *  DECODE_DONE: byte was found and is the last in the stream
+ *  DECODE_MORE: byte was found and there are more remaining to be read
+ *  DECODE_ERROR: state is invalid and no byte was found, *byte left unchanged
+ */
+static enum decode_result
+peek_byte(struct x86_decode_state *state, uint8_t *byte)
+{
+	enum decode_result res;
+
+	if (state == NULL)
+		return (DECODE_ERROR);
+
+	if (state->s_idx == state->s_len)
+		return (DECODE_ERROR);
+
+	if (state->s_idx + 1 == state->s_len)
+		res = DECODE_DONE;
+	else
+		res = DECODE_MORE;
+
+	if (byte != NULL)
+		*byte = state->s_bytes[state->s_idx];
+	return (res);
+}
+
+/*
+ * next_byte
+ *
+ * Fetch the next byte from the instruction bytes, advancing the position in
+ * the stream and mutating decode state.
+ *
+ * Return values:
+ *  DECODE_DONE: byte was found and is the last in the stream
+ *  DECODE_MORE: byte was found and there are more remaining to be read
+ *  DECODE_ERROR: state is invalid and no byte was found, *byte left unchanged
+ */
+static enum decode_result
+next_byte(struct x86_decode_state *state, uint8_t *byte)
+{
+	uint8_t next;
+
+	/* Cheat and see if we're going to fail. */
+	if (peek_byte(state, &next) == DECODE_ERROR)
+		return (DECODE_ERROR);
+
+	if (byte != NULL)
+		*byte = next;
+	state->s_idx++;
+
+	return (state->s_idx < state->s_len ? DECODE_MORE : DECODE_DONE);
+}
+
+/*
+ * Fetch the next `n' bytes as a single uint64_t value.
+ */
+static enum decode_result
+next_value(struct x86_decode_state *state, size_t n, uint64_t *value)
+{
+	uint8_t bytes[8];
+	size_t i;
+	enum decode_result res;
+
+	if (value == NULL)
+		return (DECODE_ERROR);
+
+	if (n == 0 || n > sizeof(bytes))
+		return (DECODE_ERROR);
+
+	memset(bytes, 0, sizeof(bytes));
+	for (i = 0; i < n; i++)
+		if ((res = next_byte(state, &bytes[i])) == DECODE_ERROR)
+			return (DECODE_ERROR);
+
+	*value = *((uint64_t*)bytes);
+
+	return (res);
+}
+
+/*
+ * is_valid_state
+ *
+ * Validate that the decode state looks viable.
+ *
+ * Returns:
+ *  1: if state is valid
+ *  0: if an invariant is violated
+ */
+static int
+is_valid_state(struct x86_decode_state *state, const char *fn_name)
+{
+	const char *s = (fn_name != NULL) ? fn_name : __func__;
+
+	if (state == NULL) {
+		log_warnx("%s: null state", s);
+		return (0);
+	}
+	if (state->s_len > sizeof(state->s_bytes)) {
+		log_warnx("%s: invalid length", s);
+		return (0);
+	}
+	if (state->s_idx + 1 > state->s_len) {
+		log_warnx("%s: invalid index", s);
+		return (0);
+	}
+
+	return (1);
+}
+
+#ifdef MMIO_DEBUG
+static void
+dump_regs(struct vcpu_reg_state *vrs)
+{
+	size_t i;
+	struct vcpu_segment_info *vsi;
+
+	for (i = 0; i < VCPU_REGS_NGPRS; i++)
+		log_info("%s: %s 0x%llx", __progname, str_reg(i),
+		    vrs->vrs_gprs[i]);
+
+	for (i = 0; i < VCPU_REGS_NSREGS; i++) {
+		vsi = &vrs->vrs_sregs[i];
+		log_info("%s: %s { sel: 0x%04x, lim: 0x%08x, ar: 0x%08x, "
+		    "base: 0x%llx }", __progname, str_sreg(i),
+		    vsi->vsi_sel, vsi->vsi_limit, vsi->vsi_ar, vsi->vsi_base);
+	}
+}
+
+static void
+dump_insn(struct x86_insn *insn)
+{
+	log_info("instruction { %s, enc=%s, len=%d, mod=0x%02x, ("
+	    "reg=%s, addr=0x%lx) sib=0x%02x }",
+	    str_opcode(&insn->insn_opcode),
+	    str_operand_enc(&insn->insn_opcode), insn->insn_bytes_len,
+	    insn->insn_modrm, str_reg(insn->insn_reg),
+	    insn->insn_gva, insn->insn_sib);
+}
+#endif /* MMIO_DEBUG */
+
+static const char *
+str_cpu_mode(int mode)
+{
+	switch (mode) {
+	case VMM_CPU_MODE_REAL: return "REAL";
+	case VMM_CPU_MODE_PROT: return "PROT";
+	case VMM_CPU_MODE_PROT32: return "PROT32";
+	case VMM_CPU_MODE_COMPAT: return "COMPAT";
+	case VMM_CPU_MODE_LONG: return "LONG";
+	default: return "UNKNOWN";
+	}
+}
+
+__unused static const char *
+str_decode_res(enum decode_result res) {
+	switch (res) {
+	case DECODE_DONE: return "DONE";
+	case DECODE_MORE: return "MORE";
+	case DECODE_ERROR: return "ERROR";
+	default: return "UNKNOWN";
+	}
+}
+
+static const char *
+str_opcode(struct x86_opcode *opcode)
+{
+	switch (opcode->op_type) {
+	case OP_IN: return "IN";
+	case OP_INS: return "INS";
+	case OP_MOV: return "MOV";
+	case OP_MOVZX: return "MOVZX";
+	case OP_OUT: return "OUT";
+	case OP_OUTS: return "OUTS";
+	case OP_UNSUPPORTED: return "UNSUPPORTED";
+	default: return "UNKNOWN";
+	}
+}
+
+static const char *
+str_operand_enc(struct x86_opcode *opcode)
+{
+	switch (opcode->op_encoding) {
+	case OP_ENC_I: return "I";
+	case OP_ENC_MI: return "MI";
+	case OP_ENC_MR: return "MR";
+	case OP_ENC_RM: return "RM";
+	case OP_ENC_FD: return "FD";
+	case OP_ENC_TD: return "TD";
+	case OP_ENC_OI: return "OI";
+	case OP_ENC_ZO: return "ZO";
+	default: return "UNKNOWN";
+	}
+}
+
+static const char *
+str_reg(int reg) {
+	switch (reg) {
+	case VCPU_REGS_RAX: return "RAX";
+	case VCPU_REGS_RCX: return "RCX";
+	case VCPU_REGS_RDX: return "RDX";
+	case VCPU_REGS_RBX: return "RBX";
+	case VCPU_REGS_RSI: return "RSI";
+	case VCPU_REGS_RDI: return "RDI";
+	case VCPU_REGS_R8: return " R8";
+	case VCPU_REGS_R9: return " R9";
+	case VCPU_REGS_R10: return "R10";
+	case VCPU_REGS_R11: return "R11";
+	case VCPU_REGS_R12: return "R12";
+	case VCPU_REGS_R13: return "R13";
+	case VCPU_REGS_R14: return "R14";
+	case VCPU_REGS_R15: return "R15";
+	case VCPU_REGS_RSP: return "RSP";
+	case VCPU_REGS_RBP: return "RBP";
+	case VCPU_REGS_RIP: return "RIP";
+	case VCPU_REGS_RFLAGS: return "RFLAGS";
+	default: return "UNKNOWN";
+	}
+}
+
+static const char *
+str_sreg(int sreg) {
+	switch (sreg) {
+	case VCPU_REGS_CS: return "CS";
+	case VCPU_REGS_DS: return "DS";
+	case VCPU_REGS_ES: return "ES";
+	case VCPU_REGS_FS: return "FS";
+	case VCPU_REGS_GS: return "GS";
+	case VCPU_REGS_SS: return "SS";
+	case VCPU_REGS_LDTR: return "LDTR";
+	case VCPU_REGS_TR: return "TR";
+	default: return "UNKNOWN";
+ } +} + +static int +detect_cpu_mode(struct vcpu_reg_state *vrs) +{ + uint64_t cr0, cr4, cs, efer, rflags; + + /* Is protected mode enabled? */ + cr0 = vrs->vrs_crs[VCPU_REGS_CR0]; + if (!(cr0 & CR0_PE)) + return (VMM_CPU_MODE_REAL); + + cr4 = vrs->vrs_crs[VCPU_REGS_CR4]; + cs = vrs->vrs_sregs[VCPU_REGS_CS].vsi_ar; + efer = vrs->vrs_msrs[VCPU_REGS_EFER]; + rflags = vrs->vrs_gprs[VCPU_REGS_RFLAGS]; + + /* Check for Long modes. */ + if ((efer & EFER_LME) && (cr4 & CR4_PAE) && (cr0 & CR0_PG)) { + if (cs & CS_L) { + /* Long Modes */ + if (!(cs & CS_D)) + return (VMM_CPU_MODE_LONG); + log_warnx("%s: invalid cpu mode", __progname); + return (VMM_CPU_MODE_UNKNOWN); + } else { + /* Compatibility Modes */ + if (cs & CS_D) /* XXX Add Compat32 mode */ + return (VMM_CPU_MODE_UNKNOWN); + return (VMM_CPU_MODE_COMPAT); + } + } + + /* Check for 32-bit Protected Mode. */ + if (cs & CS_D) + return (VMM_CPU_MODE_PROT32); + + /* Check for virtual 8086 mode. */ + if (rflags & EFLAGS_VM) { + /* XXX add Virtual8086 mode */ + log_warnx("%s: Virtual 8086 mode", __progname); + return (VMM_CPU_MODE_UNKNOWN); + } + + /* Can't determine mode. */ + log_warnx("%s: invalid cpu mode", __progname); + return (VMM_CPU_MODE_UNKNOWN); +} + +static enum decode_result +decode_prefix(struct x86_decode_state *state, struct x86_insn *insn) +{ + enum decode_result res = DECODE_ERROR; + struct x86_prefix *prefix; + uint8_t byte; + + if (!is_valid_state(state, __func__) || insn == NULL) + return (-1); + + prefix = &insn->insn_prefix; + memset(prefix, 0, sizeof(*prefix)); + + /* + * Decode prefixes. The last of its kind wins. The behavior is undefined + * in the Intel SDM (see Vol 2, 2.1.1 Instruction Prefixes.) + */ + while ((res = peek_byte(state, &byte)) != DECODE_ERROR) { + switch (byte) { + case LEG_1_LOCK: + case LEG_1_REPNE: + case LEG_1_REP: + prefix->pfx_group1 = byte; + break; + case LEG_2_CS: + case LEG_2_SS: + case LEG_2_DS: + case LEG_2_ES: + case LEG_2_FS: + case LEG_2_GS: + prefix->pfx_group2 = byte; + break; + case LEG_3_OPSZ: + prefix->pfx_group3 = byte; + break; + case LEG_4_ADDRSZ: + prefix->pfx_group4 = byte; + break; + case REX_BASE...REX_BASE + 0x0F: + if (insn->insn_cpu_mode == VMM_CPU_MODE_LONG) + prefix->pfx_rex = byte; + else /* INC encountered */ + return (DECODE_ERROR); + break; + case VEX_2_BYTE: + case VEX_3_BYTE: + log_warnx("%s: VEX not supported", __func__); + return (DECODE_ERROR); + default: + /* Something other than a valid prefix. */ + return (DECODE_MORE); + } + /* Advance our position. */ + next_byte(state, NULL); + } + + return (res); +} + +static enum decode_result +decode_modrm(struct x86_decode_state *state, struct x86_insn *insn) +{ + enum decode_result res; + uint8_t byte = 0; + + if (!is_valid_state(state, __func__) || insn == NULL) + return (DECODE_ERROR); + + insn->insn_modrm_valid = 0; + + /* Check the operand encoding to see if we fetch a byte or abort. */ + switch (insn->insn_opcode.op_encoding) { + case OP_ENC_MR: + case OP_ENC_RM: + case OP_ENC_MI: + res = next_byte(state, &byte); + if (res == DECODE_ERROR) { + log_warnx("%s: failed to get modrm byte", __func__); + break; + } + insn->insn_modrm = byte; + insn->insn_modrm_valid = 1; + break; + + case OP_ENC_I: + case OP_ENC_OI: + log_warnx("%s: instruction does not need memory assist", + __func__); + res = DECODE_ERROR; + break; + + default: + /* Peek to see if we're done decode. 
*/ + res = peek_byte(state, NULL); + } + + return (res); +} + +static int +get_modrm_reg(struct x86_insn *insn) +{ + if (insn == NULL) + return (-1); + + if (insn->insn_modrm_valid) { + switch (MODRM_REGOP(insn->insn_modrm)) { + case 0: + insn->insn_reg = VCPU_REGS_RAX; + break; + case 1: + insn->insn_reg = VCPU_REGS_RCX; + break; + case 2: + insn->insn_reg = VCPU_REGS_RDX; + break; + case 3: + insn->insn_reg = VCPU_REGS_RBX; + break; + case 4: + insn->insn_reg = VCPU_REGS_RSP; + break; + case 5: + insn->insn_reg = VCPU_REGS_RBP; + break; + case 6: + insn->insn_reg = VCPU_REGS_RSI; + break; + case 7: + insn->insn_reg = VCPU_REGS_RDI; + break; + } + } + + /* REX R bit selects extended registers in LONG mode. */ + if (insn->insn_prefix.pfx_rex & REX_R) + insn->insn_reg += 8; + + return (0); +} + +static int +get_modrm_addr(struct x86_insn *insn, struct vcpu_reg_state *vrs) +{ + uint8_t mod, rm; + vaddr_t addr = 0x0UL; + + if (insn == NULL || vrs == NULL) + return (-1); + + if (insn->insn_modrm_valid) { + rm = MODRM_RM(insn->insn_modrm); + mod = MODRM_MOD(insn->insn_modrm); + + switch (rm) { + case 0b000: + addr = vrs->vrs_gprs[VCPU_REGS_RAX]; + break; + case 0b001: + addr = vrs->vrs_gprs[VCPU_REGS_RCX]; + break; + case 0b010: + addr = vrs->vrs_gprs[VCPU_REGS_RDX]; + break; + case 0b011: + addr = vrs->vrs_gprs[VCPU_REGS_RBX]; + break; + case 0b100: + if (mod == 0b11) + addr = vrs->vrs_gprs[VCPU_REGS_RSP]; + break; + case 0b101: + if (mod != 0b00) + addr = vrs->vrs_gprs[VCPU_REGS_RBP]; + break; + case 0b110: + addr = vrs->vrs_gprs[VCPU_REGS_RSI]; + break; + case 0b111: + addr = vrs->vrs_gprs[VCPU_REGS_RDI]; + break; + } + + insn->insn_gva = addr; + } + + return (0); +} + +static enum decode_result +decode_disp(struct x86_decode_state *state, struct x86_insn *insn) +{ + enum decode_result res = DECODE_ERROR; + uint64_t disp = 0; + + if (!is_valid_state(state, __func__) || insn == NULL) + return (DECODE_ERROR); + + if (!insn->insn_modrm_valid) + return (DECODE_ERROR); + + switch (MODRM_MOD(insn->insn_modrm)) { + case 0x00: + insn->insn_disp_type = DISP_0; + res = DECODE_MORE; + break; + case 0x01: + insn->insn_disp_type = DISP_1; + res = next_value(state, 1, &disp); + if (res == DECODE_ERROR) + return (res); + insn->insn_disp = disp; + break; + case 0x02: + if (insn->insn_prefix.pfx_group4 == LEG_4_ADDRSZ) { + insn->insn_disp_type = DISP_2; + res = next_value(state, 2, &disp); + } else { + insn->insn_disp_type = DISP_4; + res = next_value(state, 4, &disp); + } + if (res == DECODE_ERROR) + return (res); + insn->insn_disp = disp; + break; + default: + insn->insn_disp_type = DISP_NONE; + res = DECODE_MORE; + } + + return (res); +} + +static enum decode_result +decode_opcode(struct x86_decode_state *state, struct x86_insn *insn) +{ + enum decode_result res; + enum x86_opcode_type type; + enum x86_operand_enc enc; + struct x86_opcode *opcode = &insn->insn_opcode; + uint8_t byte, byte2; + + if (!is_valid_state(state, __func__) || insn == NULL) + return (-1); + + memset(opcode, 0, sizeof(*opcode)); + + res = next_byte(state, &byte); + if (res == DECODE_ERROR) + return (res); + + type = x86_1byte_opcode_tbl[byte]; + switch(type) { + case OP_UNKNOWN: + case OP_UNSUPPORTED: + log_warnx("%s: unsupported opcode", __func__); + return (DECODE_ERROR); + + case OP_TWO_BYTE: + res = next_byte(state, &byte2); + if (res == DECODE_ERROR) + return (res); + + type = x86_2byte_opcode_tbl[byte2]; + if (type == OP_UNKNOWN || type == OP_UNSUPPORTED) { + log_warnx("%s: unsupported 2-byte opcode", __func__); + return 
(DECODE_ERROR); + } + + opcode->op_bytes[0] = byte; + opcode->op_bytes[1] = byte2; + opcode->op_bytes_len = 2; + enc = x86_2byte_operand_enc_table[byte2]; + break; + + default: + /* We've potentially got a known 1-byte opcode. */ + opcode->op_bytes[0] = byte; + opcode->op_bytes_len = 1; + enc = x86_1byte_operand_enc_tbl[byte]; + } + + if (enc == OP_ENC_UNKNOWN) + return (DECODE_ERROR); + + opcode->op_type = type; + opcode->op_encoding = enc; + + return (res); +} + +static enum decode_result +decode_sib(struct x86_decode_state *state, struct x86_insn *insn) +{ + enum decode_result res; + uint8_t byte; + + if (!is_valid_state(state, __func__) || insn == NULL) + return (-1); + + /* SIB is optional, so assume we will be continuing. */ + res = DECODE_MORE; + + insn->insn_sib_valid = 0; + if (!insn->insn_modrm_valid) + return (res); + + /* XXX is SIB valid in all cpu modes? */ + if (MODRM_RM(insn->insn_modrm) == 0b100) { + res = next_byte(state, &byte); + if (res != DECODE_ERROR) { + insn->insn_sib_valid = 1; + insn->insn_sib = byte; + } + } + + return (res); +} + +static enum decode_result +decode_imm(struct x86_decode_state *state, struct x86_insn *insn) +{ + enum decode_result res; + size_t num_bytes; + uint64_t value; + + if (!is_valid_state(state, __func__) || insn == NULL) + return (DECODE_ERROR); + + /* Only handle MI encoded instructions. Others shouldn't need assist. */ + if (insn->insn_opcode.op_encoding != OP_ENC_MI) + return (DECODE_DONE); + + /* Exceptions related to MOV instructions. */ + if (insn->insn_opcode.op_type == OP_MOV) { + switch (insn->insn_opcode.op_bytes[0]) { + case 0xC6: + num_bytes = 1; + break; + case 0xC7: + if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL) + num_bytes = 2; + else + num_bytes = 4; + break; + default: + log_warnx("%s: cannot decode immediate bytes for MOV", + __func__); + return (DECODE_ERROR); + } + } else { + /* Fallback to interpreting based on cpu mode and REX. */ + if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL) + num_bytes = 2; + else if (insn->insn_prefix.pfx_rex == REX_NONE) + num_bytes = 4; + else + num_bytes = 8; + } + + res = next_value(state, num_bytes, &value); + if (res != DECODE_ERROR) { + insn->insn_immediate = value; + insn->insn_immediate_len = num_bytes; + } + + return (res); +} + + +/* + * insn_decode + * + * Decode an x86 instruction from the provided instruction bytes. + * + * Return values: + * 0: successful decode + * Non-zero: an exception occurred during decode + */ +int +insn_decode(struct vm_exit *exit, struct x86_insn *insn) +{ + enum decode_result res; + struct vcpu_reg_state *vrs = &exit->vrs; + struct x86_decode_state state; + uint8_t *bytes, len; + int mode; + + if (exit == NULL || insn == NULL) { + log_warnx("%s: invalid input", __func__); + return (DECODE_ERROR); + } + + bytes = exit->vee.vee_insn_bytes; + len = exit->vee.vee_insn_len; + + /* 0. Initialize state and instruction objects. */ + memset(insn, 0, sizeof(*insn)); + memset(&state, 0, sizeof(state)); + state.s_len = len; + memcpy(&state.s_bytes, bytes, len); + + /* 1. Detect CPU mode. */ + mode = detect_cpu_mode(vrs); + if (mode == VMM_CPU_MODE_UNKNOWN) { + log_warnx("%s: failed to identify cpu mode", __func__); +#ifdef MMIO_DEBUG + dump_regs(vrs); +#endif + return (-1); + } + insn->insn_cpu_mode = mode; + +#ifdef MMIO_DEBUG + log_info("%s: cpu mode %s detected", __progname, str_cpu_mode(mode)); + printf("%s: got bytes: [ ", __progname); + for (int i = 0; i < len; i++) { + printf("%02x ", bytes[i]); + } + printf("]\n"); +#endif + /* 2. Decode prefixes. 
*/ + res = decode_prefix(&state, insn); + if (res == DECODE_ERROR) { + log_warnx("%s: error decoding prefixes", __func__); + goto err; + } else if (res == DECODE_DONE) + goto done; + +#ifdef MMIO_DEBUG + log_info("%s: prefixes {g1: 0x%02x, g2: 0x%02x, g3: 0x%02x, g4: 0x%02x," + " rex: 0x%02x }", __progname, insn->insn_prefix.pfx_group1, + insn->insn_prefix.pfx_group2, insn->insn_prefix.pfx_group3, + insn->insn_prefix.pfx_group4, insn->insn_prefix.pfx_rex); +#endif + + /* 3. Pick apart opcode. Here we can start short-circuiting. */ + res = decode_opcode(&state, insn); + if (res == DECODE_ERROR) { + log_warnx("%s: error decoding opcode", __func__); + goto err; + } else if (res == DECODE_DONE) + goto done; + +#ifdef MMIO_DEBUG + log_info("%s: found opcode %s (operand encoding %s) (%s)", __progname, + str_opcode(&insn->insn_opcode), str_operand_enc(&insn->insn_opcode), + str_decode_res(res)); +#endif + + /* Process optional ModR/M byte. */ + res = decode_modrm(&state, insn); + if (res == DECODE_ERROR) { + log_warnx("%s: error decoding modrm", __func__); + goto err; + } + if (get_modrm_addr(insn, vrs) != 0) + goto err; + if (get_modrm_reg(insn) != 0) + goto err; + if (res == DECODE_DONE) + goto done; + +#ifdef MMIO_DEBUG + if (insn->insn_modrm_valid) + log_info("%s: found ModRM 0x%02x (%s)", __progname, + insn->insn_modrm, str_decode_res(res)); +#endif + + /* Process optional SIB byte. */ + res = decode_sib(&state, insn); + if (res == DECODE_ERROR) { + log_warnx("%s: error decoding sib", __func__); + goto err; + } else if (res == DECODE_DONE) + goto done; + +#ifdef MMIO_DEBUG + if (insn->insn_sib_valid) + log_info("%s: found SIB 0x%02x (%s)", __progname, + insn->insn_sib, str_decode_res(res)); +#endif + + /* Process any Displacement bytes. */ + res = decode_disp(&state, insn); + if (res == DECODE_ERROR) { + log_warnx("%s: error decoding displacement", __func__); + goto err; + } else if (res == DECODE_DONE) + goto done; + + /* Process any Immediate data bytes. */ + res = decode_imm(&state, insn); + if (res == DECODE_ERROR) { + log_warnx("%s: error decoding immediate bytes", __func__); + goto err; + } + +done: + insn->insn_bytes_len = state.s_idx; + +#ifdef MMIO_DEBUG + log_info("%s: final instruction length is %u", __func__, + insn->insn_bytes_len); + dump_insn(insn); + log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__, + MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm), + MODRM_RM(insn->insn_modrm)); + dump_regs(vrs); +#endif /* MMIO_DEBUG */ + return (0); + +err: +#ifdef MMIO_DEBUG + dump_insn(insn); + log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__, + MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm), + MODRM_RM(insn->insn_modrm)); + dump_regs(vrs); +#endif /* MMIO_DEBUG */ + return (-1); +} + +static int +emulate_mov(struct x86_insn *insn, struct vm_exit *exit) +{ + /* XXX Only supports read to register for now */ + if (insn->insn_opcode.op_encoding != OP_ENC_RM) + return (-1); + + /* XXX No device emulation yet. Fill with 0xFFs. */ + exit->vrs.vrs_gprs[insn->insn_reg] = 0xFFFFFFFFFFFFFFFF; + + return (0); +} + +static int +emulate_movzx(struct x86_insn *insn, struct vm_exit *exit) +{ + uint8_t byte, len, src = 1, dst = 2; + uint64_t value = 0; + + /* Only RM is valid for MOVZX. 
*/ + if (insn->insn_opcode.op_encoding != OP_ENC_RM) { + log_warnx("invalid op encoding for MOVZX: %d", + insn->insn_opcode.op_encoding); + return (-1); + } + + len = insn->insn_opcode.op_bytes_len; + if (len < 1 || len > sizeof(insn->insn_opcode.op_bytes)) { + log_warnx("invalid opcode byte length: %d", len); + return (-1); + } + + byte = insn->insn_opcode.op_bytes[len - 1]; + switch (byte) { + case 0xB6: + src = 1; + if (insn->insn_cpu_mode == VMM_CPU_MODE_PROT + || insn->insn_cpu_mode == VMM_CPU_MODE_REAL) + dst = 2; + else if (insn->insn_prefix.pfx_rex == REX_NONE) + dst = 4; + else // XXX validate CPU mode + dst = 8; + break; + case 0xB7: + src = 2; + if (insn->insn_prefix.pfx_rex == REX_NONE) + dst = 4; + else // XXX validate CPU mode + dst = 8; + break; + default: + log_warnx("invalid byte in MOVZX opcode: %x", byte); + return (-1); + } + + if (dst == 4) + exit->vrs.vrs_gprs[insn->insn_reg] &= 0xFFFFFFFF00000000; + else + exit->vrs.vrs_gprs[insn->insn_reg] = 0x0UL; + + /* XXX No device emulation yet. Fill with 0xFFs. */ + switch (src) { + case 1: value = 0xFF; break; + case 2: value = 0xFFFF; break; + case 4: value = 0xFFFFFFFF; break; + case 8: value = 0xFFFFFFFFFFFFFFFF; break; + default: + log_warnx("invalid source size: %d", src); + return (-1); + } + + exit->vrs.vrs_gprs[insn->insn_reg] |= value; + + return (0); +} + +/* + * insn_emulate + * + * Returns: + * 0: success + * EINVAL: exception occurred + * EFAULT: page fault occurred, requires retry + * ENOTSUP: an unsupported instruction was provided + */ +int +insn_emulate(struct vm_exit *exit, struct x86_insn *insn) +{ + int res; + + switch (insn->insn_opcode.op_type) { + case OP_MOV: + res = emulate_mov(insn, exit); + break; + + case OP_MOVZX: + res = emulate_movzx(insn, exit); + break; + + default: + log_warnx("%s: emulation not defined for %s", __func__, + str_opcode(&insn->insn_opcode)); + res = ENOTSUP; + } + + if (res == 0) + exit->vrs.vrs_gprs[VCPU_REGS_RIP] += insn->insn_bytes_len; + + return (res); +} diff --git a/usr.sbin/vmd/x86_vm.c b/usr.sbin/vmd/x86_vm.c new file mode 100644 index 00000000000..c6c4b2a9ed7 --- /dev/null +++ b/usr.sbin/vmd/x86_vm.c @@ -0,0 +1,1373 @@ +/* $OpenBSD: x86_vm.c,v 1.1 2024/07/10 10:41:19 dv Exp $ */ +/* + * Copyright (c) 2015 Mike Larkin + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "atomicio.h" +#include "fw_cfg.h" +#include "i8253.h" +#include "i8259.h" +#include "loadfile.h" +#include "mc146818.h" +#include "ns8250.h" +#include "pci.h" +#include "virtio.h" + +typedef uint8_t (*io_fn_t)(struct vm_run_params *); + +#define MAX_PORTS 65536 + +io_fn_t ioports_map[MAX_PORTS]; +extern char *__progname; + +void create_memory_map(struct vm_create_params *); +int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int); + +static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, + size_t); +static int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); +static int vcpu_exit_eptviolation(struct vm_run_params *); +static void vcpu_exit_inout(struct vm_run_params *); + +extern struct vmd_vm *current_vm; +extern int con_fd; + +/* + * Represents a standard register set for an OS to be booted + * as a flat 64 bit address space. + * + * NOT set here are: + * RIP + * RSP + * GDTR BASE + * + * Specific bootloaders should clone this structure and override + * those fields as needed. + * + * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on + * features of the CPU in use. + */ +static const struct vcpu_reg_state vcpu_init_flat64 = { + .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, + .vrs_gprs[VCPU_REGS_RIP] = 0x0, + .vrs_gprs[VCPU_REGS_RSP] = 0x0, + .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG, + .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, + .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE, + .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL, + .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL, + .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL, + .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL, + .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, + .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, + .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, + .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, + .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, + .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, + .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA, + .vrs_drs[VCPU_REGS_DR0] = 0x0, + .vrs_drs[VCPU_REGS_DR1] = 0x0, + .vrs_drs[VCPU_REGS_DR2] = 0x0, + .vrs_drs[VCPU_REGS_DR3] = 0x0, + .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, + .vrs_drs[VCPU_REGS_DR7] = 0x400, + .vrs_msrs[VCPU_REGS_STAR] = 0ULL, + .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, + .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, + .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, + .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, + .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL, + .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 +}; + +/* + * Represents a standard register set for an BIOS to be booted + * as a flat 16 bit address space. 
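+ *
+ * Note: with this layout, CS has base 0xF0000 and RIP is 0xFFF0, so the
+ * first instruction is fetched from physical address 0xF0000 + 0xFFF0 =
+ * 0xFFFF0, i.e. 16 bytes below 1MB. That lines up with loadfile_bios()
+ * below, which places the BIOS image so it ends exactly at the 1MB
+ * boundary.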
+ */ +static const struct vcpu_reg_state vcpu_init_flat16 = { + .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, + .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0, + .vrs_gprs[VCPU_REGS_RSP] = 0x0, + .vrs_crs[VCPU_REGS_CR0] = 0x60000010, + .vrs_crs[VCPU_REGS_CR3] = 0, + .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, + .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0}, + .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0}, + .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0}, + .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0}, + .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0}, + .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, + .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, + .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, + .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, + .vrs_msrs[VCPU_REGS_EFER] = 0ULL, + .vrs_drs[VCPU_REGS_DR0] = 0x0, + .vrs_drs[VCPU_REGS_DR1] = 0x0, + .vrs_drs[VCPU_REGS_DR2] = 0x0, + .vrs_drs[VCPU_REGS_DR3] = 0x0, + .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, + .vrs_drs[VCPU_REGS_DR7] = 0x400, + .vrs_msrs[VCPU_REGS_STAR] = 0ULL, + .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, + .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, + .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, + .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, + .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 +}; + +/* + * create_memory_map + * + * Sets up the guest physical memory ranges that the VM can access. + * + * Parameters: + * vcp: VM create parameters describing the VM whose memory map + * is being created + * + * Return values: + * nothing + */ +void +create_memory_map(struct vm_create_params *vcp) +{ + size_t len, mem_bytes; + size_t above_1m = 0, above_4g = 0; + + mem_bytes = vcp->vcp_memranges[0].vmr_size; + vcp->vcp_nmemranges = 0; + if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE) + return; + + /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ + len = LOWMEM_KB * 1024; + vcp->vcp_memranges[0].vmr_gpa = 0x0; + vcp->vcp_memranges[0].vmr_size = len; + vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM; + mem_bytes -= len; + + /* + * Second memory region: LOWMEM_KB - 1MB. + * + * N.B. - Normally ROMs or parts of video RAM are mapped here. + * We have to add this region, because some systems + * unconditionally write to 0xb8000 (VGA RAM), and + * we need to make sure that vmm(4) permits accesses + * to it. So allocate guest memory for it. + */ + len = MB(1) - (LOWMEM_KB * 1024); + vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; + vcp->vcp_memranges[1].vmr_size = len; + vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED; + mem_bytes -= len; + + /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ + if (mem_bytes <= MB(2)) { + vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; + vcp->vcp_memranges[2].vmr_size = MB(2); + vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED; + vcp->vcp_nmemranges = 3; + return; + } + + /* + * Calculate the how to split any remaining memory across the 4GB + * boundary while making sure we do not place physical memory into + * MMIO ranges. 
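+	 *
+	 * For example, a guest with 4GB of RAM reaches this point with
+	 * mem_bytes equal to 4GB - 1MB, which exceeds
+	 * VMM_PCI_MMIO_BAR_BASE - 1MB (0xEFF00000), so the low portion
+	 * spans from 1MB up to the MMIO hole at 0xF0000000 and the
+	 * remaining 256MB is placed starting at 4GB.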
+ */ + if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) { + above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1); + above_4g = mem_bytes - above_1m; + } else { + above_1m = mem_bytes; + above_4g = 0; + } + + /* Third memory region: area above 1MB to MMIO region */ + vcp->vcp_memranges[2].vmr_gpa = MB(1); + vcp->vcp_memranges[2].vmr_size = above_1m; + vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM; + + /* Fourth region: PCI MMIO range */ + vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE; + vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END - + VMM_PCI_MMIO_BAR_BASE + 1; + vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO; + + /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */ + vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; + vcp->vcp_memranges[4].vmr_size = MB(2); + vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED; + + /* Sixth region: any remainder above 4GB */ + if (above_4g > 0) { + vcp->vcp_memranges[5].vmr_gpa = GB(4); + vcp->vcp_memranges[5].vmr_size = above_4g; + vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM; + vcp->vcp_nmemranges = 6; + } else + vcp->vcp_nmemranges = 5; +} + +int +load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs) +{ + int ret; + gzFile fp; + struct stat sb; + + /* + * Set up default "flat 64 bit" register state - RIP, RSP, and + * GDT info will be set in bootloader + */ + memcpy(vrs, &vcpu_init_flat64, sizeof(*vrs)); + + /* Find and open kernel image */ + if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) + fatalx("failed to open kernel - exiting"); + + /* Load kernel image */ + ret = loadfile_elf(fp, vm, vrs, vm->vm_params.vmc_bootdevice); + + /* + * Try BIOS as a fallback (only if it was provided as an image + * with vm->vm_kernel and the file is not compressed) + */ + if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && + gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) + ret = loadfile_bios(fp, sb.st_size, vrs); + + gzclose(fp); + + return (ret); +} + + +/* + * loadfile_bios + * + * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image + * directly into memory. 
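+ * For example, a 256KB BIOS image is copied to 0xC0000-0xFFFFF so that it
+ * ends at the 1MB boundary, and a second copy is written so that it ends
+ * at 4GB (0xFFFC0000-0xFFFFFFFF), covering the processor reset vector.
+ *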
+ * + * Parameters: + * fp: file of a kernel file to load + * size: uncompressed size of the image + * (out) vrs: register state to set on init for this kernel + * + * Return values: + * 0 if successful + * various error codes returned from read(2) or loadelf functions + */ +int +loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) +{ + off_t off; + + /* Set up a "flat 16 bit" register state for BIOS */ + memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); + + /* Seek to the beginning of the BIOS image */ + if (gzseek(fp, 0, SEEK_SET) == -1) + return (-1); + + /* The BIOS image must end at 1MB */ + if ((off = MB(1) - size) < 0) + return (-1); + + /* Read BIOS image into memory */ + if (mread(fp, off, size) != (size_t)size) { + errno = EIO; + return (-1); + } + + if (gzseek(fp, 0, SEEK_SET) == -1) + return (-1); + + /* Read a second BIOS copy into memory ending at 4GB */ + off = GB(4) - size; + if (mread(fp, off, size) != (size_t)size) { + errno = EIO; + return (-1); + } + + log_debug("%s: loaded BIOS image", __func__); + + return (0); +} + +/* + * init_emulated_hw + * + * Initializes the userspace hardware emulation + */ +void +init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, + int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) +{ + struct vm_create_params *vcp = &vmc->vmc_params; + size_t i; + uint64_t memlo, memhi; + + /* Calculate memory size for NVRAM registers */ + memlo = memhi = 0; + for (i = 0; i < vcp->vcp_nmemranges; i++) { + if (vcp->vcp_memranges[i].vmr_gpa == MB(1) && + vcp->vcp_memranges[i].vmr_size > (15 * MB(1))) + memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1)); + else if (vcp->vcp_memranges[i].vmr_gpa == GB(4)) + memhi = vcp->vcp_memranges[i].vmr_size; + } + + /* Reset the IO port map */ + memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); + + /* Init i8253 PIT */ + i8253_init(vcp->vcp_id); + ioports_map[TIMER_CTRL] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; + ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc; + + /* Init mc146818 RTC */ + mc146818_init(vcp->vcp_id, memlo, memhi); + ioports_map[IO_RTC] = vcpu_exit_mc146818; + ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; + + /* Init master and slave PICs */ + i8259_init(); + ioports_map[IO_ICU1] = vcpu_exit_i8259; + ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; + ioports_map[IO_ICU2] = vcpu_exit_i8259; + ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; + ioports_map[ELCR0] = vcpu_exit_elcr; + ioports_map[ELCR1] = vcpu_exit_elcr; + + /* Init ns8250 UART */ + ns8250_init(con_fd, vcp->vcp_id); + for (i = COM1_DATA; i <= COM1_SCR; i++) + ioports_map[i] = vcpu_exit_com; + + /* Initialize PCI */ + for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) + ioports_map[i] = vcpu_exit_pci; + + ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; + pci_init(); + + /* Initialize virtio devices */ + virtio_init(current_vm, child_cdrom, child_disks, child_taps); + + /* + * Init QEMU fw_cfg interface. Must be done last for pci hardware + * detection. 
+ */ + fw_cfg_init(vmc); + ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; + ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; + ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; + ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; +} + +/* + * restore_emulated_hw + * + * Restores the userspace hardware emulation from fd + */ +void +restore_emulated_hw(struct vm_create_params *vcp, int fd, + int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) +{ + /* struct vm_create_params *vcp = &vmc->vmc_params; */ + int i; + memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); + + /* Init i8253 PIT */ + i8253_restore(fd, vcp->vcp_id); + ioports_map[TIMER_CTRL] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; + ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; + + /* Init master and slave PICs */ + i8259_restore(fd); + ioports_map[IO_ICU1] = vcpu_exit_i8259; + ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; + ioports_map[IO_ICU2] = vcpu_exit_i8259; + ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; + + /* Init ns8250 UART */ + ns8250_restore(fd, con_fd, vcp->vcp_id); + for (i = COM1_DATA; i <= COM1_SCR; i++) + ioports_map[i] = vcpu_exit_com; + + /* Init mc146818 RTC */ + mc146818_restore(fd, vcp->vcp_id); + ioports_map[IO_RTC] = vcpu_exit_mc146818; + ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; + + /* Init QEMU fw_cfg interface */ + fw_cfg_restore(fd); + ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; + ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; + ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; + ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; + + /* Initialize PCI */ + for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) + ioports_map[i] = vcpu_exit_pci; + + ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; + ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; + pci_restore(fd); + virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); +} + +void +pause_vm_md(struct vmd_vm *vm) +{ + i8253_stop(); + mc146818_stop(); + ns8250_stop(); + virtio_stop(vm); +} + +void +unpause_vm_md(struct vmd_vm *vm) +{ + i8253_start(); + mc146818_start(); + ns8250_start(); + virtio_start(vm); +} + +int +dump_devs(int fd) +{ + int ret = 0; + + if ((ret = i8253_dump(fd))) + return ret; + if ((ret = i8259_dump(fd))) + return ret; + if ((ret = ns8250_dump(fd))) + return ret; + if ((ret = mc146818_dump(fd))) + return ret; + ret = fw_cfg_dump(fd); + + return ret; +} + +int +dump_send_header(int fd) { + struct vm_dump_header vmh; + int i; + + memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE, + sizeof(vmh.vmh_signature)); + + vmh.vmh_cpuids[0].code = 0x00; + vmh.vmh_cpuids[0].leaf = 0x00; + + vmh.vmh_cpuids[1].code = 0x01; + vmh.vmh_cpuids[1].leaf = 0x00; + + vmh.vmh_cpuids[2].code = 0x07; + vmh.vmh_cpuids[2].leaf = 0x00; + + vmh.vmh_cpuids[3].code = 0x0d; + vmh.vmh_cpuids[3].leaf = 0x00; + + vmh.vmh_cpuids[4].code = 0x80000001; + vmh.vmh_cpuids[4].leaf = 0x00; + + vmh.vmh_version = VM_DUMP_VERSION; + + for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { + CPUID_LEAF(vmh.vmh_cpuids[i].code, + vmh.vmh_cpuids[i].leaf, + vmh.vmh_cpuids[i].a, + vmh.vmh_cpuids[i].b, + vmh.vmh_cpuids[i].c, + vmh.vmh_cpuids[i].d); + } + + if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) + return (-1); + + return 
(0); +} + + +/* + * vcpu_exit_inout + * + * Handle all I/O exits that need to be emulated in vmd. This includes the + * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + */ +void +vcpu_exit_inout(struct vm_run_params *vrp) +{ + struct vm_exit *vei = vrp->vrp_exit; + uint8_t intr = 0xFF; + + if (vei->vei.vei_rep || vei->vei.vei_string) { +#ifdef MMIO_DEBUG + log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x", + __func__, + vei->vei.vei_rep == 0 ? "" : "REP ", + vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT", + vei->vei.vei_string == 0 ? "" : "S", + vei->vei.vei_size, vei->vei.vei_encoding, + vei->vei.vei_data, vei->vei.vei_port); + log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx", + __func__, + vei->vrs.vrs_gprs[VCPU_REGS_RCX], + vei->vrs.vrs_gprs[VCPU_REGS_RDX], + vei->vrs.vrs_gprs[VCPU_REGS_RSI]); +#endif /* MMIO_DEBUG */ + fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)", + __func__); + } + + if (ioports_map[vei->vei.vei_port] != NULL) + intr = ioports_map[vei->vei.vei_port](vrp); + else if (vei->vei.vei_dir == VEI_DIR_IN) + set_return_data(vei, 0xFFFFFFFF); + + vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len; + + if (intr != 0xFF) + vcpu_assert_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); +} + +/* + * vcpu_exit + * + * Handle a vcpu exit. This function is called when it is determined that + * vmm(4) requires the assistance of vmd to support a particular guest + * exit type (eg, accessing an I/O port or device). Guest state is contained + * in 'vrp', and will be resent to vmm(4) on exit completion. + * + * Upon conclusion of handling the exit, the function determines if any + * interrupts should be injected into the guest, and asserts the proper + * IRQ line whose interrupt should be vectored. + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + * + * Return values: + * 0: the exit was handled successfully + * 1: an error occurred (eg, unknown exit reason passed in 'vrp') + */ +int +vcpu_exit(struct vm_run_params *vrp) +{ + int ret; + + switch (vrp->vrp_exit_reason) { + case VMX_EXIT_INT_WINDOW: + case SVM_VMEXIT_VINTR: + case VMX_EXIT_CPUID: + case VMX_EXIT_EXTINT: + case SVM_VMEXIT_INTR: + case SVM_VMEXIT_MSR: + case SVM_VMEXIT_CPUID: + /* + * We may be exiting to vmd to handle a pending interrupt but + * at the same time the last exit type may have been one of + * these. In this case, there's nothing extra to be done + * here (and falling through to the default case below results + * in more vmd log spam). + */ + break; + case SVM_VMEXIT_NPF: + case VMX_EXIT_EPT_VIOLATION: + ret = vcpu_exit_eptviolation(vrp); + if (ret) + return (ret); + break; + case VMX_EXIT_IO: + case SVM_VMEXIT_IOIO: + vcpu_exit_inout(vrp); + break; + case VMX_EXIT_HLT: + case SVM_VMEXIT_HLT: + vcpu_halt(vrp->vrp_vcpu_id); + break; + case VMX_EXIT_TRIPLE_FAULT: + case SVM_VMEXIT_SHUTDOWN: + /* reset VM */ + return (EAGAIN); + default: + log_debug("%s: unknown exit reason 0x%x", + __progname, vrp->vrp_exit_reason); + } + + return (0); +} + +/* + * vcpu_exit_eptviolation + * + * handle an EPT Violation + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + * + * Return values: + * 0: no action required + * EFAULT: a protection fault occured, kill the vm. 
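+ *
+ * Note: the MMIO-assist path below (currently compiled out behind
+ * MMIO_NOTYET) can additionally return EINVAL, e.g. when the faulting
+ * instruction crosses a page boundary; it decodes and emulates the
+ * access via insn_decode()/insn_emulate() from x86_mmio.c.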
+ */ +static int +vcpu_exit_eptviolation(struct vm_run_params *vrp) +{ + struct vm_exit *ve = vrp->vrp_exit; + int ret = 0; +#if MMIO_NOTYET + struct x86_insn insn; + uint64_t va, pa; + size_t len = 15; /* Max instruction length in x86. */ +#endif /* MMIO_NOTYET */ + switch (ve->vee.vee_fault_type) { + case VEE_FAULT_HANDLED: + log_debug("%s: fault already handled", __func__); + break; + +#if MMIO_NOTYET + case VEE_FAULT_MMIO_ASSIST: + /* Intel VMX might give us the length of the instruction. */ + if (ve->vee.vee_insn_info & VEE_LEN_VALID) + len = ve->vee.vee_insn_len; + + if (len > 15) + fatalx("%s: invalid instruction length %lu", __func__, + len); + + /* If we weren't given instruction bytes, we need to fetch. */ + if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) { + memset(ve->vee.vee_insn_bytes, 0, + sizeof(ve->vee.vee_insn_bytes)); + va = ve->vrs.vrs_gprs[VCPU_REGS_RIP]; + + /* XXX Only support instructions that fit on 1 page. */ + if ((va & PAGE_MASK) + len > PAGE_SIZE) { + log_warnx("%s: instruction might cross page " + "boundary", __func__); + ret = EINVAL; + break; + } + + ret = translate_gva(ve, va, &pa, PROT_EXEC); + if (ret != 0) { + log_warnx("%s: failed gva translation", + __func__); + break; + } + + ret = read_mem(pa, ve->vee.vee_insn_bytes, len); + if (ret != 0) { + log_warnx("%s: failed to fetch instruction " + "bytes from 0x%llx", __func__, pa); + break; + } + } + + ret = insn_decode(ve, &insn); + if (ret == 0) + ret = insn_emulate(ve, &insn); + break; +#endif /* MMIO_NOTYET */ + + case VEE_FAULT_PROTECT: + log_debug("%s: EPT Violation: rip=0x%llx", __progname, + ve->vrs.vrs_gprs[VCPU_REGS_RIP]); + ret = EFAULT; + break; + + default: + fatalx("%s: invalid fault_type %d", __progname, + ve->vee.vee_fault_type); + /* UNREACHED */ + } + + return (ret); +} + +/* + * vcpu_exit_pci + * + * Handle all I/O to the emulated PCI subsystem. + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + * + * Return value: + * Interrupt to inject to the guest VM, or 0xFF if no interrupt should + * be injected. + */ +uint8_t +vcpu_exit_pci(struct vm_run_params *vrp) +{ + struct vm_exit *vei = vrp->vrp_exit; + uint8_t intr; + + intr = 0xFF; + + switch (vei->vei.vei_port) { + case PCI_MODE1_ADDRESS_REG: + pci_handle_address_reg(vrp); + break; + case PCI_MODE1_DATA_REG: + case PCI_MODE1_DATA_REG + 1: + case PCI_MODE1_DATA_REG + 2: + case PCI_MODE1_DATA_REG + 3: + pci_handle_data_reg(vrp); + break; + case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END: + intr = pci_handle_io(vrp); + break; + default: + log_warnx("%s: unknown PCI register 0x%llx", + __progname, (uint64_t)vei->vei.vei_port); + break; + } + + return (intr); +} + +/* + * find_gpa_range + * + * Search for a contiguous guest physical mem range. + * + * Parameters: + * vcp: VM create parameters that contain the memory map to search in + * gpa: the starting guest physical address + * len: the length of the memory range + * + * Return values: + * NULL: on failure if there is no memory range as described by the parameters + * Pointer to vm_mem_range that contains the start of the range otherwise. + */ +static struct vm_mem_range * +find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) +{ + size_t i, n; + struct vm_mem_range *vmr; + + /* Find the first vm_mem_range that contains gpa */ + for (i = 0; i < vcp->vcp_nmemranges; i++) { + vmr = &vcp->vcp_memranges[i]; + if (gpa < vmr->vmr_gpa + vmr->vmr_size) + break; + } + + /* No range found. 
*/ + if (i == vcp->vcp_nmemranges) + return (NULL); + + /* + * vmr may cover the range [gpa, gpa + len) only partly. Make + * sure that the following vm_mem_ranges are contiguous and + * cover the rest. + */ + n = vmr->vmr_size - (gpa - vmr->vmr_gpa); + if (len < n) + len = 0; + else + len -= n; + gpa = vmr->vmr_gpa + vmr->vmr_size; + for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { + vmr = &vcp->vcp_memranges[i]; + if (gpa != vmr->vmr_gpa) + return (NULL); + if (len <= vmr->vmr_size) + len = 0; + else + len -= vmr->vmr_size; + + gpa = vmr->vmr_gpa + vmr->vmr_size; + } + + if (len != 0) + return (NULL); + + return (vmr); +} +/* + * write_mem + * + * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. + * + * Parameters: + * dst: the destination paddr_t in the guest VM + * buf: data to copy (or NULL to zero the data) + * len: number of bytes to copy + * + * Return values: + * 0: success + * EINVAL: if the guest physical memory range [dst, dst + len) does not + * exist in the guest. + */ +int +write_mem(paddr_t dst, const void *buf, size_t len) +{ + const char *from = buf; + char *to; + size_t n, off; + struct vm_mem_range *vmr; + + vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, dst, len); + if (vmr == NULL) { + errno = EINVAL; + log_warn("%s: failed - invalid memory range dst = 0x%lx, " + "len = 0x%zx", __func__, dst, len); + return (EINVAL); + } + + off = dst - vmr->vmr_gpa; + while (len != 0) { + n = vmr->vmr_size - off; + if (len < n) + n = len; + + to = (char *)vmr->vmr_va + off; + if (buf == NULL) + memset(to, 0, n); + else { + memcpy(to, from, n); + from += n; + } + len -= n; + off = 0; + vmr++; + } + + return (0); +} + +/* + * read_mem + * + * Reads memory at guest paddr 'src' into 'buf'. + * + * Parameters: + * src: the source paddr_t in the guest VM to read from. + * buf: destination (local) buffer + * len: number of bytes to read + * + * Return values: + * 0: success + * EINVAL: if the guest physical memory range [dst, dst + len) does not + * exist in the guest. + */ +int +read_mem(paddr_t src, void *buf, size_t len) +{ + char *from, *to = buf; + size_t n, off; + struct vm_mem_range *vmr; + + vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, src, len); + if (vmr == NULL) { + errno = EINVAL; + log_warn("%s: failed - invalid memory range src = 0x%lx, " + "len = 0x%zx", __func__, src, len); + return (EINVAL); + } + + off = src - vmr->vmr_gpa; + while (len != 0) { + n = vmr->vmr_size - off; + if (len < n) + n = len; + + from = (char *)vmr->vmr_va + off; + memcpy(to, from, n); + + to += n; + len -= n; + off = 0; + vmr++; + } + + return (0); +} + +/* + * hvaddr_mem + * + * Translate a guest physical address to a host virtual address, checking the + * provided memory range length to confirm it's contiguous within the same + * guest memory range (vm_mem_range). 
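+ *
+ * A minimal usage sketch (the surrounding names are hypothetical):
+ *
+ *	uint32_t *csr = hvaddr_mem(bar_gpa + reg_off, sizeof(*csr));
+ *	if (csr == NULL)
+ *		return (errno);	/* EFAULT or EINVAL, see below */
+ *	*csr = val;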
+/*
+ * hvaddr_mem
+ *
+ * Translate a guest physical address to a host virtual address, checking the
+ * provided memory range length to confirm it's contiguous within the same
+ * guest memory range (vm_mem_range).
+ *
+ * Parameters:
+ *  gpa: guest physical address to translate
+ *  len: number of bytes in the intended range
+ *
+ * Return values:
+ *  void* to host virtual memory on success
+ *  NULL on error, setting errno to:
+ *   EFAULT: gpa falls outside guest memory ranges
+ *   EINVAL: requested len extends beyond memory range
+ */
+void *
+hvaddr_mem(paddr_t gpa, size_t len)
+{
+	struct vm_mem_range *vmr;
+	size_t off;
+
+	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
+	if (vmr == NULL) {
+		log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
+		errno = EFAULT;
+		return (NULL);
+	}
+
+	off = gpa - vmr->vmr_gpa;
+	if (len > (vmr->vmr_size - off)) {
+		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
+		    "len=%zu", __func__, gpa, len);
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	return ((char *)vmr->vmr_va + off);
+}
+
+/*
+ * vcpu_assert_irq
+ *
+ * Injects the specified IRQ on the supplied vcpu/vm
+ *
+ * Parameters:
+ *  vm_id: VM ID to inject to
+ *  vcpu_id: VCPU ID to inject to
+ *  irq: IRQ to inject
+ */
+void
+vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+	i8259_assert_irq(irq);
+
+	if (i8259_is_pending()) {
+		if (vcpu_intr(vm_id, vcpu_id, 1))
+			fatalx("%s: can't assert INTR", __func__);
+
+		vcpu_unhalt(vcpu_id);
+		vcpu_signal_run(vcpu_id);
+	}
+}
+
+/*
+ * vcpu_deassert_irq
+ *
+ * Clears the specified IRQ on the supplied vcpu/vm
+ *
+ * Parameters:
+ *  vm_id: VM ID to clear in
+ *  vcpu_id: VCPU ID to clear in
+ *  irq: IRQ to clear
+ */
+void
+vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+	i8259_deassert_irq(irq);
+
+	if (!i8259_is_pending()) {
+		if (vcpu_intr(vm_id, vcpu_id, 0))
+			fatalx("%s: can't deassert INTR for vm_id %d, "
+			    "vcpu_id %d", __func__, vm_id, vcpu_id);
+	}
+}
+/*
+ * set_return_data
+ *
+ * Utility function for manipulating register data in vm exit info structs. This
+ * function ensures that the data is copied to the vei->vei.vei_data field with
+ * the proper size for the operation being performed.
+ *
+ * Parameters:
+ *  vei: exit information
+ *  data: return data
+ */
+void
+set_return_data(struct vm_exit *vei, uint32_t data)
+{
+	switch (vei->vei.vei_size) {
+	case 1:
+		vei->vei.vei_data &= ~0xFF;
+		vei->vei.vei_data |= (uint8_t)data;
+		break;
+	case 2:
+		vei->vei.vei_data &= ~0xFFFF;
+		vei->vei.vei_data |= (uint16_t)data;
+		break;
+	case 4:
+		vei->vei.vei_data = data;
+		break;
+	}
+}
+
+/*
+ * get_input_data
+ *
+ * Utility function for manipulating register data in vm exit info
+ * structs. This function ensures that the data is copied from the
+ * vei->vei.vei_data field with the proper size for the operation being
+ * performed.
+ *
+ * Parameters:
+ *  vei: exit information
+ *  data: location to store the result
+ */
+void
+get_input_data(struct vm_exit *vei, uint32_t *data)
+{
+	switch (vei->vei.vei_size) {
+	case 1:
+		*data &= 0xFFFFFF00;
+		*data |= (uint8_t)vei->vei.vei_data;
+		break;
+	case 2:
+		*data &= 0xFFFF0000;
+		*data |= (uint16_t)vei->vei.vei_data;
+		break;
+	case 4:
+		*data = vei->vei.vei_data;
+		break;
+	default:
+		log_warnx("%s: invalid i/o size %d", __func__,
+		    vei->vei.vei_size);
+	}
+
+}
+
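/*
 * Illustrative sketch, not part of this commit: set_return_data() and
 * get_input_data() above only touch the low 1, 2, or 4 bytes of the exit's
 * data word, so a byte- or word-sized port access never clobbers the upper
 * bits of the destination register.  The standalone snippet shows the same
 * masking on a plain uint32_t; merge_low_bytes() is a made-up name and does
 * not use the real struct vm_exit.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
merge_low_bytes(uint32_t old, uint32_t data, int size)
{
	switch (size) {
	case 1:
		return ((old & ~0xFFU) | (uint8_t)data);	/* low byte only */
	case 2:
		return ((old & ~0xFFFFU) | (uint16_t)data);	/* low word only */
	default:
		return (data);					/* full 32 bits */
	}
}

int
main(void)
{
	uint32_t reg = 0x11223344;

	/* A 1-byte result of 0xAB only replaces the low byte. */
	printf("0x%08x\n", (unsigned)merge_low_bytes(reg, 0xAB, 1));	/* 0x112233ab */
	/* A 2-byte result replaces the low 16 bits. */
	printf("0x%08x\n", (unsigned)merge_low_bytes(reg, 0xBEEF, 2));	/* 0x1122beef */
	return (0);
}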
+/*
+ * translate_gva
+ *
+ * Translates a guest virtual address to a guest physical address by walking
+ * the currently active page table (if needed).
+ *
+ * XXX ensure translate_gva updates the A bit in the PTE
+ * XXX ensure translate_gva respects segment base and limits in i386 mode
+ * XXX ensure translate_gva respects segment wraparound in i8086 mode
+ * XXX ensure translate_gva updates the A bit in the segment selector
+ * XXX ensure translate_gva respects CR4.LMSLE if available
+ *
+ * Parameters:
+ *  exit: The VCPU this translation should be performed for (guest MMU settings
+ *   are gathered from this VCPU)
+ *  va: virtual address to translate
+ *  pa: pointer to paddr_t variable that will receive the translated physical
+ *   address. 'pa' is unchanged on error.
+ *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
+ *   the address should be translated
+ *
+ * Return values:
+ *  0: the address was successfully translated - 'pa' contains the physical
+ *      address currently mapped by 'va'.
+ *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
+ *      and %cr2 set in the vcpu structure.
+ *  EINVAL: an error occurred reading paging table structures
+ */
+int
+translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
+{
+	int level, shift, pdidx;
+	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
+	uint64_t shift_width, pte_size;
+	struct vcpu_reg_state *vrs;
+
+	vrs = &exit->vrs;
+
+	if (!pa)
+		return (EINVAL);
+
+	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
+		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
+		*pa = va;
+		return (0);
+	}
+
+	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
+
+	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
+	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
+
+	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
+		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
+			pte_size = sizeof(uint64_t);
+			shift_width = 9;
+
+			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
+				/* 4 level paging */
+				level = 4;
+				mask = L4_MASK;
+				shift = L4_SHIFT;
+			} else {
+				/* 32 bit with PAE paging */
+				level = 3;
+				mask = L3_MASK;
+				shift = L3_SHIFT;
+			}
+		} else {
+			/* 32 bit paging */
+			level = 2;
+			shift_width = 10;
+			mask = 0xFFC00000;
+			shift = 22;
+			pte_size = sizeof(uint32_t);
+		}
+	} else
+		return (EINVAL);
+
+	/* XXX: Check for R bit in segment selector and set A bit */
+
+	for (;level > 0; level--) {
+		pdidx = (va & mask) >> shift;
+		pte_paddr = (pt_paddr) + (pdidx * pte_size);
+
+		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
+		    level, pte_paddr);
+		if (read_mem(pte_paddr, &pte, pte_size)) {
+			log_warn("%s: failed to read pte", __func__);
+			return (EFAULT);
+		}
+
+		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
+		    pte);
+
+		/* XXX: Set CR2 */
+		if (!(pte & PG_V))
+			return (EFAULT);
+
+		/* XXX: Check for SMAP */
+		if ((mode == PROT_WRITE) && !(pte & PG_RW))
+			return (EPERM);
+
+		if ((exit->cpl > 0) && !(pte & PG_u))
+			return (EPERM);
+
+		pte = pte | PG_U;
+		if (mode == PROT_WRITE)
+			pte = pte | PG_M;
+		if (write_mem(pte_paddr, &pte, pte_size)) {
+			log_warn("%s: failed to write back flags to pte",
+			    __func__);
+			return (EIO);
+		}
+
+		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
+		if (pte & PG_PS)
+			break;
+
+		if (level > 1) {
+			pt_paddr = pte & PG_FRAME;
+			shift -= shift_width;
+			mask = mask >> shift_width;
+		}
+	}
+
+	low_mask = (1 << shift) - 1;
+	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
+	*pa = (pte & high_mask) | (va & low_mask);
+
+	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
+
+	return (0);
+}
+
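/*
 * Illustrative sketch, not part of this commit: in long mode translate_gva()
 * consumes one 9-bit index per level (shift_width == 9), starting at bit 39
 * and ending at the 4 KiB page offset.  The standalone snippet below pulls
 * those indices out of a virtual address; the sample address and output are
 * illustrative only and none of this is vmd(8) API.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t va = 0x00007f1234567abcULL;
	int level, shift;

	/* Level 4 index sits at bits 47..39, level 3 at 38..30, and so on. */
	for (level = 4, shift = 39; level > 0; level--, shift -= 9)
		printf("level %d index %llu\n", level,
		    (unsigned long long)((va >> shift) & 0x1ff));

	/* Whatever remains below bit 12 is the offset into the 4 KiB page. */
	printf("page offset 0x%llx\n", (unsigned long long)(va & 0xfff));
	return (0);
}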
+int
+intr_pending(struct vmd_vm *vm)
+{
+	/* XXX select active interrupt controller */
+	return i8259_is_pending();
+}
+
+int
+intr_ack(struct vmd_vm *vm)
+{
+	/* XXX select active interrupt controller */
+	return i8259_ack();
+}
+
+void
+intr_toggle_el(struct vmd_vm *vm, int irq, int val)
+{
+	/* XXX select active interrupt controller */
+	pic_set_elcr(irq, val);
+}
+
+int
+vmd_check_vmh(struct vm_dump_header *vmh)
+{
+	int i;
+	unsigned int code, leaf;
+	unsigned int a, b, c, d;
+
+	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE,
+	    strlen(VM_DUMP_SIGNATURE)) != 0) {
+		log_warnx("%s: incompatible dump signature", __func__);
+		return (-1);
+	}
+
+	if (vmh->vmh_version != VM_DUMP_VERSION) {
+		log_warnx("%s: incompatible dump version", __func__);
+		return (-1);
+	}
+
+	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
+		code = vmh->vmh_cpuids[i].code;
+		leaf = vmh->vmh_cpuids[i].leaf;
+		if (leaf != 0x00) {
+			log_debug("%s: invalid leaf 0x%x for code 0x%x",
+			    __func__, leaf, code);
+			return (-1);
+		}
+
+		switch (code) {
+		case 0x00:
+			CPUID_LEAF(code, leaf, a, b, c, d);
+			if (vmh->vmh_cpuids[i].a > a) {
+				log_debug("%s: incompatible cpuid level",
+				    __func__);
+				return (-1);
+			}
+			if (!(vmh->vmh_cpuids[i].b == b &&
+			    vmh->vmh_cpuids[i].c == c &&
+			    vmh->vmh_cpuids[i].d == d)) {
+				log_debug("%s: incompatible cpu brand",
+				    __func__);
+				return (-1);
+			}
+			break;
+
+		case 0x01:
+			CPUID_LEAF(code, leaf, a, b, c, d);
+			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
+			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
+				log_debug("%s: incompatible cpu features "
+				    "code: 0x%x leaf: 0x%x reg: c", __func__,
+				    code, leaf);
+				return (-1);
+			}
+			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
+			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
+				log_debug("%s: incompatible cpu features "
+				    "code: 0x%x leaf: 0x%x reg: d", __func__,
+				    code, leaf);
+				return (-1);
+			}
+			break;
+
+		case 0x07:
+			CPUID_LEAF(code, leaf, a, b, c, d);
+			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
+			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
+				log_debug("%s: incompatible cpu features "
+				    "code: 0x%x leaf: 0x%x reg: b", __func__,
+				    code, leaf);
+				return (-1);
+			}
+			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
+			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
+				log_debug("%s: incompatible cpu features "
+				    "code: 0x%x leaf: 0x%x reg: c", __func__,
+				    code, leaf);
+				return (-1);
+			}
+			break;
+
+		case 0x0d:
+			CPUID_LEAF(code, leaf, a, b, c, d);
+			if (vmh->vmh_cpuids[i].b > b) {
+				log_debug("%s: incompatible cpu: insufficient "
+				    "max save area for enabled XCR0 features",
+				    __func__);
+				return (-1);
+			}
+			if (vmh->vmh_cpuids[i].c > c) {
+				log_debug("%s: incompatible cpu: insufficient "
+				    "max save area for supported XCR0 features",
+				    __func__);
+				return (-1);
+			}
+			break;
+
+		case 0x80000001:
+			CPUID_LEAF(code, leaf, a, b, c, d);
+			if ((vmh->vmh_cpuids[i].a & a) !=
+			    vmh->vmh_cpuids[i].a) {
+				log_debug("%s: incompatible cpu features "
+				    "code: 0x%x leaf: 0x%x reg: a", __func__,
+				    code, leaf);
+				return (-1);
+			}
+			if ((vmh->vmh_cpuids[i].c & c) !=
+			    vmh->vmh_cpuids[i].c) {
+				log_debug("%s: incompatible cpu features "
+				    "code: 0x%x leaf: 0x%x reg: c", __func__,
+				    code, leaf);
+				return (-1);
+			}
+			if ((vmh->vmh_cpuids[i].d & d) !=
+			    vmh->vmh_cpuids[i].d) {
+				log_debug("%s: incompatible cpu features "
+				    "code: 0x%x leaf: 0x%x reg: d", __func__,
+				    code, leaf);
+				return (-1);
+			}
+			break;
+
+		default:
+			log_debug("%s: unknown code 0x%x", __func__, code);
+			return (-1);
+		}
+	}
+
+	return (0);
+}
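/*
 * Illustrative sketch, not part of this commit: the per-register CPUID checks
 * in vmd_check_vmh() boil down to a subset test - every feature bit recorded
 * in the dump header must still be present on the restoring host, i.e.
 * (saved & host) == saved (leaves 0x01 and 0x07 apply a mask first).  The
 * standalone snippet shows that test on made-up bitmasks; features_compatible()
 * is not a vmd(8) function.
 */
#include <stdint.h>
#include <stdio.h>

static int
features_compatible(uint32_t saved, uint32_t host)
{
	/* Every bit set in 'saved' must also be set in 'host'. */
	return ((saved & host) == saved);
}

int
main(void)
{
	uint32_t host = 0x0000001f;	/* host advertises feature bits 0-4 */

	printf("%d\n", features_compatible(0x00000013, host));	/* 1: subset of host bits */
	printf("%d\n", features_compatible(0x00000120, host));	/* 0: bit 8 missing on host */
	return (0);
}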