Missed some files in previous commit to split vmd into mi/md.
author dv <dv@openbsd.org>
Wed, 10 Jul 2024 10:41:19 +0000 (10:41 +0000)
committer dv <dv@openbsd.org>
Wed, 10 Jul 2024 10:41:19 +0000 (10:41 +0000)
Forgot `cvs add` and sys/dev/vmm/vmm.h changes.

sys/arch/arm64/include/vmmvar.h [new file with mode: 0644]
sys/dev/vmm/vmm.h
usr.sbin/vmd/arm64_vm.c [new file with mode: 0644]
usr.sbin/vmd/x86_mmio.c [new file with mode: 0644]
usr.sbin/vmd/x86_vm.c [new file with mode: 0644]

diff --git a/sys/arch/arm64/include/vmmvar.h b/sys/arch/arm64/include/vmmvar.h
new file mode 100644 (file)
index 0000000..76afc5c
--- /dev/null
@@ -0,0 +1,91 @@
+/*     $OpenBSD: vmmvar.h,v 1.1 2024/07/10 10:41:19 dv Exp $   */
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * CPU capabilities for VMM operation
+ */
+#ifndef _MACHINE_VMMVAR_H_
+#define _MACHINE_VMMVAR_H_
+
+#define VMM_HV_SIGNATURE       "OpenBSDVMM58"
+
+#define VMM_PCI_MMIO_BAR_BASE  0xF0000000ULL
+#define VMM_PCI_MMIO_BAR_END   0xFFDFFFFFULL           /* 2 MiB below 4 GiB */
+
+/* Exit Reasons */
+#define VM_EXIT_TERMINATED                     0xFFFE
+#define VM_EXIT_NONE                           0xFFFF
+
+struct vmm_softc_md {
+       /* Capabilities */
+       uint32_t                nr_cpus;        /* [I] */
+};
+
+/*
+ * struct vcpu_inject_event    : describes an exception or interrupt to inject.
+ */
+struct vcpu_inject_event {
+       uint8_t         vie_vector;     /* Exception or interrupt vector. */
+       uint32_t        vie_errorcode;  /* Optional error code. */
+       uint8_t         vie_type;
+#define VCPU_INJECT_NONE       0
+#define VCPU_INJECT_INTR       1       /* External hardware interrupt. */
+#define VCPU_INJECT_EX         2       /* HW or SW Exception */
+#define VCPU_INJECT_NMI                3       /* Non-maskable Interrupt */
+};
+
+#define VCPU_REGS_NGPRS                31
+
+struct vcpu_reg_state {
+       uint64_t                        vrs_gprs[VCPU_REGS_NGPRS];
+};
+
+/*
+ * struct vm_exit
+ *
+ * Contains VM exit information communicated to vmd(8). This information is
+ * gathered by vmm(4) from the CPU on each exit that requires help from vmd.
+ */
+struct vm_exit {
+       struct vcpu_reg_state           vrs;
+};
+
+struct vm_intr_params {
+       /* Input parameters to VMM_IOC_INTR */
+       uint32_t                vip_vm_id;
+       uint32_t                vip_vcpu_id;
+       uint16_t                vip_intr;
+};
+
+#define VM_RWREGS_GPRS 0x1     /* read/write GPRs */
+#define VM_RWREGS_ALL  (VM_RWREGS_GPRS)
+
+struct vm_rwregs_params {
+       /*
+        * Input/output parameters to VMM_IOC_READREGS /
+        * VMM_IOC_WRITEREGS
+        */
+       uint32_t                vrwp_vm_id;
+       uint32_t                vrwp_vcpu_id;
+       uint64_t                vrwp_mask;
+       struct vcpu_reg_state   vrwp_regs;
+};
+
+/* IOCTL definitions */
+#define VMM_IOC_INTR _IOW('V', 6, struct vm_intr_params) /* Intr pending */
+
+#endif /* ! _MACHINE_VMMVAR_H_ */
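
A minimal usage sketch, not part of this commit, of how vmd might assert a pending
interrupt through the VMM_IOC_INTR ioctl declared above. The helper name and the
already-opened vmm(4) descriptor are assumptions for illustration.

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <string.h>
    #include <err.h>
    #include <machine/vmmvar.h>

    static void
    assert_pending_intr(int vmm_fd, uint32_t vm_id, uint32_t vcpu_id, uint16_t intr)
    {
            struct vm_intr_params vip;

            memset(&vip, 0, sizeof(vip));
            vip.vip_vm_id = vm_id;          /* ids assigned at VM creation */
            vip.vip_vcpu_id = vcpu_id;
            vip.vip_intr = intr;            /* pending interrupt value */

            if (ioctl(vmm_fd, VMM_IOC_INTR, &vip) == -1)
                    warn("VMM_IOC_INTR");
    }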
diff --git a/sys/dev/vmm/vmm.h b/sys/dev/vmm/vmm.h
index 3ec2d20..ac682bc 100644 (file)
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm.h,v 1.5 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: vmm.h,v 1.6 2024/07/10 10:41:19 dv Exp $ */
 /*
  * Copyright (c) 2014-2023 Mike Larkin <mlarkin@openbsd.org>
  *
@@ -108,6 +108,20 @@ struct vm_run_params {
        uint8_t         vrp_irqready;           /* ready for IRQ on entry */
 };
 
+#define VM_RWVMPARAMS_PVCLOCK_SYSTEM_GPA 0x1   /* read/write pvclock gpa */
+#define VM_RWVMPARAMS_PVCLOCK_VERSION   0x2    /* read/write pvclock version */
+#define VM_RWVMPARAMS_ALL      (VM_RWVMPARAMS_PVCLOCK_SYSTEM_GPA | \
+    VM_RWVMPARAMS_PVCLOCK_VERSION)
+
+struct vm_rwvmparams_params {
+       /* Input parameters to VMM_IOC_READVMPARAMS/VMM_IOC_WRITEVMPARAMS */
+       uint32_t                vpp_vm_id;
+       uint32_t                vpp_vcpu_id;
+       uint32_t                vpp_mask;
+       paddr_t                 vpp_pvclock_system_gpa;
+       uint32_t                vpp_pvclock_version;
+};
+
 /* IOCTL definitions */
 #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
 #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
@@ -225,6 +239,7 @@ void vm_teardown(struct vm **);
 int vm_get_info(struct vm_info_params *);
 int vm_terminate(struct vm_terminate_params *);
 int vm_resetcpu(struct vm_resetcpu_params *);
+int vm_rwvmparams(struct vm_rwvmparams_params *, int);
 int vcpu_must_stop(struct vcpu *);
 int vm_share_mem(struct vm_sharemem_params *, struct proc *);
 int vm_run(struct vm_run_params *);
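
A brief userland sketch, not part of this commit, of persisting a guest's pvclock state
with the new struct vm_rwvmparams_params. VMM_IOC_WRITEVMPARAMS is assumed to be defined
elsewhere in this header, and the function and variable names are illustrative only.

    static void
    write_pvclock_gpa(int vmm_fd, uint32_t vm_id, uint32_t vcpu_id, paddr_t pvclock_gpa)
    {
            struct vm_rwvmparams_params vpp;

            memset(&vpp, 0, sizeof(vpp));
            vpp.vpp_vm_id = vm_id;
            vpp.vpp_vcpu_id = vcpu_id;
            /* Only update the pvclock system GPA; leave the version untouched. */
            vpp.vpp_mask = VM_RWVMPARAMS_PVCLOCK_SYSTEM_GPA;
            vpp.vpp_pvclock_system_gpa = pvclock_gpa;

            if (ioctl(vmm_fd, VMM_IOC_WRITEVMPARAMS, &vpp) == -1)
                    warn("VMM_IOC_WRITEVMPARAMS");
    }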
diff --git a/usr.sbin/vmd/arm64_vm.c b/usr.sbin/vmd/arm64_vm.c
new file mode 100644 (file)
index 0000000..282dbcb
--- /dev/null
@@ -0,0 +1,162 @@
+/*     $OpenBSD: arm64_vm.c,v 1.1 2024/07/10 10:41:19 dv Exp $ */
+/*
+ * Copyright (c) 2024 Dave Voutila <dv@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <sys/types.h>
+
+#include "vmd.h"
+
+void
+create_memory_map(struct vm_create_params *vcp)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+}
+
+int
+load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+void
+init_emulated_hw(struct vmop_create_params *vcp, int child_cdrom,
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+}
+
+void
+restore_emulated_hw(struct vm_create_params *vcp, int fd, int *child_taps,
+    int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+}
+
+void
+pause_vm_md(struct vmd_vm *vm)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+}
+
+void
+unpause_vm_md(struct vmd_vm *vm)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+}
+
+int
+dump_devs(int fd)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+int
+dump_send_header(int fd)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+void *
+hvaddr_mem(paddr_t gpa, size_t len)
+{      fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (NULL);
+}
+
+int
+write_mem(paddr_t dst, const void *buf, size_t len)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+int
+read_mem(paddr_t src, void *buf, size_t len)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+int
+intr_pending(struct vmd_vm *vm)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+void
+intr_toggle_el(struct vmd_vm *vm, int irq, int val)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+}
+
+int
+intr_ack(struct vmd_vm *vm)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+void
+vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+       fatalx("%s: unimplemented", __func__);
+}
+
+void
+vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+       fatalx("%s: unimplemented", __func__);
+}
+
+int
+vmd_check_vmh(struct vm_dump_header *vmh)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+int
+vcpu_exit(struct vm_run_params *vrp)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (-1);
+}
+
+uint8_t
+vcpu_exit_pci(struct vm_run_params *vrp)
+{
+       fatalx("%s: unimplemented", __func__);
+       /* NOTREACHED */
+       return (0xff);
+}
diff --git a/usr.sbin/vmd/x86_mmio.c b/usr.sbin/vmd/x86_mmio.c
new file mode 100644 (file)
index 0000000..381df30
--- /dev/null
@@ -0,0 +1,1045 @@
+/*     $OpenBSD: x86_mmio.c,v 1.1 2024/07/10 10:41:19 dv Exp $ */
+/*
+ * Copyright (c) 2022 Dave Voutila <dv@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <machine/specialreg.h>
+
+#include "vmd.h"
+#include "mmio.h"
+
+#define MMIO_DEBUG 0
+
+extern char *__progname;
+
+struct x86_decode_state {
+       uint8_t s_bytes[15];
+       size_t  s_len;
+       size_t  s_idx;
+};
+
+enum decode_result {
+       DECODE_ERROR = 0,       /* Something went wrong. */
+       DECODE_DONE,            /* Decode success and no more work needed. */
+       DECODE_MORE,            /* Decode success and more work required. */
+};
+
+static const char *str_cpu_mode(int);
+static const char *str_decode_res(enum decode_result);
+static const char *str_opcode(struct x86_opcode *);
+static const char *str_operand_enc(struct x86_opcode *);
+static const char *str_reg(int);
+static const char *str_sreg(int);
+static int detect_cpu_mode(struct vcpu_reg_state *);
+
+static enum decode_result decode_prefix(struct x86_decode_state *,
+    struct x86_insn *);
+static enum decode_result decode_opcode(struct x86_decode_state *,
+    struct x86_insn *);
+static enum decode_result decode_modrm(struct x86_decode_state *,
+    struct x86_insn *);
+static int get_modrm_reg(struct x86_insn *);
+static int get_modrm_addr(struct x86_insn *, struct vcpu_reg_state *vrs);
+static enum decode_result decode_disp(struct x86_decode_state *,
+    struct x86_insn *);
+static enum decode_result decode_sib(struct x86_decode_state *,
+    struct x86_insn *);
+static enum decode_result decode_imm(struct x86_decode_state *,
+    struct x86_insn *);
+
+static enum decode_result peek_byte(struct x86_decode_state *, uint8_t *);
+static enum decode_result next_byte(struct x86_decode_state *, uint8_t *);
+static enum decode_result next_value(struct x86_decode_state *, size_t,
+    uint64_t *);
+static int is_valid_state(struct x86_decode_state *, const char *);
+
+static int emulate_mov(struct x86_insn *, struct vm_exit *);
+static int emulate_movzx(struct x86_insn *, struct vm_exit *);
+
+/* Lookup table for 1-byte opcodes, in opcode alphabetical order. */
+const enum x86_opcode_type x86_1byte_opcode_tbl[255] = {
+       /* MOV */
+       [0x88] = OP_MOV,
+       [0x89] = OP_MOV,
+       [0x8A] = OP_MOV,
+       [0x8B] = OP_MOV,
+       [0x8C] = OP_MOV,
+       [0xA0] = OP_MOV,
+       [0xA1] = OP_MOV,
+       [0xA2] = OP_MOV,
+       [0xA3] = OP_MOV,
+
+       /* MOVS */
+       [0xA4] = OP_UNSUPPORTED,
+       [0xA5] = OP_UNSUPPORTED,
+
+       [ESCAPE] = OP_TWO_BYTE,
+};
+
+/* Lookup table for 1-byte operand encodings, in opcode alphabetical order. */
+const enum x86_operand_enc x86_1byte_operand_enc_tbl[255] = {
+       /* MOV */
+       [0x88] = OP_ENC_MR,
+       [0x89] = OP_ENC_MR,
+       [0x8A] = OP_ENC_RM,
+       [0x8B] = OP_ENC_RM,
+       [0x8C] = OP_ENC_MR,
+       [0xA0] = OP_ENC_FD,
+       [0xA1] = OP_ENC_FD,
+       [0xA2] = OP_ENC_TD,
+       [0xA3] = OP_ENC_TD,
+
+       /* MOVS */
+       [0xA4] = OP_ENC_ZO,
+       [0xA5] = OP_ENC_ZO,
+};
+
+const enum x86_opcode_type x86_2byte_opcode_tbl[255] = {
+       /* MOVZX */
+       [0xB6] = OP_MOVZX,
+       [0xB7] = OP_MOVZX,
+};
+
+const enum x86_operand_enc x86_2byte_operand_enc_table[255] = {
+       /* MOVZX */
+       [0xB6] = OP_ENC_RM,
+       [0xB7] = OP_ENC_RM,
+};
+
+/*
+ * peek_byte
+ *
+ * Fetch the next byte from the instruction bytes without advancing the
+ * position in the stream.
+ *
+ * Return values:
+ *  DECODE_DONE: byte was found and is the last in the stream
+ *  DECODE_MORE: byte was found and there are more remaining to be read
+ *  DECODE_ERROR: state is invalid and no byte was found, *byte left unchanged
+ */
+static enum decode_result
+peek_byte(struct x86_decode_state *state, uint8_t *byte)
+{
+       enum decode_result res;
+
+       if (state == NULL)
+               return (DECODE_ERROR);
+
+       if (state->s_idx == state->s_len)
+               return (DECODE_ERROR);
+
+       if (state->s_idx + 1 == state->s_len)
+               res = DECODE_DONE;
+       else
+               res = DECODE_MORE;
+
+       if (byte != NULL)
+               *byte = state->s_bytes[state->s_idx];
+       return (res);
+}
+
+/*
+ * next_byte
+ *
+ * Fetch the next byte from the instruction bytes, advancing the position in the
+ * stream and mutating decode state.
+ *
+ * Return values:
+ *  DECODE_DONE: byte was found and is the last in the stream
+ *  DECODE_MORE: byte was found and there are more remaining to be read
+ *  DECODE_ERROR: state is invalid and no byte was found, *byte left unchanged
+ */
+static enum decode_result
+next_byte(struct x86_decode_state *state, uint8_t *byte)
+{
+       uint8_t next;
+
+       /* Cheat and see if we're going to fail. */
+       if (peek_byte(state, &next) == DECODE_ERROR)
+               return (DECODE_ERROR);
+
+       if (byte != NULL)
+               *byte = next;
+       state->s_idx++;
+
+       return (state->s_idx < state->s_len ? DECODE_MORE : DECODE_DONE);
+}
+
+/*
+ * Fetch the next `n' bytes as a single uint64_t value.
+ */
+static enum decode_result
+next_value(struct x86_decode_state *state, size_t n, uint64_t *value)
+{
+       uint8_t bytes[8];
+       size_t i;
+       enum decode_result res;
+
+       if (value == NULL)
+               return (DECODE_ERROR);
+
+       if (n == 0 || n > sizeof(bytes))
+               return (DECODE_ERROR);
+
+       memset(bytes, 0, sizeof(bytes));
+       for (i = 0; i < n; i++)
+               if ((res = next_byte(state, &bytes[i])) == DECODE_ERROR)
+                       return (DECODE_ERROR);
+
+       *value = *((uint64_t*)bytes);
+
+       return (res);
+}
+
+/*
+ * is_valid_state
+ *
+ * Validate that the decode state looks viable.
+ *
+ * Returns:
+ *  1: if state is valid
+ *  0: if an invariant is violated
+ */
+static int
+is_valid_state(struct x86_decode_state *state, const char *fn_name)
+{
+       const char *s = (fn_name != NULL) ? fn_name : __func__;
+
+       if (state == NULL) {
+               log_warnx("%s: null state", s);
+               return (0);
+       }
+       if (state->s_len > sizeof(state->s_bytes)) {
+               log_warnx("%s: invalid length", s);
+               return (0);
+       }
+       if (state->s_idx + 1 > state->s_len) {
+               log_warnx("%s: invalid index", s);
+               return (0);
+       }
+
+       return (1);
+}
+
+#ifdef MMIO_DEBUG
+static void
+dump_regs(struct vcpu_reg_state *vrs)
+{
+       size_t i;
+       struct vcpu_segment_info *vsi;
+
+       for (i = 0; i < VCPU_REGS_NGPRS; i++)
+               log_info("%s: %s 0x%llx", __progname, str_reg(i),
+                   vrs->vrs_gprs[i]);
+
+       for (i = 0; i < VCPU_REGS_NSREGS; i++) {
+               vsi = &vrs->vrs_sregs[i];
+               log_info("%s: %s { sel: 0x%04x, lim: 0x%08x, ar: 0x%08x, "
+                   "base: 0x%llx }", __progname, str_sreg(i),
+                   vsi->vsi_sel, vsi->vsi_limit, vsi->vsi_ar, vsi->vsi_base);
+       }
+}
+
+static void
+dump_insn(struct x86_insn *insn)
+{
+       log_info("instruction { %s, enc=%s, len=%d, mod=0x%02x, ("
+           "reg=%s, addr=0x%lx) sib=0x%02x }",
+           str_opcode(&insn->insn_opcode),
+           str_operand_enc(&insn->insn_opcode), insn->insn_bytes_len,
+           insn->insn_modrm, str_reg(insn->insn_reg),
+           insn->insn_gva, insn->insn_sib);
+}
+#endif /* MMIO_DEBUG */
+
+static const char *
+str_cpu_mode(int mode)
+{
+       switch (mode) {
+       case VMM_CPU_MODE_REAL: return "REAL";
+       case VMM_CPU_MODE_PROT: return "PROT";
+       case VMM_CPU_MODE_PROT32: return "PROT32";
+       case VMM_CPU_MODE_COMPAT: return "COMPAT";
+       case VMM_CPU_MODE_LONG: return "LONG";
+       default: return "UNKNOWN";
+       }
+}
+
+__unused static const char *
+str_decode_res(enum decode_result res) {
+       switch (res) {
+       case DECODE_DONE: return "DONE";
+       case DECODE_MORE: return "MORE";
+       case DECODE_ERROR: return "ERROR";
+       default: return "UNKNOWN";
+       }
+}
+
+static const char *
+str_opcode(struct x86_opcode *opcode)
+{
+       switch (opcode->op_type) {
+       case OP_IN: return "IN";
+       case OP_INS: return "INS";
+       case OP_MOV: return "MOV";
+       case OP_MOVZX: return "MOVZX";
+       case OP_OUT: return "OUT";
+       case OP_OUTS: return "OUTS";
+       case OP_UNSUPPORTED: return "UNSUPPORTED";
+       default: return "UNKNOWN";
+       }
+}
+
+static const char *
+str_operand_enc(struct x86_opcode *opcode)
+{
+       switch (opcode->op_encoding) {
+       case OP_ENC_I: return "I";
+       case OP_ENC_MI: return "MI";
+       case OP_ENC_MR: return "MR";
+       case OP_ENC_RM: return "RM";
+       case OP_ENC_FD: return "FD";
+       case OP_ENC_TD: return "TD";
+       case OP_ENC_OI: return "OI";
+       case OP_ENC_ZO: return "ZO";
+       default: return "UNKNOWN";
+       }
+}
+
+static const char *
+str_reg(int reg) {
+       switch (reg) {
+       case VCPU_REGS_RAX: return "RAX";
+       case VCPU_REGS_RCX: return "RCX";
+       case VCPU_REGS_RDX: return "RDX";
+       case VCPU_REGS_RBX: return "RBX";
+       case VCPU_REGS_RSI: return "RSI";
+       case VCPU_REGS_RDI: return "RDI";
+       case VCPU_REGS_R8:  return " R8";
+       case VCPU_REGS_R9:  return " R9";
+       case VCPU_REGS_R10: return "R10";
+       case VCPU_REGS_R11: return "R11";
+       case VCPU_REGS_R12: return "R12";
+       case VCPU_REGS_R13: return "R13";
+       case VCPU_REGS_R14: return "R14";
+       case VCPU_REGS_R15: return "R15";
+       case VCPU_REGS_RSP: return "RSP";
+       case VCPU_REGS_RBP: return "RBP";
+       case VCPU_REGS_RIP: return "RIP";
+       case VCPU_REGS_RFLAGS: return "RFLAGS";
+       default: return "UNKNOWN";
+       }
+}
+
+static const char *
+str_sreg(int sreg) {
+       switch (sreg) {
+       case VCPU_REGS_CS: return "CS";
+       case VCPU_REGS_DS: return "DS";
+       case VCPU_REGS_ES: return "ES";
+       case VCPU_REGS_FS: return "FS";
+       case VCPU_REGS_GS: return "GS";
+       case VCPU_REGS_SS: return "SS";
+       case VCPU_REGS_LDTR: return "LDTR";
+       case VCPU_REGS_TR: return "TR";
+       default: return "UNKNOWN";
+       }
+}
+
+static int
+detect_cpu_mode(struct vcpu_reg_state *vrs)
+{
+       uint64_t cr0, cr4, cs, efer, rflags;
+
+       /* Is protected mode enabled? */
+       cr0 = vrs->vrs_crs[VCPU_REGS_CR0];
+       if (!(cr0 & CR0_PE))
+               return (VMM_CPU_MODE_REAL);
+
+       cr4 = vrs->vrs_crs[VCPU_REGS_CR4];
+       cs = vrs->vrs_sregs[VCPU_REGS_CS].vsi_ar;
+       efer = vrs->vrs_msrs[VCPU_REGS_EFER];
+       rflags = vrs->vrs_gprs[VCPU_REGS_RFLAGS];
+
+       /* Check for Long modes. */
+       if ((efer & EFER_LME) && (cr4 & CR4_PAE) && (cr0 & CR0_PG)) {
+               if (cs & CS_L) {
+                       /* Long Modes */
+                       if (!(cs & CS_D))
+                               return (VMM_CPU_MODE_LONG);
+                       log_warnx("%s: invalid cpu mode", __progname);
+                       return (VMM_CPU_MODE_UNKNOWN);
+               } else {
+                       /* Compatibility Modes */
+                       if (cs & CS_D) /* XXX Add Compat32 mode */
+                               return (VMM_CPU_MODE_UNKNOWN);
+                       return (VMM_CPU_MODE_COMPAT);
+               }
+       }
+
+       /* Check for 32-bit Protected Mode. */
+       if (cs & CS_D)
+               return (VMM_CPU_MODE_PROT32);
+
+       /* Check for virtual 8086 mode. */
+       if (rflags & EFLAGS_VM) {
+               /* XXX add Virtual8086 mode */
+               log_warnx("%s: Virtual 8086 mode", __progname);
+               return (VMM_CPU_MODE_UNKNOWN);
+       }
+
+       /* Can't determine mode. */
+       log_warnx("%s: invalid cpu mode", __progname);
+       return (VMM_CPU_MODE_UNKNOWN);
+}
+
+static enum decode_result
+decode_prefix(struct x86_decode_state *state, struct x86_insn *insn)
+{
+       enum decode_result res = DECODE_ERROR;
+       struct x86_prefix *prefix;
+       uint8_t byte;
+
+       if (!is_valid_state(state, __func__) || insn == NULL)
+               return (DECODE_ERROR);
+
+       prefix = &insn->insn_prefix;
+       memset(prefix, 0, sizeof(*prefix));
+
+       /*
+        * Decode prefixes. The last of its kind wins. The behavior is undefined
+        * in the Intel SDM (see Vol 2, 2.1.1 Instruction Prefixes.)
+        */
+       while ((res = peek_byte(state, &byte)) != DECODE_ERROR) {
+               switch (byte) {
+               case LEG_1_LOCK:
+               case LEG_1_REPNE:
+               case LEG_1_REP:
+                       prefix->pfx_group1 = byte;
+                       break;
+               case LEG_2_CS:
+               case LEG_2_SS:
+               case LEG_2_DS:
+               case LEG_2_ES:
+               case LEG_2_FS:
+               case LEG_2_GS:
+                       prefix->pfx_group2 = byte;
+                       break;
+               case LEG_3_OPSZ:
+                       prefix->pfx_group3 = byte;
+                       break;
+               case LEG_4_ADDRSZ:
+                       prefix->pfx_group4 = byte;
+                       break;
+               case REX_BASE...REX_BASE + 0x0F:
+                       if (insn->insn_cpu_mode == VMM_CPU_MODE_LONG)
+                               prefix->pfx_rex = byte;
+                       else /* INC encountered */
+                               return (DECODE_ERROR);
+                       break;
+               case VEX_2_BYTE:
+               case VEX_3_BYTE:
+                       log_warnx("%s: VEX not supported", __func__);
+                       return (DECODE_ERROR);
+               default:
+                       /* Something other than a valid prefix. */
+                       return (DECODE_MORE);
+               }
+               /* Advance our position. */
+               next_byte(state, NULL);
+       }
+
+       return (res);
+}
+
+static enum decode_result
+decode_modrm(struct x86_decode_state *state, struct x86_insn *insn)
+{
+       enum decode_result res;
+       uint8_t byte = 0;
+
+       if (!is_valid_state(state, __func__) || insn == NULL)
+               return (DECODE_ERROR);
+
+       insn->insn_modrm_valid = 0;
+
+       /* Check the operand encoding to see if we fetch a byte or abort. */
+       switch (insn->insn_opcode.op_encoding) {
+       case OP_ENC_MR:
+       case OP_ENC_RM:
+       case OP_ENC_MI:
+               res = next_byte(state, &byte);
+               if (res == DECODE_ERROR) {
+                       log_warnx("%s: failed to get modrm byte", __func__);
+                       break;
+               }
+               insn->insn_modrm = byte;
+               insn->insn_modrm_valid = 1;
+               break;
+
+       case OP_ENC_I:
+       case OP_ENC_OI:
+               log_warnx("%s: instruction does not need memory assist",
+                   __func__);
+               res = DECODE_ERROR;
+               break;
+
+       default:
+               /* Peek to see if we're done decoding. */
+               res = peek_byte(state, NULL);
+       }
+
+       return (res);
+}
+
+static int
+get_modrm_reg(struct x86_insn *insn)
+{
+       if (insn == NULL)
+               return (-1);
+
+       if (insn->insn_modrm_valid) {
+               switch (MODRM_REGOP(insn->insn_modrm)) {
+               case 0:
+                       insn->insn_reg = VCPU_REGS_RAX;
+                       break;
+               case 1:
+                       insn->insn_reg = VCPU_REGS_RCX;
+                       break;
+               case 2:
+                       insn->insn_reg = VCPU_REGS_RDX;
+                       break;
+               case 3:
+                       insn->insn_reg = VCPU_REGS_RBX;
+                       break;
+               case 4:
+                       insn->insn_reg = VCPU_REGS_RSP;
+                       break;
+               case 5:
+                       insn->insn_reg = VCPU_REGS_RBP;
+                       break;
+               case 6:
+                       insn->insn_reg = VCPU_REGS_RSI;
+                       break;
+               case 7:
+                       insn->insn_reg = VCPU_REGS_RDI;
+                       break;
+               }
+       }
+
+       /* REX R bit selects extended registers in LONG mode. */
+       if (insn->insn_prefix.pfx_rex & REX_R)
+               insn->insn_reg += 8;
+
+       return (0);
+}
+
+static int
+get_modrm_addr(struct x86_insn *insn, struct vcpu_reg_state *vrs)
+{
+       uint8_t mod, rm;
+       vaddr_t addr = 0x0UL;
+
+       if (insn == NULL || vrs == NULL)
+               return (-1);
+
+       if (insn->insn_modrm_valid) {
+               rm = MODRM_RM(insn->insn_modrm);
+               mod = MODRM_MOD(insn->insn_modrm);
+
+               switch (rm) {
+               case 0b000:
+                       addr = vrs->vrs_gprs[VCPU_REGS_RAX];
+                       break;
+               case 0b001:
+                       addr = vrs->vrs_gprs[VCPU_REGS_RCX];
+                       break;
+               case 0b010:
+                       addr = vrs->vrs_gprs[VCPU_REGS_RDX];
+                       break;
+               case 0b011:
+                       addr = vrs->vrs_gprs[VCPU_REGS_RBX];
+                       break;
+               case 0b100:
+                       if (mod == 0b11)
+                               addr = vrs->vrs_gprs[VCPU_REGS_RSP];
+                       break;
+               case 0b101:
+                       if (mod != 0b00)
+                               addr = vrs->vrs_gprs[VCPU_REGS_RBP];
+                       break;
+               case 0b110:
+                       addr = vrs->vrs_gprs[VCPU_REGS_RSI];
+                       break;
+               case 0b111:
+                       addr = vrs->vrs_gprs[VCPU_REGS_RDI];
+                       break;
+               }
+
+               insn->insn_gva = addr;
+       }
+
+       return (0);
+}
+
+static enum decode_result
+decode_disp(struct x86_decode_state *state, struct x86_insn *insn)
+{
+       enum decode_result res = DECODE_ERROR;
+       uint64_t disp = 0;
+
+       if (!is_valid_state(state, __func__) || insn == NULL)
+               return (DECODE_ERROR);
+
+       if (!insn->insn_modrm_valid)
+               return (DECODE_ERROR);
+
+       switch (MODRM_MOD(insn->insn_modrm)) {
+       case 0x00:
+               insn->insn_disp_type = DISP_0;
+               res = DECODE_MORE;
+               break;
+       case 0x01:
+               insn->insn_disp_type = DISP_1;
+               res = next_value(state, 1, &disp);
+               if (res == DECODE_ERROR)
+                       return (res);
+               insn->insn_disp = disp;
+               break;
+       case 0x02:
+               if (insn->insn_prefix.pfx_group4 == LEG_4_ADDRSZ) {
+                       insn->insn_disp_type = DISP_2;
+                       res = next_value(state, 2, &disp);
+               } else {
+                       insn->insn_disp_type = DISP_4;
+                       res = next_value(state, 4, &disp);
+               }
+               if (res == DECODE_ERROR)
+                       return (res);
+               insn->insn_disp = disp;
+               break;
+       default:
+               insn->insn_disp_type = DISP_NONE;
+               res = DECODE_MORE;
+       }
+
+       return (res);
+}
+
+static enum decode_result
+decode_opcode(struct x86_decode_state *state, struct x86_insn *insn)
+{
+       enum decode_result res;
+       enum x86_opcode_type type;
+       enum x86_operand_enc enc;
+       struct x86_opcode *opcode = &insn->insn_opcode;
+       uint8_t byte, byte2;
+
+       if (!is_valid_state(state, __func__) || insn == NULL)
+               return (DECODE_ERROR);
+
+       memset(opcode, 0, sizeof(*opcode));
+
+       res = next_byte(state, &byte);
+       if (res == DECODE_ERROR)
+               return (res);
+
+       type = x86_1byte_opcode_tbl[byte];
+       switch(type) {
+       case OP_UNKNOWN:
+       case OP_UNSUPPORTED:
+               log_warnx("%s: unsupported opcode", __func__);
+               return (DECODE_ERROR);
+
+       case OP_TWO_BYTE:
+               res = next_byte(state, &byte2);
+               if (res == DECODE_ERROR)
+                       return (res);
+
+               type = x86_2byte_opcode_tbl[byte2];
+               if (type == OP_UNKNOWN || type == OP_UNSUPPORTED) {
+                       log_warnx("%s: unsupported 2-byte opcode", __func__);
+                       return (DECODE_ERROR);
+               }
+
+               opcode->op_bytes[0] = byte;
+               opcode->op_bytes[1] = byte2;
+               opcode->op_bytes_len = 2;
+               enc = x86_2byte_operand_enc_table[byte2];
+               break;
+
+       default:
+               /* We've potentially got a known 1-byte opcode. */
+               opcode->op_bytes[0] = byte;
+               opcode->op_bytes_len = 1;
+               enc = x86_1byte_operand_enc_tbl[byte];
+       }
+
+       if (enc == OP_ENC_UNKNOWN)
+               return (DECODE_ERROR);
+
+       opcode->op_type = type;
+       opcode->op_encoding = enc;
+
+       return (res);
+}
+
+static enum decode_result
+decode_sib(struct x86_decode_state *state, struct x86_insn *insn)
+{
+       enum decode_result res;
+       uint8_t byte;
+
+       if (!is_valid_state(state, __func__) || insn == NULL)
+               return (DECODE_ERROR);
+
+       /* SIB is optional, so assume we will be continuing. */
+       res = DECODE_MORE;
+
+       insn->insn_sib_valid = 0;
+       if (!insn->insn_modrm_valid)
+               return (res);
+
+       /* XXX is SIB valid in all cpu modes? */
+       if (MODRM_RM(insn->insn_modrm) == 0b100) {
+               res = next_byte(state, &byte);
+               if (res != DECODE_ERROR) {
+                       insn->insn_sib_valid = 1;
+                       insn->insn_sib = byte;
+               }
+       }
+
+       return (res);
+}
+
+static enum decode_result
+decode_imm(struct x86_decode_state *state, struct x86_insn *insn)
+{
+       enum decode_result res;
+       size_t num_bytes;
+       uint64_t value;
+
+       if (!is_valid_state(state, __func__) || insn == NULL)
+               return (DECODE_ERROR);
+
+       /* Only handle MI encoded instructions. Others shouldn't need assist. */
+       if (insn->insn_opcode.op_encoding != OP_ENC_MI)
+               return (DECODE_DONE);
+
+       /* Exceptions related to MOV instructions. */
+       if (insn->insn_opcode.op_type == OP_MOV) {
+               switch (insn->insn_opcode.op_bytes[0]) {
+               case 0xC6:
+                       num_bytes = 1;
+                       break;
+               case 0xC7:
+                       if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL)
+                               num_bytes = 2;
+                       else
+                               num_bytes = 4;
+                       break;
+               default:
+                       log_warnx("%s: cannot decode immediate bytes for MOV",
+                           __func__);
+                       return (DECODE_ERROR);
+               }
+       } else {
+               /* Fallback to interpreting based on cpu mode and REX. */
+               if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL)
+                       num_bytes = 2;
+               else if (insn->insn_prefix.pfx_rex == REX_NONE)
+                       num_bytes = 4;
+               else
+                       num_bytes = 8;
+       }
+
+       res = next_value(state, num_bytes, &value);
+       if (res != DECODE_ERROR) {
+               insn->insn_immediate = value;
+               insn->insn_immediate_len = num_bytes;
+       }
+
+       return (res);
+}
+
+
+/*
+ * insn_decode
+ *
+ * Decode an x86 instruction from the provided instruction bytes.
+ *
+ * Return values:
+ *  0: successful decode
+ *  Non-zero: an exception occurred during decode
+ */
+int
+insn_decode(struct vm_exit *exit, struct x86_insn *insn)
+{
+       enum decode_result res;
+       struct vcpu_reg_state *vrs = &exit->vrs;
+       struct x86_decode_state state;
+       uint8_t *bytes, len;
+       int mode;
+
+       if (exit == NULL || insn == NULL) {
+               log_warnx("%s: invalid input", __func__);
+               return (-1);
+       }
+
+       bytes = exit->vee.vee_insn_bytes;
+       len = exit->vee.vee_insn_len;
+
+       /* 0. Initialize state and instruction objects. */
+       memset(insn, 0, sizeof(*insn));
+       memset(&state, 0, sizeof(state));
+       state.s_len = len;
+       memcpy(&state.s_bytes, bytes, len);
+
+       /* 1. Detect CPU mode. */
+       mode = detect_cpu_mode(vrs);
+       if (mode == VMM_CPU_MODE_UNKNOWN) {
+               log_warnx("%s: failed to identify cpu mode", __func__);
+#ifdef MMIO_DEBUG
+               dump_regs(vrs);
+#endif
+               return (-1);
+       }
+       insn->insn_cpu_mode = mode;
+
+#ifdef MMIO_DEBUG
+       log_info("%s: cpu mode %s detected", __progname, str_cpu_mode(mode));
+       printf("%s: got bytes: [ ", __progname);
+       for (int i = 0; i < len; i++) {
+               printf("%02x ", bytes[i]);
+       }
+       printf("]\n");
+#endif
+       /* 2. Decode prefixes. */
+       res = decode_prefix(&state, insn);
+       if (res == DECODE_ERROR) {
+               log_warnx("%s: error decoding prefixes", __func__);
+               goto err;
+       } else if (res == DECODE_DONE)
+               goto done;
+
+#ifdef MMIO_DEBUG
+       log_info("%s: prefixes {g1: 0x%02x, g2: 0x%02x, g3: 0x%02x, g4: 0x%02x,"
+           " rex: 0x%02x }", __progname, insn->insn_prefix.pfx_group1,
+           insn->insn_prefix.pfx_group2, insn->insn_prefix.pfx_group3,
+           insn->insn_prefix.pfx_group4, insn->insn_prefix.pfx_rex);
+#endif
+
+       /* 3. Pick apart opcode. Here we can start short-circuiting. */
+       res = decode_opcode(&state, insn);
+       if (res == DECODE_ERROR) {
+               log_warnx("%s: error decoding opcode", __func__);
+               goto err;
+       } else if (res == DECODE_DONE)
+               goto done;
+
+#ifdef MMIO_DEBUG
+       log_info("%s: found opcode %s (operand encoding %s) (%s)", __progname,
+           str_opcode(&insn->insn_opcode), str_operand_enc(&insn->insn_opcode),
+           str_decode_res(res));
+#endif
+
+       /* Process optional ModR/M byte. */
+       res = decode_modrm(&state, insn);
+       if (res == DECODE_ERROR) {
+               log_warnx("%s: error decoding modrm", __func__);
+               goto err;
+       }
+       if (get_modrm_addr(insn, vrs) != 0)
+               goto err;
+       if (get_modrm_reg(insn) != 0)
+               goto err;
+       if (res == DECODE_DONE)
+               goto done;
+
+#ifdef MMIO_DEBUG
+       if (insn->insn_modrm_valid)
+               log_info("%s: found ModRM 0x%02x (%s)", __progname,
+                   insn->insn_modrm, str_decode_res(res));
+#endif
+
+       /* Process optional SIB byte. */
+       res = decode_sib(&state, insn);
+       if (res == DECODE_ERROR) {
+               log_warnx("%s: error decoding sib", __func__);
+               goto err;
+       } else if (res == DECODE_DONE)
+               goto done;
+
+#ifdef MMIO_DEBUG
+       if (insn->insn_sib_valid)
+               log_info("%s: found SIB 0x%02x (%s)", __progname,
+                   insn->insn_sib, str_decode_res(res));
+#endif
+
+       /* Process any Displacement bytes. */
+       res = decode_disp(&state, insn);
+       if (res == DECODE_ERROR) {
+               log_warnx("%s: error decoding displacement", __func__);
+               goto err;
+       } else if (res == DECODE_DONE)
+               goto done;
+
+       /* Process any Immediate data bytes. */
+       res = decode_imm(&state, insn);
+       if (res == DECODE_ERROR) {
+               log_warnx("%s: error decoding immediate bytes", __func__);
+               goto err;
+       }
+
+done:
+       insn->insn_bytes_len = state.s_idx;
+
+#ifdef MMIO_DEBUG
+       log_info("%s: final instruction length is %u", __func__,
+               insn->insn_bytes_len);
+       dump_insn(insn);
+       log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__,
+           MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm),
+           MODRM_RM(insn->insn_modrm));
+       dump_regs(vrs);
+#endif /* MMIO_DEBUG */
+       return (0);
+
+err:
+#ifdef MMIO_DEBUG
+       dump_insn(insn);
+       log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__,
+           MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm),
+           MODRM_RM(insn->insn_modrm));
+       dump_regs(vrs);
+#endif /* MMIO_DEBUG */
+       return (-1);
+}
+
+static int
+emulate_mov(struct x86_insn *insn, struct vm_exit *exit)
+{
+       /* XXX Only supports read to register for now */
+       if (insn->insn_opcode.op_encoding != OP_ENC_RM)
+               return (-1);
+
+       /* XXX No device emulation yet. Fill with 0xFFs. */
+       exit->vrs.vrs_gprs[insn->insn_reg] = 0xFFFFFFFFFFFFFFFF;
+
+       return (0);
+}
+
+static int
+emulate_movzx(struct x86_insn *insn, struct vm_exit *exit)
+{
+       uint8_t byte, len, src = 1, dst = 2;
+       uint64_t value = 0;
+
+       /* Only RM is valid for MOVZX. */
+       if (insn->insn_opcode.op_encoding != OP_ENC_RM) {
+               log_warnx("invalid op encoding for MOVZX: %d",
+                   insn->insn_opcode.op_encoding);
+               return (-1);
+       }
+
+       len = insn->insn_opcode.op_bytes_len;
+       if (len < 1 || len > sizeof(insn->insn_opcode.op_bytes)) {
+               log_warnx("invalid opcode byte length: %d", len);
+               return (-1);
+       }
+
+       byte = insn->insn_opcode.op_bytes[len - 1];
+       switch (byte) {
+       case 0xB6:
+               src = 1;
+               if (insn->insn_cpu_mode == VMM_CPU_MODE_PROT
+                   || insn->insn_cpu_mode == VMM_CPU_MODE_REAL)
+                       dst = 2;
+               else if (insn->insn_prefix.pfx_rex == REX_NONE)
+                       dst = 4;
+               else /* XXX validate CPU mode */
+                       dst = 8;
+               break;
+       case 0xB7:
+               src = 2;
+               if (insn->insn_prefix.pfx_rex == REX_NONE)
+                       dst = 4;
+               else /* XXX validate CPU mode */
+                       dst = 8;
+               break;
+       default:
+               log_warnx("invalid byte in MOVZX opcode: %x", byte);
+               return (-1);
+       }
+
+       if (dst == 4)
+               exit->vrs.vrs_gprs[insn->insn_reg] &= 0xFFFFFFFF00000000;
+       else
+               exit->vrs.vrs_gprs[insn->insn_reg] = 0x0UL;
+
+       /* XXX No device emulation yet. Fill with 0xFFs. */
+       switch (src) {
+       case 1: value = 0xFF; break;
+       case 2: value = 0xFFFF; break;
+       case 4: value = 0xFFFFFFFF; break;
+       case 8: value = 0xFFFFFFFFFFFFFFFF; break;
+       default:
+               log_warnx("invalid source size: %d", src);
+               return (-1);
+       }
+
+       exit->vrs.vrs_gprs[insn->insn_reg] |= value;
+
+       return (0);
+}
+
+/*
+ * insn_emulate
+ *
+ * Returns:
+ *  0: success
+ *  EINVAL: exception occurred
+ *  EFAULT: page fault occurred, requires retry
+ *  ENOTSUP: an unsupported instruction was provided
+ */
+int
+insn_emulate(struct vm_exit *exit, struct x86_insn *insn)
+{
+       int res;
+
+       switch (insn->insn_opcode.op_type) {
+       case OP_MOV:
+               res = emulate_mov(insn, exit);
+               break;
+
+       case OP_MOVZX:
+               res = emulate_movzx(insn, exit);
+               break;
+
+       default:
+               log_warnx("%s: emulation not defined for %s", __func__,
+                   str_opcode(&insn->insn_opcode));
+               res = ENOTSUP;
+       }
+
+       if (res == 0)
+               exit->vrs.vrs_gprs[VCPU_REGS_RIP] += insn->insn_bytes_len;
+
+       return (res);
+}
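
A rough sketch, not part of this commit, of how a vcpu exit handler in vmd might chain
insn_decode() and insn_emulate() for an MMIO assist. The helper name, the vrp_exit member
access, and the simplified error handling are assumptions for illustration.

    static int
    handle_mmio_assist(struct vm_run_params *vrp)
    {
            struct vm_exit *exit = vrp->vrp_exit;
            struct x86_insn insn;

            /* Decode the faulting instruction from the exit state. */
            if (insn_decode(exit, &insn) != 0)
                    return (-1);

            /* On success, insn_emulate() also advances RIP past the instruction. */
            return (insn_emulate(exit, &insn));
    }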
diff --git a/usr.sbin/vmd/x86_vm.c b/usr.sbin/vmd/x86_vm.c
new file mode 100644 (file)
index 0000000..c6c4b2a
--- /dev/null
@@ -0,0 +1,1373 @@
+/*     $OpenBSD: x86_vm.c,v 1.1 2024/07/10 10:41:19 dv Exp $   */
+/*
+ * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <dev/ic/i8253reg.h>
+#include <dev/isa/isareg.h>
+
+#include <machine/psl.h>
+#include <machine/pte.h>
+#include <machine/specialreg.h>
+#include <machine/vmmvar.h>
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <zlib.h>
+
+#include "atomicio.h"
+#include "fw_cfg.h"
+#include "i8253.h"
+#include "i8259.h"
+#include "loadfile.h"
+#include "mc146818.h"
+#include "ns8250.h"
+#include "pci.h"
+#include "virtio.h"
+
+typedef uint8_t (*io_fn_t)(struct vm_run_params *);
+
+#define MAX_PORTS 65536
+
+io_fn_t        ioports_map[MAX_PORTS];
+extern char *__progname;
+
+void    create_memory_map(struct vm_create_params *);
+int     translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
+
+static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
+    size_t);
+static int     loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
+static int     vcpu_exit_eptviolation(struct vm_run_params *);
+static void    vcpu_exit_inout(struct vm_run_params *);
+
+extern struct vmd_vm   *current_vm;
+extern int              con_fd;
+
+/*
+ * Represents a standard register set for an OS to be booted
+ * as a flat 64 bit address space.
+ *
+ * NOT set here are:
+ *  RIP
+ *  RSP
+ *  GDTR BASE
+ *
+ * Specific bootloaders should clone this structure and override
+ * those fields as needed.
+ *
+ * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
+ *        features of the CPU in use.
+ */
+static const struct vcpu_reg_state vcpu_init_flat64 = {
+       .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
+       .vrs_gprs[VCPU_REGS_RIP] = 0x0,
+       .vrs_gprs[VCPU_REGS_RSP] = 0x0,
+       .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
+       .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
+       .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
+       .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
+       .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
+       .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
+       .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
+       .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
+       .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+       .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+       .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+       .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+       .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
+       .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
+       .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
+       .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
+       .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
+       .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
+       .vrs_drs[VCPU_REGS_DR0] = 0x0,
+       .vrs_drs[VCPU_REGS_DR1] = 0x0,
+       .vrs_drs[VCPU_REGS_DR2] = 0x0,
+       .vrs_drs[VCPU_REGS_DR3] = 0x0,
+       .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
+       .vrs_drs[VCPU_REGS_DR7] = 0x400,
+       .vrs_msrs[VCPU_REGS_STAR] = 0ULL,
+       .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
+       .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
+       .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
+       .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
+       .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
+       .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
+};
+
+/*
+ * Represents a standard register set for a BIOS to be booted
+ * as a flat 16 bit address space.
+ */
+static const struct vcpu_reg_state vcpu_init_flat16 = {
+       .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
+       .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
+       .vrs_gprs[VCPU_REGS_RSP] = 0x0,
+       .vrs_crs[VCPU_REGS_CR0] = 0x60000010,
+       .vrs_crs[VCPU_REGS_CR3] = 0,
+       .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
+       .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
+       .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
+       .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
+       .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
+       .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
+       .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
+       .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
+       .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
+       .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
+       .vrs_msrs[VCPU_REGS_EFER] = 0ULL,
+       .vrs_drs[VCPU_REGS_DR0] = 0x0,
+       .vrs_drs[VCPU_REGS_DR1] = 0x0,
+       .vrs_drs[VCPU_REGS_DR2] = 0x0,
+       .vrs_drs[VCPU_REGS_DR3] = 0x0,
+       .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
+       .vrs_drs[VCPU_REGS_DR7] = 0x400,
+       .vrs_msrs[VCPU_REGS_STAR] = 0ULL,
+       .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
+       .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
+       .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
+       .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
+       .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
+};
+
+/*
+ * create_memory_map
+ *
+ * Sets up the guest physical memory ranges that the VM can access.
+ *
+ * Parameters:
+ *  vcp: VM create parameters describing the VM whose memory map
+ *       is being created
+ *
+ * Return values:
+ *  nothing
+ */
+void
+create_memory_map(struct vm_create_params *vcp)
+{
+       size_t len, mem_bytes;
+       size_t above_1m = 0, above_4g = 0;
+
+       mem_bytes = vcp->vcp_memranges[0].vmr_size;
+       vcp->vcp_nmemranges = 0;
+       if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
+               return;
+
+       /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
+       len = LOWMEM_KB * 1024;
+       vcp->vcp_memranges[0].vmr_gpa = 0x0;
+       vcp->vcp_memranges[0].vmr_size = len;
+       vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
+       mem_bytes -= len;
+
+       /*
+        * Second memory region: LOWMEM_KB - 1MB.
+        *
+        * N.B. - Normally ROMs or parts of video RAM are mapped here.
+        * We have to add this region, because some systems
+        * unconditionally write to 0xb8000 (VGA RAM), and
+        * we need to make sure that vmm(4) permits accesses
+        * to it. So allocate guest memory for it.
+        */
+       len = MB(1) - (LOWMEM_KB * 1024);
+       vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
+       vcp->vcp_memranges[1].vmr_size = len;
+       vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
+       mem_bytes -= len;
+
+       /* If we have 2MB or less remaining, still create a 2nd BIOS area. */
+       if (mem_bytes <= MB(2)) {
+               vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
+               vcp->vcp_memranges[2].vmr_size = MB(2);
+               vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
+               vcp->vcp_nmemranges = 3;
+               return;
+       }
+
+       /*
+        * Calculate how to split any remaining memory across the 4GB
+        * boundary while making sure we do not place physical memory into
+        * MMIO ranges.
+        */
+       if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
+               above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
+               above_4g = mem_bytes - above_1m;
+       } else {
+               above_1m = mem_bytes;
+               above_4g = 0;
+       }
+
+       /* Third memory region: area above 1MB to MMIO region */
+       vcp->vcp_memranges[2].vmr_gpa = MB(1);
+       vcp->vcp_memranges[2].vmr_size = above_1m;
+       vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
+
+       /* Fourth region: PCI MMIO range */
+       vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
+       vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
+           VMM_PCI_MMIO_BAR_BASE + 1;
+       vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
+
+       /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
+       vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
+       vcp->vcp_memranges[4].vmr_size = MB(2);
+       vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
+
+       /* Sixth region: any remainder above 4GB */
+       if (above_4g > 0) {
+               vcp->vcp_memranges[5].vmr_gpa = GB(4);
+               vcp->vcp_memranges[5].vmr_size = above_4g;
+               vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
+               vcp->vcp_nmemranges = 6;
+       } else
+               vcp->vcp_nmemranges = 5;
+}
+
+int
+load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs)
+{
+       int             ret;
+       gzFile          fp;
+       struct stat     sb;
+
+       /*
+        * Set up default "flat 64 bit" register state - RIP, RSP, and
+        * GDT info will be set in bootloader
+        */
+       memcpy(vrs, &vcpu_init_flat64, sizeof(*vrs));
+
+       /* Find and open kernel image */
+       if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
+               fatalx("failed to open kernel - exiting");
+
+       /* Load kernel image */
+       ret = loadfile_elf(fp, vm, vrs, vm->vm_params.vmc_bootdevice);
+
+       /*
+        * Try BIOS as a fallback (only if it was provided as an image
+        * with vm->vm_kernel and the file is not compressed)
+        */
+       if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
+           gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
+               ret = loadfile_bios(fp, sb.st_size, vrs);
+
+       gzclose(fp);
+
+       return (ret);
+}
+
+
+/*
+ * loadfile_bios
+ *
+ * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
+ * directly into memory.
+ *
+ * Parameters:
+ *  fp: file handle of the BIOS image to load
+ *  size: uncompressed size of the image
+ *  (out) vrs: register state to set on init for this kernel
+ *
+ * Return values:
+ *  0 if successful
+ *  -1 if the image could not be read or placed into guest memory
+ */
+int
+loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
+{
+       off_t    off;
+
+       /* Set up a "flat 16 bit" register state for BIOS */
+       memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
+
+       /* Seek to the beginning of the BIOS image */
+       if (gzseek(fp, 0, SEEK_SET) == -1)
+               return (-1);
+
+       /* The BIOS image must end at 1MB */
+       if ((off = MB(1) - size) < 0)
+               return (-1);
+
+       /* Read BIOS image into memory */
+       if (mread(fp, off, size) != (size_t)size) {
+               errno = EIO;
+               return (-1);
+       }
+
+       if (gzseek(fp, 0, SEEK_SET) == -1)
+               return (-1);
+
+       /* Read a second BIOS copy into memory ending at 4GB */
+       off = GB(4) - size;
+       if (mread(fp, off, size) != (size_t)size) {
+               errno = EIO;
+               return (-1);
+       }
+
+       log_debug("%s: loaded BIOS image", __func__);
+
+       return (0);
+}
+
+/*
+ * init_emulated_hw
+ *
+ * Initializes the userspace hardware emulation
+ */
+void
+init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
+    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
+{
+       struct vm_create_params *vcp = &vmc->vmc_params;
+       size_t i;
+       uint64_t memlo, memhi;
+
+       /* Calculate memory size for NVRAM registers */
+       memlo = memhi = 0;
+       for (i = 0; i < vcp->vcp_nmemranges; i++) {
+               if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
+                   vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
+                       memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
+               else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
+                       memhi = vcp->vcp_memranges[i].vmr_size;
+       }
+
+       /* Reset the IO port map */
+       memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
+
+       /* Init i8253 PIT */
+       i8253_init(vcp->vcp_id);
+       ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
+       ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
+       ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
+       ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
+       ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
+
+       /* Init mc146818 RTC */
+       mc146818_init(vcp->vcp_id, memlo, memhi);
+       ioports_map[IO_RTC] = vcpu_exit_mc146818;
+       ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
+
+       /* Init master and slave PICs */
+       i8259_init();
+       ioports_map[IO_ICU1] = vcpu_exit_i8259;
+       ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
+       ioports_map[IO_ICU2] = vcpu_exit_i8259;
+       ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
+       ioports_map[ELCR0] = vcpu_exit_elcr;
+       ioports_map[ELCR1] = vcpu_exit_elcr;
+
+       /* Init ns8250 UART */
+       ns8250_init(con_fd, vcp->vcp_id);
+       for (i = COM1_DATA; i <= COM1_SCR; i++)
+               ioports_map[i] = vcpu_exit_com;
+
+       /* Initialize PCI */
+       for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
+               ioports_map[i] = vcpu_exit_pci;
+
+       ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
+       pci_init();
+
+       /* Initialize virtio devices */
+       virtio_init(current_vm, child_cdrom, child_disks, child_taps);
+
+       /*
+        * Init QEMU fw_cfg interface. Must be done last for pci hardware
+        * detection.
+        */
+       fw_cfg_init(vmc);
+       ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
+       ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
+       ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
+       ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
+}
+
+/*
+ * restore_emulated_hw
+ *
+ * Restores the userspace hardware emulation from fd
+ */
+void
+restore_emulated_hw(struct vm_create_params *vcp, int fd,
+    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
+{
+       int i;
+
+       memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
+
+       /* Init i8253 PIT */
+       i8253_restore(fd, vcp->vcp_id);
+       ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
+       ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
+       ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
+       ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
+
+       /* Init master and slave PICs */
+       i8259_restore(fd);
+       ioports_map[IO_ICU1] = vcpu_exit_i8259;
+       ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
+       ioports_map[IO_ICU2] = vcpu_exit_i8259;
+       ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
+
+       /* Init ns8250 UART */
+       ns8250_restore(fd, con_fd, vcp->vcp_id);
+       for (i = COM1_DATA; i <= COM1_SCR; i++)
+               ioports_map[i] = vcpu_exit_com;
+
+       /* Init mc146818 RTC */
+       mc146818_restore(fd, vcp->vcp_id);
+       ioports_map[IO_RTC] = vcpu_exit_mc146818;
+       ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
+
+       /* Init QEMU fw_cfg interface */
+       fw_cfg_restore(fd);
+       ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
+       ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
+       ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
+       ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
+
+       /* Initialize PCI */
+       for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
+               ioports_map[i] = vcpu_exit_pci;
+
+       ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
+       ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
+       pci_restore(fd);
+       virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
+}
+
+void
+pause_vm_md(struct vmd_vm *vm)
+{
+       i8253_stop();
+       mc146818_stop();
+       ns8250_stop();
+       virtio_stop(vm);
+}
+
+void
+unpause_vm_md(struct vmd_vm *vm)
+{
+       i8253_start();
+       mc146818_start();
+       ns8250_start();
+       virtio_start(vm);
+}
+
+int
+dump_devs(int fd)
+{
+       int ret = 0;
+
+       if ((ret = i8253_dump(fd)))
+               return (ret);
+       if ((ret = i8259_dump(fd)))
+               return (ret);
+       if ((ret = ns8250_dump(fd)))
+               return (ret);
+       if ((ret = mc146818_dump(fd)))
+               return (ret);
+       ret = fw_cfg_dump(fd);
+
+       return (ret);
+}
+
+int
+dump_send_header(int fd)
+{
+       struct vm_dump_header      vmh;
+       int                        i;
+
+       memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
+           sizeof(vmh.vmh_signature));
+
+       vmh.vmh_cpuids[0].code = 0x00;
+       vmh.vmh_cpuids[0].leaf = 0x00;
+
+       vmh.vmh_cpuids[1].code = 0x01;
+       vmh.vmh_cpuids[1].leaf = 0x00;
+
+       vmh.vmh_cpuids[2].code = 0x07;
+       vmh.vmh_cpuids[2].leaf = 0x00;
+
+       vmh.vmh_cpuids[3].code = 0x0d;
+       vmh.vmh_cpuids[3].leaf = 0x00;
+
+       vmh.vmh_cpuids[4].code = 0x80000001;
+       vmh.vmh_cpuids[4].leaf = 0x00;
+
+       vmh.vmh_version = VM_DUMP_VERSION;
+
+       for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
+               CPUID_LEAF(vmh.vmh_cpuids[i].code,
+                   vmh.vmh_cpuids[i].leaf,
+                   vmh.vmh_cpuids[i].a,
+                   vmh.vmh_cpuids[i].b,
+                   vmh.vmh_cpuids[i].c,
+                   vmh.vmh_cpuids[i].d);
+       }
+
+       if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
+               return (-1);
+
+       return (0);
+}
+
+/*
+ * vcpu_exit_inout
+ *
+ * Handle all I/O exits that need to be emulated in vmd. This includes the
+ * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ */
+void
+vcpu_exit_inout(struct vm_run_params *vrp)
+{
+       struct vm_exit *vei = vrp->vrp_exit;
+       uint8_t intr = 0xFF;
+
+       if (vei->vei.vei_rep || vei->vei.vei_string) {
+#ifdef MMIO_DEBUG
+               log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
+                   __func__,
+                   vei->vei.vei_rep == 0 ? "" : "REP ",
+                   vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
+                   vei->vei.vei_string == 0 ? "" : "S",
+                   vei->vei.vei_size, vei->vei.vei_encoding,
+                   vei->vei.vei_data, vei->vei.vei_port);
+               log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
+                   __func__,
+                   vei->vrs.vrs_gprs[VCPU_REGS_RCX],
+                   vei->vrs.vrs_gprs[VCPU_REGS_RDX],
+                   vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
+#endif /* MMIO_DEBUG */
+               fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
+                   __func__);
+       }
+
+       if (ioports_map[vei->vei.vei_port] != NULL)
+               intr = ioports_map[vei->vei.vei_port](vrp);
+       else if (vei->vei.vei_dir == VEI_DIR_IN)
+               set_return_data(vei, 0xFFFFFFFF);
+
+       vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;
+
+       if (intr != 0xFF)
+               vcpu_assert_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
+}
+
+/*
+ * vcpu_exit
+ *
+ * Handle a vcpu exit. This function is called when it is determined that
+ * vmm(4) requires the assistance of vmd to support a particular guest
+ * exit type (eg, accessing an I/O port or device). Guest state is contained
+ * in 'vrp', and will be resent to vmm(4) on exit completion.
+ *
+ * Upon conclusion of handling the exit, the function determines if any
+ * interrupts should be injected into the guest, and asserts the proper
+ * IRQ line whose interrupt should be vectored.
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return values:
+ *  0: the exit was handled successfully
+ *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
+ */
+int
+vcpu_exit(struct vm_run_params *vrp)
+{
+       int ret;
+
+       switch (vrp->vrp_exit_reason) {
+       case VMX_EXIT_INT_WINDOW:
+       case SVM_VMEXIT_VINTR:
+       case VMX_EXIT_CPUID:
+       case VMX_EXIT_EXTINT:
+       case SVM_VMEXIT_INTR:
+       case SVM_VMEXIT_MSR:
+       case SVM_VMEXIT_CPUID:
+               /*
+                * We may be exiting to vmd to handle a pending interrupt but
+                * at the same time the last exit type may have been one of
+                * these. In this case, there's nothing extra to be done
+                * here (and falling through to the default case below results
+                * in more vmd log spam).
+                */
+               break;
+       case SVM_VMEXIT_NPF:
+       case VMX_EXIT_EPT_VIOLATION:
+               ret = vcpu_exit_eptviolation(vrp);
+               if (ret)
+                       return (ret);
+               break;
+       case VMX_EXIT_IO:
+       case SVM_VMEXIT_IOIO:
+               vcpu_exit_inout(vrp);
+               break;
+       case VMX_EXIT_HLT:
+       case SVM_VMEXIT_HLT:
+               vcpu_halt(vrp->vrp_vcpu_id);
+               break;
+       case VMX_EXIT_TRIPLE_FAULT:
+       case SVM_VMEXIT_SHUTDOWN:
+               /* reset VM */
+               return (EAGAIN);
+       default:
+               log_debug("%s: unknown exit reason 0x%x",
+                   __progname, vrp->vrp_exit_reason);
+       }
+
+       return (0);
+}
+
+/*
+ * vcpu_exit_eptviolation
+ *
+ * handle an EPT Violation
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return values:
+ *  0: no action required
+ *  EFAULT: a protection fault occurred, kill the vm.
+ */
+static int
+vcpu_exit_eptviolation(struct vm_run_params *vrp)
+{
+       struct vm_exit *ve = vrp->vrp_exit;
+       int ret = 0;
+#if MMIO_NOTYET
+       struct x86_insn insn;
+       uint64_t va, pa;
+       size_t len = 15;                /* Max instruction length in x86. */
+#endif /* MMIO_NOTYET */
+
+       switch (ve->vee.vee_fault_type) {
+       case VEE_FAULT_HANDLED:
+               log_debug("%s: fault already handled", __func__);
+               break;
+
+#if MMIO_NOTYET
+       case VEE_FAULT_MMIO_ASSIST:
+               /* Intel VMX might give us the length of the instruction. */
+               if (ve->vee.vee_insn_info & VEE_LEN_VALID)
+                       len = ve->vee.vee_insn_len;
+
+               if (len > 15)
+                       fatalx("%s: invalid instruction length %lu", __func__,
+                           len);
+
+               /* If we weren't given instruction bytes, we need to fetch. */
+               if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
+                       memset(ve->vee.vee_insn_bytes, 0,
+                           sizeof(ve->vee.vee_insn_bytes));
+                       va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
+
+                       /* XXX Only support instructions that fit on 1 page. */
+                       if ((va & PAGE_MASK) + len > PAGE_SIZE) {
+                               log_warnx("%s: instruction might cross page "
+                                   "boundary", __func__);
+                               ret = EINVAL;
+                               break;
+                       }
+
+                       ret = translate_gva(ve, va, &pa, PROT_EXEC);
+                       if (ret != 0) {
+                               log_warnx("%s: failed gva translation",
+                                   __func__);
+                               break;
+                       }
+
+                       ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
+                       if (ret != 0) {
+                               log_warnx("%s: failed to fetch instruction "
+                                   "bytes from 0x%llx", __func__, pa);
+                               break;
+                       }
+               }
+
+               ret = insn_decode(ve, &insn);
+               if (ret == 0)
+                       ret = insn_emulate(ve, &insn);
+               break;
+#endif /* MMIO_NOTYET */
+
+       case VEE_FAULT_PROTECT:
+               log_debug("%s: EPT Violation: rip=0x%llx", __progname,
+                   ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
+               ret = EFAULT;
+               break;
+
+       default:
+               fatalx("%s: invalid fault_type %d", __progname,
+                   ve->vee.vee_fault_type);
+               /* UNREACHED */
+       }
+
+       return (ret);
+}
+
+/*
+ * vcpu_exit_pci
+ *
+ * Handle all I/O to the emulated PCI subsystem.
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return value:
+ *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
+ *      be injected.
+ */
+uint8_t
+vcpu_exit_pci(struct vm_run_params *vrp)
+{
+       struct vm_exit *vei = vrp->vrp_exit;
+       uint8_t intr;
+
+       intr = 0xFF;
+
+       switch (vei->vei.vei_port) {
+       case PCI_MODE1_ADDRESS_REG:
+               pci_handle_address_reg(vrp);
+               break;
+       case PCI_MODE1_DATA_REG:
+       case PCI_MODE1_DATA_REG + 1:
+       case PCI_MODE1_DATA_REG + 2:
+       case PCI_MODE1_DATA_REG + 3:
+               pci_handle_data_reg(vrp);
+               break;
+       case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
+               intr = pci_handle_io(vrp);
+               break;
+       default:
+               log_warnx("%s: unknown PCI register 0x%llx",
+                   __progname, (uint64_t)vei->vei.vei_port);
+               break;
+       }
+
+       return (intr);
+}
+
+/*
+ * find_gpa_range
+ *
+ * Search for a contiguous guest physical memory range.
+ *
+ * Parameters:
+ *  vcp: VM create parameters that contain the memory map to search in
+ *  gpa: the starting guest physical address
+ *  len: the length of the memory range
+ *
+ * Return values:
+ *  NULL: if no contiguous guest memory covers the range [gpa, gpa + len)
+ *  Pointer to the vm_mem_range containing the start of the range otherwise.
+ */
+static struct vm_mem_range *
+find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
+{
+       size_t i, n;
+       struct vm_mem_range *vmr, *first;
+
+       /* Find the first vm_mem_range that contains gpa */
+       for (i = 0; i < vcp->vcp_nmemranges; i++) {
+               vmr = &vcp->vcp_memranges[i];
+               if (gpa < vmr->vmr_gpa + vmr->vmr_size)
+                       break;
+       }
+
+       /* No range found. */
+       if (i == vcp->vcp_nmemranges)
+               return (NULL);
+
+       /* Remember the range containing gpa; callers offset from it. */
+       first = vmr;
+
+       /*
+        * vmr may cover the range [gpa, gpa + len) only partly. Make
+        * sure that the following vm_mem_ranges are contiguous and
+        * cover the rest.
+        */
+       n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
+       if (len < n)
+               len = 0;
+       else
+               len -= n;
+       gpa = vmr->vmr_gpa + vmr->vmr_size;
+       for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
+               vmr = &vcp->vcp_memranges[i];
+               if (gpa != vmr->vmr_gpa)
+                       return (NULL);
+               if (len <= vmr->vmr_size)
+                       len = 0;
+               else
+                       len -= vmr->vmr_size;
+
+               gpa = vmr->vmr_gpa + vmr->vmr_size;
+       }
+
+       if (len != 0)
+               return (NULL);
+
+       return (first);
+}
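+
+/*
+ * Illustrative example (assumed layout, not part of this change): with two
+ * contiguous ranges [0x0, 0x100000) and [0x100000, 0x300000), a request
+ * for gpa 0xff000 and len 0x800 lies entirely in the first range and
+ * returns it.  A request for gpa 0xff000 and len 0x2000 spills into the
+ * second range and only succeeds because that range starts exactly where
+ * the first one ends; with a gap at 0x100000 the function returns NULL.
+ */
+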
+/*
+ * write_mem
+ *
+ * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
+ *
+ * Parameters:
+ *  dst: the destination paddr_t in the guest VM
+ *  buf: data to copy (or NULL to zero the data)
+ *  len: number of bytes to copy
+ *
+ * Return values:
+ *  0: success
+ *  EINVAL: if the guest physical memory range [dst, dst + len) does not
+ *      exist in the guest.
+ */
+int
+write_mem(paddr_t dst, const void *buf, size_t len)
+{
+       const char *from = buf;
+       char *to;
+       size_t n, off;
+       struct vm_mem_range *vmr;
+
+       vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
+       if (vmr == NULL) {
+               errno = EINVAL;
+               log_warn("%s: failed - invalid memory range dst = 0x%lx, "
+                   "len = 0x%zx", __func__, dst, len);
+               return (EINVAL);
+       }
+
+       off = dst - vmr->vmr_gpa;
+       while (len != 0) {
+               n = vmr->vmr_size - off;
+               if (len < n)
+                       n = len;
+
+               to = (char *)vmr->vmr_va + off;
+               if (buf == NULL)
+                       memset(to, 0, n);
+               else {
+                       memcpy(to, from, n);
+                       from += n;
+               }
+               len -= n;
+               off = 0;
+               vmr++;
+       }
+
+       return (0);
+}
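+
+/*
+ * Usage sketch (illustrative): a NULL buffer zeroes guest memory, e.g.
+ * clearing a hypothetical 4 KiB scratch page at guest physical address
+ * 0x1000:
+ *
+ *     if (write_mem(0x1000, NULL, 4096))
+ *             log_warnx("failed to zero scratch page");
+ */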
+
+/*
+ * read_mem
+ *
+ * Reads memory at guest paddr 'src' into 'buf'.
+ *
+ * Parameters:
+ *  src: the source paddr_t in the guest VM to read from.
+ *  buf: destination (local) buffer
+ *  len: number of bytes to read
+ *
+ * Return values:
+ *  0: success
+ *  EINVAL: if the guest physical memory range [src, src + len) does not
+ *      exist in the guest.
+ */
+int
+read_mem(paddr_t src, void *buf, size_t len)
+{
+       char *from, *to = buf;
+       size_t n, off;
+       struct vm_mem_range *vmr;
+
+       vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
+       if (vmr == NULL) {
+               errno = EINVAL;
+               log_warn("%s: failed - invalid memory range src = 0x%lx, "
+                   "len = 0x%zx", __func__, src, len);
+               return (EINVAL);
+       }
+
+       off = src - vmr->vmr_gpa;
+       while (len != 0) {
+               n = vmr->vmr_size - off;
+               if (len < n)
+                       n = len;
+
+               from = (char *)vmr->vmr_va + off;
+               memcpy(to, from, n);
+
+               to += n;
+               len -= n;
+               off = 0;
+               vmr++;
+       }
+
+       return (0);
+}
+
+/*
+ * hvaddr_mem
+ *
+ * Translate a guest physical address to a host virtual address, checking the
+ * provided memory range length to confirm it's contiguous within the same
+ * guest memory range (vm_mem_range).
+ *
+ * Parameters:
+ *  gpa: guest physical address to translate
+ *  len: number of bytes in the intended range
+ *
+ * Return values:
+ *  void* to host virtual memory on success
+ *  NULL on error, setting errno to:
+ *    EFAULT: gpa falls outside guest memory ranges
+ *    EINVAL: requested len extends beyond memory range
+ */
+void *
+hvaddr_mem(paddr_t gpa, size_t len)
+{
+       struct vm_mem_range *vmr;
+       size_t off;
+
+       vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
+       if (vmr == NULL) {
+               log_warnx("%s: failed - invalid gpa: 0x%lx", __func__, gpa);
+               errno = EFAULT;
+               return (NULL);
+       }
+
+       off = gpa - vmr->vmr_gpa;
+       if (len > (vmr->vmr_size - off)) {
+               log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
+                   "len=%zu", __func__, gpa, len);
+               errno = EINVAL;
+               return (NULL);
+       }
+
+       return ((char *)vmr->vmr_va + off);
+}
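+
+/*
+ * Usage sketch (illustrative; desc_gpa, desc_len and reply are
+ * hypothetical): device emulation typically uses hvaddr_mem() to obtain a
+ * host pointer to a guest buffer before copying into it directly:
+ *
+ *     char *buf = hvaddr_mem(desc_gpa, desc_len);
+ *
+ *     if (buf == NULL)
+ *             return (errno);
+ *     memcpy(buf, reply, desc_len);
+ */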
+
+/*
+ * vcpu_assert_irq
+ *
+ * Injects the specified IRQ on the supplied vcpu/vm
+ *
+ * Parameters:
+ *  vm_id: VM ID to inject to
+ *  vcpu_id: VCPU ID to inject to
+ *  irq: IRQ to inject
+ */
+void
+vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+       i8259_assert_irq(irq);
+
+       if (i8259_is_pending()) {
+               if (vcpu_intr(vm_id, vcpu_id, 1))
+                       fatalx("%s: can't assert INTR", __func__);
+
+               vcpu_unhalt(vcpu_id);
+               vcpu_signal_run(vcpu_id);
+       }
+}
+
+/*
+ * vcpu_deassert_irq
+ *
+ * Clears the specified IRQ on the supplied vcpu/vm
+ *
+ * Parameters:
+ *  vm_id: VM ID to clear in
+ *  vcpu_id: VCPU ID to clear in
+ *  irq: IRQ to clear
+ */
+void
+vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
+{
+       i8259_deassert_irq(irq);
+
+       if (!i8259_is_pending()) {
+               if (vcpu_intr(vm_id, vcpu_id, 0))
+                       fatalx("%s: can't deassert INTR for vm_id %d, "
+                           "vcpu_id %d", __func__, vm_id, vcpu_id);
+       }
+}
+
+/*
+ * set_return_data
+ *
+ * Utility function for manipulating register data in vm exit info
+ * structs. This function ensures that the data is copied to the
+ * vei->vei.vei_data field with the proper size for the operation being
+ * performed.
+ *
+ * Parameters:
+ *  vei: exit information
+ *  data: return data
+ */
+void
+set_return_data(struct vm_exit *vei, uint32_t data)
+{
+       switch (vei->vei.vei_size) {
+       case 1:
+               vei->vei.vei_data &= ~0xFF;
+               vei->vei.vei_data |= (uint8_t)data;
+               break;
+       case 2:
+               vei->vei.vei_data &= ~0xFFFF;
+               vei->vei.vei_data |= (uint16_t)data;
+               break;
+       case 4:
+               vei->vei.vei_data = data;
+               break;
+       }
+}
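+
+/*
+ * Example (illustrative): for a 1-byte IN only the low byte may change,
+ * so set_return_data(vei, 0xFFFFFFFF) (as done above for reads from
+ * unhandled ports) stores 0xFF in the low byte of vei_data and leaves
+ * the upper 24 bits untouched.
+ */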
+
+/*
+ * get_input_data
+ *
+ * Utility function for manipulating register data in vm exit info
+ * structs. This function ensures that the data is copied from the
+ * vei->vei.vei_data field with the proper size for the operation being
+ * performed.
+ *
+ * Parameters:
+ *  vei: exit information
+ *  data: location to store the result
+ */
+void
+get_input_data(struct vm_exit *vei, uint32_t *data)
+{
+       switch (vei->vei.vei_size) {
+       case 1:
+               *data &= 0xFFFFFF00;
+               *data |= (uint8_t)vei->vei.vei_data;
+               break;
+       case 2:
+               *data &= 0xFFFF0000;
+               *data |= (uint16_t)vei->vei.vei_data;
+               break;
+       case 4:
+               *data = vei->vei.vei_data;
+               break;
+       default:
+               log_warnx("%s: invalid i/o size %d", __func__,
+                   vei->vei.vei_size);
+       }
+}
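+
+/*
+ * Example (illustrative): for a 2-byte OUT with vei_data 0x1234ABCD,
+ * get_input_data() replaces only the low 16 bits of *data, yielding
+ * (*data & 0xFFFF0000) | 0xABCD.
+ */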
+
+/*
+ * translate_gva
+ *
+ * Translates a guest virtual address to a guest physical address by walking
+ * the currently active page table (if needed).
+ *
+ * XXX ensure translate_gva updates the A bit in the PTE
+ * XXX ensure translate_gva respects segment base and limits in i386 mode
+ * XXX ensure translate_gva respects segment wraparound in i8086 mode
+ * XXX ensure translate_gva updates the A bit in the segment selector
+ * XXX ensure translate_gva respects CR4.LMSLE if available
+ *
+ * Parameters:
+ *  exit: vm exit information for the VCPU this translation should be
+ *   performed for (guest MMU settings are gathered from its register state)
+ *  va: virtual address to translate
+ *  pa: pointer to paddr_t variable that will receive the translated physical
+ *   address. 'pa' is unchanged on error.
+ *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
+ *   the address should be translated
+ *
+ * Return values:
+ *  0: the address was successfully translated - 'pa' contains the physical
+ *     address currently mapped by 'va'.
+ *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
+ *     and %cr2 set in the vcpu structure.
+ *  EINVAL: an error occurred reading paging table structures
+ */
+int
+translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
+{
+       int level, shift, pdidx;
+       uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
+       uint64_t shift_width, pte_size;
+       struct vcpu_reg_state *vrs;
+
+       vrs = &exit->vrs;
+
+       if (!pa)
+               return (EINVAL);
+
+       if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
+               log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
+               *pa = va;
+               return (0);
+       }
+
+       pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
+
+       log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
+           vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
+
+       if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
+               if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
+                       pte_size = sizeof(uint64_t);
+                       shift_width = 9;
+
+                       if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
+                               /* 4 level paging */
+                               level = 4;
+                               mask = L4_MASK;
+                               shift = L4_SHIFT;
+                       } else {
+                               /* 32 bit with PAE paging */
+                               level = 3;
+                               mask = L3_MASK;
+                               shift = L3_SHIFT;
+                       }
+               } else {
+                       /* 32 bit paging */
+                       level = 2;
+                       shift_width = 10;
+                       mask = 0xFFC00000;
+                       shift = 22;
+                       pte_size = sizeof(uint32_t);
+               }
+       } else
+               return (EINVAL);
+
+       /* XXX: Check for R bit in segment selector and set A bit */
+
+       for (; level > 0; level--) {
+               pdidx = (va & mask) >> shift;
+               pte_paddr = pt_paddr + (pdidx * pte_size);
+
+               log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
+                   level, pte_paddr);
+               if (read_mem(pte_paddr, &pte, pte_size)) {
+                       log_warn("%s: failed to read pte", __func__);
+                       return (EFAULT);
+               }
+
+               log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
+                   pte);
+
+               /* XXX: Set CR2  */
+               if (!(pte & PG_V))
+                       return (EFAULT);
+
+               /* XXX: Check for SMAP */
+               if ((mode == PROT_WRITE) && !(pte & PG_RW))
+                       return (EPERM);
+
+               if ((exit->cpl > 0) && !(pte & PG_u))
+                       return (EPERM);
+
+               pte = pte | PG_U;
+               if (mode == PROT_WRITE)
+                       pte = pte | PG_M;
+               if (write_mem(pte_paddr, &pte, pte_size)) {
+                       log_warn("%s: failed to write back flags to pte",
+                           __func__);
+                       return (EIO);
+               }
+
+               /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
+               if (pte & PG_PS)
+                       break;
+
+               if (level > 1) {
+                       pt_paddr = pte & PG_FRAME;
+                       shift -= shift_width;
+                       mask = mask >> shift_width;
+               }
+       }
+
+       low_mask = (1 << shift) - 1;
+       high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
+       *pa = (pte & high_mask) | (va & low_mask);
+
+       log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);
+
+       return (0);
+}
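+
+/*
+ * Worked example (illustrative; assumes the usual amd64 constants, i.e.
+ * L4_SHIFT is 39 and shift_width is 9): a 4-level walk starts with
+ * shift = 39, so pdidx selects GVA bits 47:39.  Each iteration subtracts
+ * shift_width, indexing bits 38:30, 29:21 and finally 20:12.  If the walk
+ * hits a 2 MiB mapping (PG_PS set in a page-directory entry), the loop
+ * breaks with shift = 21 and low_mask = 0x1FFFFF, so the low 21 bits of
+ * the GVA are carried over unchanged into the resulting GPA.
+ */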
+
+int
+intr_pending(struct vmd_vm *vm)
+{
+       /* XXX select active interrupt controller */
+       return i8259_is_pending();
+}
+
+int
+intr_ack(struct vmd_vm *vm)
+{
+       /* XXX select active interrupt controller */
+       return i8259_ack();
+}
+
+void
+intr_toggle_el(struct vmd_vm *vm, int irq, int val)
+{
+       /* XXX select active interrupt controller */
+       pic_set_elcr(irq, val);
+}
+
+int
+vmd_check_vmh(struct vm_dump_header *vmh)
+{
+       int i;
+       unsigned int code, leaf;
+       unsigned int a, b, c, d;
+
+       if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE,
+           strlen(VM_DUMP_SIGNATURE)) != 0) {
+               log_warnx("%s: incompatible dump signature", __func__);
+               return (-1);
+       }
+
+       if (vmh->vmh_version != VM_DUMP_VERSION) {
+               log_warnx("%s: incompatible dump version", __func__);
+               return (-1);
+       }
+
+       for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
+               code = vmh->vmh_cpuids[i].code;
+               leaf = vmh->vmh_cpuids[i].leaf;
+               if (leaf != 0x00) {
+                       log_debug("%s: invalid leaf 0x%x for code 0x%x",
+                           __func__, leaf, code);
+                       return (-1);
+               }
+
+               switch (code) {
+               case 0x00:
+                       CPUID_LEAF(code, leaf, a, b, c, d);
+                       if (vmh->vmh_cpuids[i].a > a) {
+                               log_debug("%s: incompatible cpuid level",
+                                   __func__);
+                               return (-1);
+                       }
+                       if (!(vmh->vmh_cpuids[i].b == b &&
+                           vmh->vmh_cpuids[i].c == c &&
+                           vmh->vmh_cpuids[i].d == d)) {
+                               log_debug("%s: incompatible cpu brand",
+                                   __func__);
+                               return (-1);
+                       }
+                       break;
+
+               case 0x01:
+                       CPUID_LEAF(code, leaf, a, b, c, d);
+                       if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
+                           (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
+                               log_debug("%s: incompatible cpu features "
+                                   "code: 0x%x leaf: 0x%x  reg: c", __func__,
+                                   code, leaf);
+                               return (-1);
+                       }
+                       if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
+                           (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
+                               log_debug("%s: incompatible cpu features "
+                                   "code: 0x%x leaf: 0x%x  reg: d", __func__,
+                                   code, leaf);
+                               return (-1);
+                       }
+                       break;
+
+               case 0x07:
+                       CPUID_LEAF(code, leaf, a, b, c, d);
+                       if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
+                           (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
+                               log_debug("%s: incompatible cpu features "
+                                   "code: 0x%x leaf: 0x%x  reg: b", __func__,
+                                   code, leaf);
+                               return (-1);
+                       }
+                       if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
+                           (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
+                               log_debug("%s: incompatible cpu features "
+                                   "code: 0x%x leaf: 0x%x  reg: c", __func__,
+                                   code, leaf);
+                               return (-1);
+                       }
+                       break;
+
+               case 0x0d:
+                       CPUID_LEAF(code, leaf, a, b, c, d);
+                       if (vmh->vmh_cpuids[i].b > b) {
+                               log_debug("%s: incompatible cpu: insufficient "
+                                   "max save area for enabled XCR0 features",
+                                   __func__);
+                               return (-1);
+                       }
+                       if (vmh->vmh_cpuids[i].c > c) {
+                               log_debug("%s: incompatible cpu: insufficient "
+                                   "max save area for supported XCR0 features",
+                                   __func__);
+                               return (-1);
+                       }
+                       break;
+
+               case 0x80000001:
+                       CPUID_LEAF(code, leaf, a, b, c, d);
+                       if ((vmh->vmh_cpuids[i].a & a) !=
+                           vmh->vmh_cpuids[i].a) {
+                               log_debug("%s: incompatible cpu features "
+                                   "code: 0x%x leaf: 0x%x  reg: a", __func__,
+                                   code, leaf);
+                               return (-1);
+                       }
+                       if ((vmh->vmh_cpuids[i].c & c) !=
+                           vmh->vmh_cpuids[i].c) {
+                               log_debug("%s: incompatible cpu features "
+                                   "code: 0x%x leaf: 0x%x  reg: c", __func__,
+                                   code, leaf);
+                               return (-1);
+                       }
+                       if ((vmh->vmh_cpuids[i].d & d) !=
+                           vmh->vmh_cpuids[i].d) {
+                               log_debug("%s: incompatible cpu features "
+                                   "code: 0x%x leaf: 0x%x  reg: d", __func__,
+                                   code, leaf);
+                               return (-1);
+                       }
+                       break;
+
+               default:
+                       log_debug("%s: unknown code 0x%x", __func__, code);
+                       return (-1);
+               }
+       }
+
+       return (0);
+}
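+
+/*
+ * Example (illustrative): if the dump header's masked leaf 0x01 %ecx
+ * features include a bit the restoring host's CPU lacks, the comparison
+ * for code 0x01 above fails and vmd_check_vmh() returns -1, so the saved
+ * VM cannot be restored on incompatible hardware.
+ */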