vmd(8): introduce multi-process model for virtio devices.
author dv <dv@openbsd.org>
Thu, 27 Apr 2023 22:47:27 +0000 (22:47 +0000)
committer dv <dv@openbsd.org>
Thu, 27 Apr 2023 22:47:27 +0000 (22:47 +0000)
Isolate virtio network and block device emulation in dedicated
processes, forked and exec'd from the vm process. This allows
tightening each device process's pledge(2) promises to just "stdio".
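
A minimal sketch of that fork+exec pattern (the helper path, program
name, and fd convention here are hypothetical; vmd's real plumbing is
in vm.c and vmm.c below, and the exec'd process pledges "stdio" first
thing in vioblk_main()/vionet_main()):

#include <sys/socket.h>

#include <err.h>
#include <unistd.h>

int
main(void)
{
	int sv[2];

	/* The shared channel: one end per process. */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		err(1, "socketpair");

	switch (fork()) {
	case -1:
		err(1, "fork");
	case 0:
		/* Child: keep only its end, park it at a known fd, exec. */
		close(sv[0]);
		if (dup2(sv[1], 3) == -1)
			err(1, "dup2");
		execl("/usr/sbin/viodev", "viodev", (char *)NULL);
		err(1, "execl");	/* reached only if exec fails */
	default:
		/* Parent: talks to the device process over sv[0]. */
		close(sv[1]);
		break;
	}
	return (0);
}

Because the child re-execs, it shares no address space with the vm
process, and the subsequent pledge("stdio", NULL) restricts it to
basic I/O on descriptors it already holds.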

Communication between the vcpus and these devices now occurs via
imsg channels, with the added benefit that the vcpu threads no longer
always block while emulating the device.
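
An illustrative request/reply round trip over such a channel
(MSG_PING/MSG_PONG and the payload are made up for this example;
vmd's real traffic is struct viodev_msg carried as IMSG_DEVOP_MSG,
as in the new files below; build with -lutil):

#include <sys/types.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/uio.h>

#include <err.h>
#include <errno.h>
#include <imsg.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSG_PING	1
#define MSG_PONG	2

static void
read_one(struct imsgbuf *ibuf, struct imsg *imsg)
{
	ssize_t n;

	/* Pump the channel until one complete message is buffered. */
	for (;;) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			err(1, "imsg_read");
		if (n == 0)
			errx(1, "peer closed channel");
		if ((n = imsg_get(ibuf, imsg)) == -1)
			err(1, "imsg_get");
		if (n > 0)
			return;
	}
}

int
main(void)
{
	struct imsgbuf	 ibuf;
	struct imsg	 imsg;
	uint32_t	 payload = 42;
	int		 sv[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		err(1, "socketpair");

	switch (fork()) {
	case -1:
		err(1, "fork");
	case 0:		/* "device" side: echo the payload back */
		close(sv[0]);
		imsg_init(&ibuf, sv[1]);
		read_one(&ibuf, &imsg);
		imsg_compose(&ibuf, MSG_PONG, 0, 0, -1, imsg.data,
		    sizeof(payload));
		imsg_free(&imsg);
		imsg_flush(&ibuf);
		_exit(0);
	default:	/* "vcpu" side: send a request, block on the reply */
		close(sv[1]);
		imsg_init(&ibuf, sv[0]);
		imsg_compose(&ibuf, MSG_PING, 0, 0, -1, &payload,
		    sizeof(payload));
		imsg_flush(&ibuf);
		read_one(&ibuf, &imsg);
		printf("reply type %u, payload %u\n", imsg.hdr.type,
		    *(uint32_t *)imsg.data);
		imsg_free(&imsg);
	}
	return (0);
}

In the diff, the sync channel carries exactly this shape of traffic
for register reads (VIODEV_MSG_IO_READ plus a reply), while the async
channel carries vm lifecycle events such as pause and unpause.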

With this commit, it's possible that vmd is the first open source
hypervisor that *defaults* to a multi-process device emulation
model without requiring any additional configuration from the
operator.

Testing help from phessler@ and Mischa Peters.

ok mlarkin@

12 files changed:
usr.sbin/vmd/Makefile
usr.sbin/vmd/dhcp.c
usr.sbin/vmd/vioblk.c [new file with mode: 0644]
usr.sbin/vmd/vionet.c [new file with mode: 0644]
usr.sbin/vmd/vioqcow2.c
usr.sbin/vmd/vioraw.c
usr.sbin/vmd/virtio.c
usr.sbin/vmd/virtio.h
usr.sbin/vmd/vm.c
usr.sbin/vmd/vmd.c
usr.sbin/vmd/vmd.h
usr.sbin/vmd/vmm.c

index d0e7d0c..3fbb9d0 100644
@@ -1,4 +1,4 @@
-#      $OpenBSD: Makefile,v 1.28 2022/11/10 11:46:39 dv Exp $
+#      $OpenBSD: Makefile,v 1.29 2023/04/27 22:47:27 dv Exp $
 
 .if ${MACHINE} == "amd64"
 
@@ -7,7 +7,7 @@ SRCS=           vmd.c control.c log.c priv.c proc.c config.c vmm.c
 SRCS+=         vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
 SRCS+=         ns8250.c i8253.c dhcp.c packet.c mmio.c
 SRCS+=         parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c fw_cfg.c
-SRCS+=         vm_agentx.c
+SRCS+=         vm_agentx.c vioblk.c vionet.c
 
 CFLAGS+=       -Wall -I${.CURDIR}
 CFLAGS+=       -Wstrict-prototypes -Wmissing-prototypes
index 7d7d8d0..3b8744f 100644
@@ -1,4 +1,4 @@
-/*     $OpenBSD: dhcp.c,v 1.11 2021/06/16 16:55:02 dv Exp $    */
+/*     $OpenBSD: dhcp.c,v 1.12 2023/04/27 22:47:27 dv Exp $    */
 
 /*
  * Copyright (c) 2017 Reyk Floeter <reyk@openbsd.org>
@@ -43,8 +43,9 @@ static const uint8_t broadcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 extern struct vmd *env;
 
 ssize_t
-dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
+dhcp_request(struct virtio_dev *dev, char *buf, size_t buflen, char **obuf)
 {
+       struct vionet_dev       *vionet = NULL;
        unsigned char           *respbuf = NULL, *op, *oe, dhcptype = 0;
        unsigned char           *opts = NULL;
        ssize_t                  offset, optslen, respbuflen = 0;
@@ -56,6 +57,10 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
        struct vmd_vm           *vm;
        const char              *hostname = NULL;
 
+       if (dev->dev_type != VMD_DEVTYPE_NET)
+               fatalx("%s: not a network device", __func__);
+       vionet = &dev->vionet;
+
        if (buflen < BOOTP_MIN_LEN + ETHER_HDR_LEN ||
            buflen > 1500 + ETHER_HDR_LEN)
                return (-1);
@@ -65,10 +70,10 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
                return (-1);
 
        if (memcmp(pc.pc_dmac, broadcast, ETHER_ADDR_LEN) != 0 &&
-           memcmp(pc.pc_dmac, dev->hostmac, ETHER_ADDR_LEN) != 0)
+           memcmp(pc.pc_dmac, vionet->hostmac, ETHER_ADDR_LEN) != 0)
                return (-1);
 
-       if (memcmp(pc.pc_smac, dev->mac, ETHER_ADDR_LEN) != 0)
+       if (memcmp(pc.pc_smac, vionet->mac, ETHER_ADDR_LEN) != 0)
                return (-1);
 
        if ((offset = decode_udp_ip_header(buf, buflen, offset, &pc)) < 0)
@@ -87,7 +92,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
        if (req.op != BOOTREQUEST ||
            req.htype != pc.pc_htype ||
            req.hlen != ETHER_ADDR_LEN ||
-           memcmp(dev->mac, req.chaddr, req.hlen) != 0)
+           memcmp(vionet->mac, req.chaddr, req.hlen) != 0)
                return (-1);
 
        /* Ignore unsupported requests for now */
@@ -134,7 +139,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
        resp.hlen = req.hlen;
        resp.xid = req.xid;
 
-       if (dev->pxeboot) {
+       if (vionet->pxeboot) {
                strlcpy(resp.file, "auto_install", sizeof resp.file);
                vm = vm_getbyvmid(dev->vm_vmid);
                if (vm && res_hnok(vm->vm_params.vmc_params.vcp_name))
@@ -143,7 +148,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
 
        if ((client_addr.s_addr =
            vm_priv_addr(&env->vmd_cfg,
-           dev->vm_vmid, dev->idx, 1)) == 0)
+           dev->vm_vmid, vionet->idx, 1)) == 0)
                return (-1);
        memcpy(&resp.yiaddr, &client_addr,
            sizeof(client_addr));
@@ -152,7 +157,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
        ss2sin(&pc.pc_dst)->sin_port = htons(CLIENT_PORT);
 
        if ((server_addr.s_addr = vm_priv_addr(&env->vmd_cfg, dev->vm_vmid,
-           dev->idx, 0)) == 0)
+           vionet->idx, 0)) == 0)
                return (-1);
        memcpy(&resp.siaddr, &server_addr, sizeof(server_addr));
        memcpy(&ss2sin(&pc.pc_src)->sin_addr, &server_addr,
@@ -167,9 +172,9 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf)
        if ((respbuf = calloc(1, respbuflen)) == NULL)
                goto fail;
 
-       memcpy(&pc.pc_dmac, dev->mac, sizeof(pc.pc_dmac));
-       memcpy(&resp.chaddr, dev->mac, resp.hlen);
-       memcpy(&pc.pc_smac, dev->mac, sizeof(pc.pc_smac));
+       memcpy(&pc.pc_dmac, vionet->mac, sizeof(pc.pc_dmac));
+       memcpy(&resp.chaddr, vionet->mac, resp.hlen);
+       memcpy(&pc.pc_smac, vionet->mac, sizeof(pc.pc_smac));
        pc.pc_smac[5]++;
        if ((offset = assemble_hw_header(respbuf, respbuflen, 0,
            &pc, HTYPE_ETHER)) < 0) {
diff --git a/usr.sbin/vmd/vioblk.c b/usr.sbin/vmd/vioblk.c
new file mode 100644
index 0000000..08e5f39
--- /dev/null
@@ -0,0 +1,1002 @@
+/*     $OpenBSD: vioblk.c,v 1.1 2023/04/27 22:47:27 dv Exp $   */
+
+/*
+ * Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
+ * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <sys/mman.h>
+#include <sys/param.h> /* PAGE_SIZE */
+#include <sys/socket.h>
+
+#include <machine/vmmvar.h>
+#include <dev/pci/virtio_pcireg.h>
+#include <dev/pv/vioblkreg.h>
+#include <dev/pv/virtioreg.h>
+
+#include <errno.h>
+#include <event.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "atomicio.h"
+#include "pci.h"
+#include "virtio.h"
+#include "vmd.h"
+
+extern char *__progname;
+extern struct vmd_vm *current_vm;
+
+static const char *disk_type(int);
+static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *);
+static int handle_io_write(struct viodev_msg *, struct virtio_dev *);
+int vioblk_notifyq(struct vioblk_dev *);
+
+static void dev_dispatch_vm(int, short, void *);
+static void handle_sync_io(int, short, void *);
+
+static const char *
+disk_type(int type)
+{
+       switch (type) {
+       case VMDF_RAW: return "raw";
+       case VMDF_QCOW2: return "qcow2";
+       }
+       return "unknown";
+}
+
+__dead void
+vioblk_main(int fd)
+{
+       struct virtio_dev        dev;
+       struct vioblk_dev       *vioblk;
+       struct viodev_msg        msg;
+       struct vmd_vm            vm;
+       struct vm_create_params *vcp;
+       ssize_t                  sz;
+       off_t                    szp = 0;
+       int                      i, ret, type;
+
+       log_procinit("vioblk");
+
+       /* stdio - needed for read/write to disk fds and channels to the vm. */
+       if (pledge("stdio", NULL) == -1)
+               fatal("pledge");
+
+       /* Receive our virtio_dev, mostly preconfigured. */
+       memset(&dev, 0, sizeof(dev));
+       sz = atomicio(read, fd, &dev, sizeof(dev));
+       if (sz != sizeof(dev)) {
+               ret = errno;
+               log_warn("failed to receive vionet");
+               goto fail;
+       }
+       if (dev.dev_type != VMD_DEVTYPE_DISK) {
+               ret = EINVAL;
+               log_warn("received invalid device type");
+               goto fail;
+       }
+       dev.sync_fd = fd;
+       vioblk = &dev.vioblk;
+
+       log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, "
+           "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd,
+           dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer);
+
+       /* Receive our vm information from the vm process. */
+       memset(&vm, 0, sizeof(vm));
+       sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
+       if (sz != sizeof(vm)) {
+               ret = EIO;
+               log_warnx("failed to receive vm details");
+               goto fail;
+       }
+       vcp = &vm.vm_params.vmc_params;
+       current_vm = &vm;
+       setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx);
+
+       /* Now that we have our vm information, we can remap memory. */
+       ret = remap_guest_mem(&vm);
+       if (ret) {
+               log_warnx("failed to remap guest memory");
+               goto fail;
+       }
+
+       /* Initialize the virtio block abstractions. */
+       type = vm.vm_params.vmc_disktypes[vioblk->idx];
+       switch (type) {
+       case VMDF_RAW:
+               ret = virtio_raw_init(&vioblk->file, &szp, vioblk->disk_fd,
+                   vioblk->ndisk_fd);
+               break;
+       case VMDF_QCOW2:
+               ret = virtio_qcow2_init(&vioblk->file, &szp, vioblk->disk_fd,
+                   vioblk->ndisk_fd);
+               break;
+       default:
+               log_warnx("invalid disk image type");
+               goto fail;
+       }
+       if (ret || szp < 0) {
+               log_warnx("failed to init disk %s image", disk_type(type));
+               goto fail;
+       }
+       vioblk->sz = szp;
+       log_debug("%s: initialized vioblk[%d] with %s image (sz=%lld)",
+           __func__, vioblk->idx, disk_type(type), vioblk->sz);
+
+       /* If we're restoring hardware, reinitialize the virtqueue hva. */
+       if (vm.vm_state & VM_STATE_RECEIVED)
+               vioblk_update_qa(vioblk);
+
+       /* Initialize libevent so we can start wiring event handlers. */
+       event_init();
+
+       /* Wire up an async imsg channel. */
+       log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
+               dev.async_fd);
+       if (vm_device_pipe(&dev, dev_dispatch_vm)) {
+               ret = EIO;
+               log_warnx("vm_device_pipe");
+               goto fail;
+       }
+
+       /* Configure our sync channel event handler. */
+       log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
+               dev.sync_fd);
+       if (fcntl(dev.sync_fd, F_SETFL, O_NONBLOCK) == -1) {
+               ret = errno;
+               log_warn("%s: fcntl", __func__);
+               goto fail;
+       }
+       imsg_init(&dev.sync_iev.ibuf, dev.sync_fd);
+       dev.sync_iev.handler = handle_sync_io;
+       dev.sync_iev.data = &dev;
+       dev.sync_iev.events = EV_READ;
+       imsg_event_add(&dev.sync_iev);
+
+       /* Send a ready message over the sync channel. */
+       log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
+       memset(&msg, 0, sizeof(msg));
+       msg.type = VIODEV_MSG_READY;
+       imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+           sizeof(msg));
+
+       /* Send a ready message over the async channel. */
+       log_debug("%s: sending heartbeat", __func__);
+       ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
+           &msg, sizeof(msg));
+       if (ret == -1) {
+               log_warnx("%s: failed to send async ready message!", __func__);
+               goto fail;
+       }
+
+       /* Engage the event loop! */
+       ret = event_dispatch();
+
+       if (ret == 0) {
+               /* Clean shutdown. */
+               close_fd(dev.sync_fd);
+               close_fd(dev.async_fd);
+               for (i = 0; i < vioblk->ndisk_fd; i++)
+                       close_fd(vioblk->disk_fd[i]);
+               _exit(0);
+               /* NOTREACHED */
+       }
+
+fail:
+       /* Try letting the vm know we've failed something. */
+       memset(&msg, 0, sizeof(msg));
+       msg.type = VIODEV_MSG_ERROR;
+       msg.data = ret;
+       imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+           sizeof(msg));
+       imsg_flush(&dev.sync_iev.ibuf);
+
+       close_fd(dev.sync_fd);
+       close_fd(dev.async_fd);
+       for (i = 0; i < vioblk->ndisk_fd; i++)
+               close_fd(vioblk->disk_fd[i]);
+       _exit(ret);
+       /* NOTREACHED */
+}
+
+const char *
+vioblk_cmd_name(uint32_t type)
+{
+       switch (type) {
+       case VIRTIO_BLK_T_IN: return "read";
+       case VIRTIO_BLK_T_OUT: return "write";
+       case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
+       case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
+       case VIRTIO_BLK_T_FLUSH: return "flush";
+       case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
+       case VIRTIO_BLK_T_GET_ID: return "get id";
+       default: return "unknown";
+       }
+}
+
+void
+vioblk_update_qa(struct vioblk_dev *dev)
+{
+       struct virtio_vq_info *vq_info;
+       void *hva = NULL;
+
+       /* Invalid queue? */
+       if (dev->cfg.queue_select > 0)
+               return;
+
+       vq_info = &dev->vq[dev->cfg.queue_select];
+       vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
+
+       hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE));
+       if (hva == NULL)
+               fatal("vioblk_update_qa");
+       vq_info->q_hva = hva;
+}
+
+void
+vioblk_update_qs(struct vioblk_dev *dev)
+{
+       struct virtio_vq_info *vq_info;
+
+       /* Invalid queue? */
+       if (dev->cfg.queue_select > 0) {
+               dev->cfg.queue_size = 0;
+               return;
+       }
+
+       vq_info = &dev->vq[dev->cfg.queue_select];
+
+       /* Update queue pfn/size based on queue select */
+       dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
+       dev->cfg.queue_size = vq_info->qs;
+}
+
+static void
+vioblk_free_info(struct ioinfo *info)
+{
+       if (!info)
+               return;
+       free(info->buf);
+       free(info);
+}
+
+static struct ioinfo *
+vioblk_start_read(struct vioblk_dev *dev, off_t sector, size_t sz)
+{
+       struct ioinfo *info;
+
+       /* Limit to 64M for now */
+       if (sz > (1 << 26)) {
+               log_warnx("%s: read size exceeded 64M", __func__);
+               return (NULL);
+       }
+
+       info = calloc(1, sizeof(*info));
+       if (!info)
+               goto nomem;
+       info->buf = malloc(sz);
+       if (info->buf == NULL)
+               goto nomem;
+       info->len = sz;
+       info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
+       info->file = &dev->file;
+       return info;
+
+nomem:
+       free(info);
+       log_warn("malloc error vioblk read");
+       return (NULL);
+}
+
+
+static const uint8_t *
+vioblk_finish_read(struct ioinfo *info)
+{
+       struct virtio_backing *file;
+
+       file = info->file;
+       if (file == NULL || file->pread == NULL) {
+               log_warnx("%s: XXX null?!", __func__);
+               return NULL;
+       }
+       if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) {
+               info->error = errno;
+               log_warn("vioblk read error");
+               return NULL;
+       }
+
+       return info->buf;
+}
+
+static struct ioinfo *
+vioblk_start_write(struct vioblk_dev *dev, off_t sector,
+    paddr_t addr, size_t len)
+{
+       struct ioinfo *info;
+
+       /* Limit to 64M for now */
+       if (len > (1 << 26)) {
+               log_warnx("%s: write size exceeded 64M", __func__);
+               return (NULL);
+       }
+
+       info = calloc(1, sizeof(*info));
+       if (!info)
+               goto nomem;
+
+       info->buf = malloc(len);
+       if (info->buf == NULL)
+               goto nomem;
+       info->len = len;
+       info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
+       info->file = &dev->file;
+
+       if (read_mem(addr, info->buf, info->len)) {
+               vioblk_free_info(info);
+               return NULL;
+       }
+
+       return info;
+
+nomem:
+       free(info);
+       log_warn("malloc error vioblk write");
+       return (NULL);
+}
+
+static int
+vioblk_finish_write(struct ioinfo *info)
+{
+       struct virtio_backing *file;
+
+       file = info->file;
+       if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) {
+               log_warn("vioblk write error");
+               return EIO;
+       }
+       return 0;
+}
+
+/*
+ * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can
+ */
+int
+vioblk_notifyq(struct vioblk_dev *dev)
+{
+       uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx;
+       uint8_t ds;
+       int cnt;
+       off_t secbias;
+       char *vr;
+       struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc;
+       struct vring_avail *avail;
+       struct vring_used *used;
+       struct virtio_blk_req_hdr cmd;
+       struct virtio_vq_info *vq_info;
+
+       /* Invalid queue? */
+       if (dev->cfg.queue_notify > 0)
+               return (0);
+
+       vq_info = &dev->vq[dev->cfg.queue_notify];
+       vr = vq_info->q_hva;
+       if (vr == NULL)
+               fatalx("%s: null vring", __func__);
+
+       /* Compute offsets in ring of descriptors, avail ring, and used ring */
+       desc = (struct vring_desc *)(vr);
+       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
+       used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
+
+       idx = vq_info->last_avail & VIOBLK_QUEUE_MASK;
+
+       if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) {
+               log_debug("%s - nothing to do?", __func__);
+               return (0);
+       }
+
+       while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) {
+
+               ds = VIRTIO_BLK_S_IOERR;
+               cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK;
+               cmd_desc = &desc[cmd_desc_idx];
+
+               if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) {
+                       log_warnx("unchained vioblk cmd descriptor received "
+                           "(idx %d)", cmd_desc_idx);
+                       goto out;
+               }
+
+               /* Read command from descriptor ring */
+               if (cmd_desc->flags & VRING_DESC_F_WRITE) {
+                       log_warnx("vioblk: unexpected writable cmd descriptor "
+                           "%d", cmd_desc_idx);
+                       goto out;
+               }
+               if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) {
+                       log_warnx("vioblk: command read_mem error @ 0x%llx",
+                           cmd_desc->addr);
+                       goto out;
+               }
+
+               switch (cmd.type) {
+               case VIRTIO_BLK_T_IN:
+                       /* first descriptor */
+                       secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
+                       secdata_desc = &desc[secdata_desc_idx];
+
+                       if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
+                               log_warnx("unchained vioblk data descriptor "
+                                   "received (idx %d)", cmd_desc_idx);
+                               goto out;
+                       }
+
+                       cnt = 0;
+                       secbias = 0;
+                       do {
+                               struct ioinfo *info;
+                               const uint8_t *secdata;
+
+                               if ((secdata_desc->flags & VRING_DESC_F_WRITE)
+                                   == 0) {
+                                       log_warnx("vioblk: unwritable data "
+                                           "descriptor %d", secdata_desc_idx);
+                                       goto out;
+                               }
+
+                               info = vioblk_start_read(dev,
+                                   cmd.sector + secbias, secdata_desc->len);
+
+                               if (info == NULL) {
+                                       log_warnx("vioblk: can't start read");
+                                       goto out;
+                               }
+
+                               /* read the data, use current data descriptor */
+                               secdata = vioblk_finish_read(info);
+                               if (secdata == NULL) {
+                                       vioblk_free_info(info);
+                                       log_warnx("vioblk: block read error, "
+                                           "sector %lld", cmd.sector);
+                                       goto out;
+                               }
+
+                               if (write_mem(secdata_desc->addr, secdata,
+                                       secdata_desc->len)) {
+                                       log_warnx("can't write sector "
+                                           "data to gpa @ 0x%llx",
+                                           secdata_desc->addr);
+                                       vioblk_free_info(info);
+                                       goto out;
+                               }
+
+                               vioblk_free_info(info);
+
+                               secbias += (secdata_desc->len /
+                                   VIRTIO_BLK_SECTOR_SIZE);
+                               secdata_desc_idx = secdata_desc->next &
+                                   VIOBLK_QUEUE_MASK;
+                               secdata_desc = &desc[secdata_desc_idx];
+
+                               /* Guard against infinite chains */
+                               if (++cnt >= VIOBLK_QUEUE_SIZE) {
+                                       log_warnx("%s: descriptor table "
+                                           "invalid", __func__);
+                                       goto out;
+                               }
+                       } while (secdata_desc->flags & VRING_DESC_F_NEXT);
+
+                       ds_desc_idx = secdata_desc_idx;
+                       ds_desc = secdata_desc;
+
+                       ds = VIRTIO_BLK_S_OK;
+                       break;
+               case VIRTIO_BLK_T_OUT:
+                       secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
+                       secdata_desc = &desc[secdata_desc_idx];
+
+                       if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
+                               log_warnx("wr vioblk: unchained vioblk data "
+                                   "descriptor received (idx %d)",
+                                   cmd_desc_idx);
+                               goto out;
+                       }
+
+                       if (secdata_desc->len > dev->max_xfer) {
+                               log_warnx("%s: invalid read size %d requested",
+                                   __func__, secdata_desc->len);
+                               goto out;
+                       }
+
+                       cnt = 0;
+                       secbias = 0;
+                       do {
+                               struct ioinfo *info;
+
+                               if (secdata_desc->flags & VRING_DESC_F_WRITE) {
+                                       log_warnx("wr vioblk: unexpected "
+                                           "writable data descriptor %d",
+                                           secdata_desc_idx);
+                                       goto out;
+                               }
+
+                               info = vioblk_start_write(dev,
+                                   cmd.sector + secbias,
+                                   secdata_desc->addr, secdata_desc->len);
+
+                               if (info == NULL) {
+                                       log_warnx("wr vioblk: can't read "
+                                           "sector data @ 0x%llx",
+                                           secdata_desc->addr);
+                                       goto out;
+                               }
+
+                               if (vioblk_finish_write(info)) {
+                                       log_warnx("wr vioblk: disk write "
+                                           "error");
+                                       vioblk_free_info(info);
+                                       goto out;
+                               }
+
+                               vioblk_free_info(info);
+
+                               secbias += secdata_desc->len /
+                                   VIRTIO_BLK_SECTOR_SIZE;
+
+                               secdata_desc_idx = secdata_desc->next &
+                                   VIOBLK_QUEUE_MASK;
+                               secdata_desc = &desc[secdata_desc_idx];
+
+                               /* Guard against infinite chains */
+                               if (++cnt >= VIOBLK_QUEUE_SIZE) {
+                                       log_warnx("%s: descriptor table "
+                                           "invalid", __func__);
+                                       goto out;
+                               }
+                       } while (secdata_desc->flags & VRING_DESC_F_NEXT);
+
+                       ds_desc_idx = secdata_desc_idx;
+                       ds_desc = secdata_desc;
+
+                       ds = VIRTIO_BLK_S_OK;
+                       break;
+               case VIRTIO_BLK_T_FLUSH:
+               case VIRTIO_BLK_T_FLUSH_OUT:
+                       ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
+                       ds_desc = &desc[ds_desc_idx];
+
+                       ds = VIRTIO_BLK_S_UNSUPP;
+                       break;
+               case VIRTIO_BLK_T_GET_ID:
+                       secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
+                       secdata_desc = &desc[secdata_desc_idx];
+
+                       /*
+                        * We don't support this command yet. While it's not
+                        * officially part of the virtio spec (will be in v1.2)
+                        * there's no feature to negotiate. Linux drivers will
+                        * often send this command regardless.
+                        *
+                        * When the command is received, it should appear as a
+                        * chain of 3 descriptors, similar to the IN/OUT
+                        * commands. The middle descriptor should have a
+                        * length of VIRTIO_BLK_ID_BYTES bytes.
+                        */
+                       if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
+                               log_warnx("id vioblk: unchained vioblk data "
+                                   "descriptor received (idx %d)",
+                                   cmd_desc_idx);
+                               goto out;
+                       }
+
+                       /* Skip the data descriptor. */
+                       ds_desc_idx = secdata_desc->next & VIOBLK_QUEUE_MASK;
+                       ds_desc = &desc[ds_desc_idx];
+
+                       ds = VIRTIO_BLK_S_UNSUPP;
+                       break;
+               default:
+                       log_warnx("%s: unsupported command 0x%x", __func__,
+                           cmd.type);
+                       ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
+                       ds_desc = &desc[ds_desc_idx];
+
+                       ds = VIRTIO_BLK_S_UNSUPP;
+                       break;
+               }
+
+               if ((ds_desc->flags & VRING_DESC_F_WRITE) == 0) {
+                       log_warnx("%s: ds descriptor %d unwritable", __func__,
+                           ds_desc_idx);
+                       goto out;
+               }
+               if (write_mem(ds_desc->addr, &ds, sizeof(ds))) {
+                       log_warnx("%s: can't write device status data @ 0x%llx",
+                           __func__, ds_desc->addr);
+                       goto out;
+               }
+
+               dev->cfg.isr_status = 1;
+               used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx;
+               used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len;
+               __sync_synchronize();
+               used->idx++;
+
+               vq_info->last_avail = avail->idx & VIOBLK_QUEUE_MASK;
+               idx = (idx + 1) & VIOBLK_QUEUE_MASK;
+       }
+out:
+       return (1);
+}
+
+static void
+dev_dispatch_vm(int fd, short event, void *arg)
+{
+       struct virtio_dev       *dev = (struct virtio_dev *)arg;
+       struct imsgev           *iev = &dev->async_iev;
+       struct imsgbuf          *ibuf = &iev->ibuf;
+       struct imsg              imsg;
+       ssize_t                  n = 0;
+
+       if (event & EV_READ) {
+               if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
+                       fatal("%s: imsg_read", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_READ)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       if (event & EV_WRITE) {
+               if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
+                       fatal("%s: msgbuf_write", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_WRITE)", __func__);
+                       event_del(&iev->ev);
+                       event_loopbreak();
+                       return;
+               }
+       }
+
+       for (;;) {
+               if ((n = imsg_get(ibuf, &imsg)) == -1)
+                       fatal("%s: imsg_get", __func__);
+               if (n == 0)
+                       break;
+
+               switch (imsg.hdr.type) {
+               case IMSG_VMDOP_PAUSE_VM:
+                       log_debug("%s: pausing", __func__);
+                       break;
+               case IMSG_VMDOP_UNPAUSE_VM:
+                       log_debug("%s: unpausing", __func__);
+                       break;
+               default:
+                       log_warnx("%s: unhandled imsg type %d", __func__,
+                           imsg.hdr.type);
+                       break;
+               }
+               imsg_free(&imsg);
+       }
+       imsg_event_add(iev);
+}
+
+/*
+ * Synchronous IO handler.
+ *
+ */
+static void
+handle_sync_io(int fd, short event, void *arg)
+{
+       struct virtio_dev *dev = (struct virtio_dev *)arg;
+       struct imsgev *iev = &dev->sync_iev;
+       struct imsgbuf *ibuf = &iev->ibuf;
+       struct viodev_msg msg;
+       struct imsg imsg;
+       ssize_t n;
+
+       if (event & EV_READ) {
+               if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
+                       fatal("%s: imsg_read", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: vioblk pipe dead (EV_READ)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       if (event & EV_WRITE) {
+               if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
+                       fatal("%s: msgbuf_write", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: vioblk pipe dead (EV_WRITE)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       for (;;) {
+               if ((n = imsg_get(ibuf, &imsg)) == -1)
+                       fatalx("%s: imsg_get (n=%ld)", __func__, n);
+               if (n == 0)
+                       break;
+
+               /* Unpack our message. They should all be viodev messages. */
+               IMSG_SIZE_CHECK(&imsg, &msg);
+               memcpy(&msg, imsg.data, sizeof(msg));
+               imsg_free(&imsg);
+
+               switch (msg.type) {
+               case VIODEV_MSG_DUMP:
+                       /* Dump device */
+                       n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
+                       if (n != sizeof(*dev)) {
+                               log_warnx("%s: failed to dump vioblk device",
+                                   __func__);
+                               break;
+                       }
+                       break;
+               case VIODEV_MSG_IO_READ:
+                       /* Read IO: make sure to send a reply */
+                       msg.data = handle_io_read(&msg, dev);
+                       msg.data_valid = 1;
+                       imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+                           sizeof(msg));
+                       break;
+               case VIODEV_MSG_IO_WRITE:
+                       /* Write IO: no reply needed */
+                       if (handle_io_write(&msg, dev) == 1)
+                               virtio_assert_pic_irq(dev, 0);
+                       break;
+               case VIODEV_MSG_SHUTDOWN:
+                       event_del(&dev->sync_iev.ev);
+                       event_loopbreak();
+                       return;
+               default:
+                       fatalx("%s: invalid msg type %d", __func__, msg.type);
+               }
+       }
+       imsg_event_add(iev);
+}
+
+static int
+handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
+{
+       struct vioblk_dev *vioblk = &dev->vioblk;
+       uint32_t data = msg->data;
+       int intr = 0;
+
+       switch (msg->reg) {
+       case VIRTIO_CONFIG_DEVICE_FEATURES:
+       case VIRTIO_CONFIG_QUEUE_SIZE:
+       case VIRTIO_CONFIG_ISR_STATUS:
+               log_warnx("%s: illegal write %x to %s", __progname, data,
+                   virtio_reg_name(msg->reg));
+               break;
+       case VIRTIO_CONFIG_GUEST_FEATURES:
+               vioblk->cfg.guest_feature = data;
+               break;
+       case VIRTIO_CONFIG_QUEUE_PFN:
+               vioblk->cfg.queue_pfn = data;
+               vioblk_update_qa(vioblk);
+               break;
+       case VIRTIO_CONFIG_QUEUE_SELECT:
+               vioblk->cfg.queue_select = data;
+               vioblk_update_qs(vioblk);
+               break;
+       case VIRTIO_CONFIG_QUEUE_NOTIFY:
+               vioblk->cfg.queue_notify = data;
+               if (vioblk_notifyq(vioblk))
+                       intr = 1;
+               break;
+       case VIRTIO_CONFIG_DEVICE_STATUS:
+               vioblk->cfg.device_status = data;
+               if (vioblk->cfg.device_status == 0) {
+                       vioblk->cfg.guest_feature = 0;
+                       vioblk->cfg.queue_pfn = 0;
+                       vioblk_update_qa(vioblk);
+                       vioblk->cfg.queue_size = 0;
+                       vioblk_update_qs(vioblk);
+                       vioblk->cfg.queue_select = 0;
+                       vioblk->cfg.queue_notify = 0;
+                       vioblk->cfg.isr_status = 0;
+                       vioblk->vq[0].last_avail = 0;
+                       vioblk->vq[0].notified_avail = 0;
+                       virtio_deassert_pic_irq(dev, msg->vcpu);
+               }
+               break;
+       default:
+               break;
+       }
+       return (intr);
+}
+
+static uint32_t
+handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev)
+{
+       struct vioblk_dev *vioblk = &dev->vioblk;
+       uint8_t sz = msg->io_sz;
+       uint32_t data;
+
+       if (msg->data_valid)
+               data = msg->data;
+       else
+               data = 0;
+
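+       /*
+        * The device config space (the 64-bit disk size followed by the
+        * maximum transfer size) is decoded per byte offset below, since
+        * the guest may read it 1, 2, or 4 bytes at a time.
+        */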
+       switch (msg->reg) {
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
+               switch (sz) {
+               case 4:
+                       data = (uint32_t)(vioblk->sz);
+                       break;
+               case 2:
+                       data &= 0xFFFF0000;
+                       data |= (uint32_t)(vioblk->sz) & 0xFFFF;
+                       break;
+               case 1:
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz) & 0xFF;
+                       break;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz >> 8) & 0xFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz >> 16) & 0xFF;
+               } else if (sz == 2) {
+                       data &= 0xFFFF0000;
+                       data |= (uint32_t)(vioblk->sz >> 16) & 0xFFFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz >> 24) & 0xFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
+               switch (sz) {
+               case 4:
+                       data = (uint32_t)(vioblk->sz >> 32);
+                       break;
+               case 2:
+                       data &= 0xFFFF0000;
+                       data |= (uint32_t)(vioblk->sz >> 32) & 0xFFFF;
+                       break;
+               case 1:
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz >> 32) & 0xFF;
+                       break;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz >> 40) & 0xFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz >> 48) & 0xFF;
+               } else if (sz == 2) {
+                       data &= 0xFFFF0000;
+                       data |= (uint32_t)(vioblk->sz >> 48) & 0xFFFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->sz >> 56) & 0xFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
+               switch (sz) {
+               case 4:
+                       data = (uint32_t)(vioblk->max_xfer);
+                       break;
+               case 2:
+                       data &= 0xFFFF0000;
+                       data |= (uint32_t)(vioblk->max_xfer) & 0xFFFF;
+                       break;
+               case 1:
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->max_xfer) & 0xFF;
+                       break;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->max_xfer >> 8) & 0xFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->max_xfer >> 16) & 0xFF;
+               } else if (sz == 2) {
+                       data &= 0xFFFF0000;
+                       data |= (uint32_t)(vioblk->max_xfer >> 16)
+                           & 0xFFFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
+               if (sz == 1) {
+                       data &= 0xFFFFFF00;
+                       data |= (uint32_t)(vioblk->max_xfer >> 24) & 0xFF;
+               }
+               /* XXX handle invalid sz */
+               break;
+       case VIRTIO_CONFIG_DEVICE_FEATURES:
+               data = vioblk->cfg.device_feature;
+               break;
+       case VIRTIO_CONFIG_GUEST_FEATURES:
+               data = vioblk->cfg.guest_feature;
+               break;
+       case VIRTIO_CONFIG_QUEUE_PFN:
+               data = vioblk->cfg.queue_pfn;
+               break;
+       case VIRTIO_CONFIG_QUEUE_SIZE:
+               data = vioblk->cfg.queue_size;
+               break;
+       case VIRTIO_CONFIG_QUEUE_SELECT:
+               data = vioblk->cfg.queue_select;
+               break;
+       case VIRTIO_CONFIG_QUEUE_NOTIFY:
+               data = vioblk->cfg.queue_notify;
+               break;
+       case VIRTIO_CONFIG_DEVICE_STATUS:
+               data = vioblk->cfg.device_status;
+               break;
+       case VIRTIO_CONFIG_ISR_STATUS:
+               data = vioblk->cfg.isr_status;
+               vioblk->cfg.isr_status = 0;
+               virtio_deassert_pic_irq(dev, 0);
+               break;
+       default:
+               return (0xFFFFFFFF);
+       }
+
+       return (data);
+}
diff --git a/usr.sbin/vmd/vionet.c b/usr.sbin/vmd/vionet.c
new file mode 100644
index 0000000..77ba469
--- /dev/null
@@ -0,0 +1,929 @@
+/*     $OpenBSD: vionet.c,v 1.1 2023/04/27 22:47:27 dv Exp $   */
+
+/*
+ * Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
+ * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <sys/mman.h>
+#include <sys/param.h> /* PAGE_SIZE */
+#include <sys/socket.h>
+
+#include <dev/pci/virtio_pcireg.h>
+#include <dev/pv/virtioreg.h>
+
+#include <machine/vmmvar.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+
+#include <errno.h>
+#include <event.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "atomicio.h"
+#include "pci.h"
+#include "virtio.h"
+#include "vmd.h"
+
+#define VIRTIO_NET_F_MAC       (1 << 5)
+#define RXQ    0
+#define TXQ    1
+
+extern char *__progname;
+extern struct vmd_vm *current_vm;
+
+/* Device Globals */
+struct event ev_tap;
+
+static int vionet_rx(struct vionet_dev *);
+static void vionet_rx_event(int, short, void *);
+static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *);
+static int handle_io_write(struct viodev_msg *, struct virtio_dev *);
+void vionet_notify_rx(struct virtio_dev *);
+int vionet_notifyq(struct virtio_dev *);
+
+static void dev_dispatch_vm(int, short, void *);
+static void handle_sync_io(int, short, void *);
+
+__dead void
+vionet_main(int fd)
+{
+       struct virtio_dev        dev;
+       struct vionet_dev       *vionet = NULL;
+       struct viodev_msg        msg;
+       struct vmd_vm            vm;
+       struct vm_create_params *vcp;
+       ssize_t                  sz;
+       int                      ret;
+
+       log_procinit("vionet");
+
+       /* stdio - needed for read/write to tap fd and channels to the vm. */
+       if (pledge("stdio", NULL) == -1)
+               fatal("pledge");
+
+       /* Receive our vionet_dev, mostly preconfigured. */
+       sz = atomicio(read, fd, &dev, sizeof(dev));
+       if (sz != sizeof(dev)) {
+               ret = errno;
+               log_warn("failed to receive vionet");
+               goto fail;
+       }
+       if (dev.dev_type != VMD_DEVTYPE_NET) {
+               ret = EINVAL;
+               log_warn("received invalid device type");
+               goto fail;
+       }
+       dev.sync_fd = fd;
+       vionet = &dev.vionet;
+
+       log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d",
+           __func__, vionet->data_fd, dev.sync_fd, dev.async_fd);
+
+       /* Receive our vm information from the vm process. */
+       memset(&vm, 0, sizeof(vm));
+       sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
+       if (sz != sizeof(vm)) {
+               ret = EIO;
+               log_warnx("failed to receive vm details");
+               goto fail;
+       }
+       vcp = &vm.vm_params.vmc_params;
+       current_vm = &vm;
+       setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx);
+
+       /* Now that we have our vm information, we can remap memory. */
+       ret = remap_guest_mem(&vm);
+       if (ret)
+               goto fail;
+
+       /* If we're restoring hardware, re-initialize virtqueue hva's. */
+       if (vm.vm_state & VM_STATE_RECEIVED) {
+               struct virtio_vq_info *vq_info;
+               void *hva = NULL;
+
+               vq_info = &dev.vionet.vq[TXQ];
+               if (vq_info->q_gpa != 0) {
+                       log_debug("%s: restoring TX virtqueue for gpa 0x%llx",
+                           __func__, vq_info->q_gpa);
+                       hva = hvaddr_mem(vq_info->q_gpa,
+                           vring_size(VIONET_QUEUE_SIZE));
+                       if (hva == NULL)
+                               fatalx("%s: hva == NULL", __func__);
+                       vq_info->q_hva = hva;
+               }
+
+               vq_info = &dev.vionet.vq[RXQ];
+               if (vq_info->q_gpa != 0) {
+                       log_debug("%s: restoring RX virtqueue for gpa 0x%llx",
+                           __func__, vq_info->q_gpa);
+                       hva = hvaddr_mem(vq_info->q_gpa,
+                           vring_size(VIONET_QUEUE_SIZE));
+                       if (hva == NULL)
+                               fatalx("%s: hva == NULL", __func__);
+                       vq_info->q_hva = hva;
+               }
+       }
+
+       /* Initialize libevent so we can start wiring event handlers. */
+       event_init();
+
+       /* Wire up an async imsg channel. */
+       log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
+               dev.async_fd);
+       if (vm_device_pipe(&dev, dev_dispatch_vm)) {
+               ret = EIO;
+               log_warnx("vm_device_pipe");
+               goto fail;
+       }
+
+       /* Wire up event handling for the tap fd. */
+       log_debug("%s: wiring in tap fd handler (fd=%d)", __func__,
+           vionet->data_fd);
+       event_set(&ev_tap, vionet->data_fd, EV_READ | EV_PERSIST,
+           vionet_rx_event, &dev);
+
+       /* Configure our sync channel event handler. */
+       log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
+               dev.sync_fd);
+       if (fcntl(dev.sync_fd, F_SETFL, O_NONBLOCK) == -1) {
+               ret = errno;
+               log_warn("%s: fcntl", __func__);
+               goto fail;
+       }
+       imsg_init(&dev.sync_iev.ibuf, dev.sync_fd);
+       dev.sync_iev.handler = handle_sync_io;
+       dev.sync_iev.data = &dev;
+       dev.sync_iev.events = EV_READ;
+       imsg_event_add(&dev.sync_iev);
+
+       /* Send a ready message over the sync channel. */
+       log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
+       memset(&msg, 0, sizeof(msg));
+       msg.type = VIODEV_MSG_READY;
+       imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+           sizeof(msg));
+
+       /* Send a ready message over the async channel. */
+       log_debug("%s: sending async ready message", __func__);
+       ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
+           &msg, sizeof(msg));
+       if (ret == -1) {
+               log_warnx("%s: failed to send async ready message!", __func__);
+               goto fail;
+       }
+
+       /* Engage the event loop! */
+       ret = event_dispatch();
+
+       /* Cleanup */
+       if (ret == 0) {
+               close_fd(dev.sync_fd);
+               close_fd(dev.async_fd);
+               close_fd(vionet->data_fd);
+               _exit(ret);
+               /* NOTREACHED */
+       }
+fail:
+       /* Try firing off a message to the vm saying we're dying. */
+       memset(&msg, 0, sizeof(msg));
+       msg.type = VIODEV_MSG_ERROR;
+       msg.data = ret;
+       imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+           sizeof(msg));
+       imsg_flush(&dev.sync_iev.ibuf);
+
+       close_fd(dev.sync_fd);
+       close_fd(dev.async_fd);
+       if (vionet != NULL)
+               close_fd(vionet->data_fd);
+
+       _exit(ret);
+}
+
+/*
+ * Update the gpa and hva of the virtqueue.
+ */
+void
+vionet_update_qa(struct vionet_dev *dev)
+{
+       struct virtio_vq_info *vq_info;
+       void *hva = NULL;
+
+       /* Invalid queue? */
+       if (dev->cfg.queue_select > 1)
+               return;
+
+       vq_info = &dev->vq[dev->cfg.queue_select];
+       vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
+       dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
+
+       if (vq_info->q_gpa == 0) {
+               vq_info->q_hva = NULL;
+               return;
+       }
+
+       hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE));
+       if (hva == NULL)
+               fatalx("%s: hva == NULL", __func__);
+
+       vq_info->q_hva = hva;
+}
+
+/*
+ * Update the queue size.
+ */
+void
+vionet_update_qs(struct vionet_dev *dev)
+{
+       struct virtio_vq_info *vq_info;
+
+       /* Invalid queue? */
+       if (dev->cfg.queue_select > 1) {
+               log_warnx("%s: !!! invalid queue selector %d", __func__,
+                   dev->cfg.queue_select);
+               dev->cfg.queue_size = 0;
+               return;
+       }
+
+       vq_info = &dev->vq[dev->cfg.queue_select];
+
+       /* Update queue pfn/size based on queue select */
+       dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
+       dev->cfg.queue_size = vq_info->qs;
+}
+
+/*
+ * vionet_enq_rx
+ *
+ * Take a given packet from the host-side tap and copy it into the guest's
+ * buffers utilizing the rx virtio ring. If the packet length is invalid
+ * (too small or too large) or if there are not enough buffers available,
+ * the packet is dropped.
+ */
+int
+vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc)
+{
+       uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx;
+       char *vr = NULL;
+       size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0;
+       size_t chain_len = 0;
+       struct vring_desc *desc, *pkt_desc, *hdr_desc;
+       struct vring_avail *avail;
+       struct vring_used *used;
+       struct virtio_vq_info *vq_info;
+       struct virtio_net_hdr hdr;
+       size_t hdr_sz;
+
+       if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
+               log_warnx("%s: invalid packet size", __func__);
+               return (0);
+       }
+
+       hdr_sz = sizeof(hdr);
+
+       if (!(dev->cfg.device_status
+           & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) {
+               log_warnx("%s: driver not ready", __func__);
+               return (0);
+       }
+
+       vq_info = &dev->vq[RXQ];
+       vr = vq_info->q_hva;
+       if (vr == NULL)
+               fatalx("%s: vr == NULL", __func__);
+
+       /* Compute offsets in ring of descriptors, avail ring, and used ring */
+       desc = (struct vring_desc *)(vr);
+       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
+       used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
+
+       idx = vq_info->last_avail & VIONET_QUEUE_MASK;
+       if ((vq_info->notified_avail & VIONET_QUEUE_MASK) == idx) {
+               log_debug("%s: insufficient available buffer capacity, "
+                   "dropping packet.", __func__);
+               return (0);
+       }
+
+       hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
+       hdr_desc = &desc[hdr_desc_idx];
+
+       dxx = hdr_desc_idx;
+       chain_hdr_idx = dxx;
+       chain_len = 0;
+
+       /* Process the descriptor and walk any potential chain. */
+       do {
+               off = 0;
+               pkt_desc = &desc[dxx];
+               if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) {
+                       log_warnx("%s: invalid descriptor, not writable",
+                           __func__);
+                       return (0);
+               }
+
+               /* How much data do we get to write? */
+               if (sz - bufsz > pkt_desc->len)
+                       chunk_size = pkt_desc->len;
+               else
+                       chunk_size = sz - bufsz;
+
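+               /*
+                * The first buffer of the chain must also hold the
+                * virtio_net_hdr (prepended after this loop), so the
+                * packet data is offset past it and the first chunk
+                * shrinks accordingly.
+                */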
+               if (chain_len == 0) {
+                       off = hdr_sz;
+                       if (chunk_size == pkt_desc->len)
+                               chunk_size -= off;
+               }
+
+               /* Write a chunk of data if we need to */
+               if (chunk_size && write_mem(pkt_desc->addr + off,
+                       pkt + pkt_offset, chunk_size)) {
+                       log_warnx("%s: failed to write to buffer 0x%llx",
+                           __func__, pkt_desc->addr);
+                       return (0);
+               }
+
+               chain_len += chunk_size + off;
+               bufsz += chunk_size;
+               pkt_offset += chunk_size;
+
+               dxx = pkt_desc->next & VIONET_QUEUE_MASK;
+       } while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT);
+
+       /* Move our marker in the ring...*/
+       vq_info->last_avail = (vq_info->last_avail + 1) &
+           VIONET_QUEUE_MASK;
+
+       /* Prepend the virtio net header in the first buffer. */
+       memset(&hdr, 0, sizeof(hdr));
+       hdr.hdr_len = hdr_sz;
+       if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) {
+           log_warnx("vionet: rx enq header write_mem error @ 0x%llx",
+               hdr_desc->addr);
+           return (0);
+       }
+
+       /* Update the index field in the used ring. This must be done last. */
+       dev->cfg.isr_status = 1;
+       *spc = (vq_info->notified_avail - vq_info->last_avail)
+           & VIONET_QUEUE_MASK;
+
+       /* Update the list of used buffers. */
+       used->ring[used->idx & VIONET_QUEUE_MASK].id = chain_hdr_idx;
+       used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len;
+       __sync_synchronize();
+       used->idx++;
+
+       return (1);
+}
+
+/*
+ * vionet_rx
+ *
+ * Enqueue data that was received on a tap file descriptor
+ * to the vionet device queue.
+ */
+static int
+vionet_rx(struct vionet_dev *dev)
+{
+       char buf[PAGE_SIZE];
+       int num_enq = 0, spc = 0;
+       struct ether_header *eh;
+       ssize_t sz;
+
+       do {
+               sz = read(dev->data_fd, buf, sizeof(buf));
+               if (sz == -1) {
+                       /*
+                        * If we get EAGAIN, no data is currently available.
+                        * Do not treat this as an error.
+                        */
+                       if (errno != EAGAIN)
+                               log_warn("%s: read error", __func__);
+               } else if (sz > 0) {
+                       eh = (struct ether_header *)buf;
+                       if (!dev->lockedmac ||
+                           ETHER_IS_MULTICAST(eh->ether_dhost) ||
+                           memcmp(eh->ether_dhost, dev->mac,
+                           sizeof(eh->ether_dhost)) == 0)
+                               num_enq += vionet_enq_rx(dev, buf, sz, &spc);
+               } else if (sz == 0) {
+                       log_debug("%s: no data", __func__);
+                       break;
+               }
+       } while (spc > 0 && sz > 0);
+
+       return (num_enq);
+}
+
+/*
+ * vionet_rx_event
+ *
+ * Called when new data can be received on the tap fd of a vionet device.
+ */
+static void
+vionet_rx_event(int fd, short kind, void *arg)
+{
+       struct virtio_dev *dev = (struct virtio_dev *)arg;
+
+       if (vionet_rx(&dev->vionet) > 0)
+               virtio_assert_pic_irq(dev, 0);
+}
+
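+/*
+ * vionet_notify_rx
+ *
+ * Driver notification on the rx queue: record how many buffers the
+ * driver has made available so the tap read path can fill them.
+ */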
+void
+vionet_notify_rx(struct virtio_dev *dev)
+{
+       struct vionet_dev *vionet = &dev->vionet;
+       struct vring_avail *avail;
+       struct virtio_vq_info *vq_info;
+       char *vr;
+
+       vq_info = &vionet->vq[RXQ];
+       vr = vq_info->q_hva;
+       if (vr == NULL)
+               fatalx("%s: vr == NULL", __func__);
+
+       /* Compute offset into avail ring */
+       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
+       vq_info->notified_avail = avail->idx - 1;
+}
+
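+/*
+ * vionet_notifyq
+ *
+ * Dispatch a queue notification from the driver to the matching queue
+ * handler. Returns non-zero if the caller should assert an interrupt.
+ */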
+int
+vionet_notifyq(struct virtio_dev *dev)
+{
+       struct vionet_dev *vionet = &dev->vionet;
+       int ret = 0;
+
+       switch (vionet->cfg.queue_notify) {
+       case RXQ:
+               vionet_notify_rx(dev);
+               break;
+       case TXQ:
+               ret = vionet_notify_tx(dev);
+               break;
+       default:
+               /*
+                * Catch the unimplemented queue ID 2 (control queue) as
+                * well as any bogus queue IDs.
+                */
+               log_debug("%s: notify for unimplemented queue ID %d",
+                   __func__, vionet->cfg.queue_notify);
+               break;
+       }
+
+       return (ret);
+}
+
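+/*
+ * vionet_notify_tx
+ *
+ * Driver notification on the tx queue: walk each available descriptor
+ * chain, copy the packet out of guest memory, and hand it to the tap
+ * fd (or answer dhcp requests locally when in local mode).
+ */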
+int
+vionet_notify_tx(struct virtio_dev *dev)
+{
+       uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt;
+       size_t pktsz, chunk_size = 0;
+       ssize_t dhcpsz = 0;
+       int num_enq, ofs, spc = 0;
+       char *vr = NULL, *pkt = NULL, *dhcppkt = NULL;
+       struct vionet_dev *vionet = &dev->vionet;
+       struct vring_desc *desc, *pkt_desc, *hdr_desc;
+       struct vring_avail *avail;
+       struct vring_used *used;
+       struct virtio_vq_info *vq_info;
+       struct ether_header *eh;
+
+       if (!(vionet->cfg.device_status
+           & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) {
+               log_warnx("%s: driver not ready", __func__);
+               return (0);
+       }
+
+       vq_info = &vionet->vq[TXQ];
+       vr = vq_info->q_hva;
+       if (vr == NULL)
+               fatalx("%s: vr == NULL", __func__);
+
+       /* Compute offsets in ring of descriptors, avail ring, and used ring */
+       desc = (struct vring_desc *)(vr);
+       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
+       used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
+
+       num_enq = 0;
+
+       idx = vq_info->last_avail & VIONET_QUEUE_MASK;
+
+       if ((avail->idx & VIONET_QUEUE_MASK) == idx)
+               return (0);
+
+       while ((avail->idx & VIONET_QUEUE_MASK) != idx) {
+               hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
+               hdr_desc = &desc[hdr_desc_idx];
+               pktsz = 0;
+
+               cnt = 0;
+               dxx = hdr_desc_idx;
+               do {
+                       pktsz += desc[dxx].len;
+                       dxx = desc[dxx].next & VIONET_QUEUE_MASK;
+
+                       /*
+                        * Virtio 1.0, cs04, section 2.4.5:
+                        *  "The number of descriptors in the table is defined
+                        *   by the queue size for this virtqueue: this is the
+                        *   maximum possible descriptor chain length."
+                        */
+                       if (++cnt >= VIONET_QUEUE_SIZE) {
+                               log_warnx("%s: descriptor table invalid",
+                                   __func__);
+                               goto out;
+                       }
+               } while (desc[dxx].flags & VRING_DESC_F_NEXT);
+
+               pktsz += desc[dxx].len;
+
+               /* Remove virtio header descriptor len */
+               pktsz -= hdr_desc->len;
+
+               /* Drop packets violating device MTU-based limits */
+               if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) {
+                       log_warnx("%s: invalid packet size %zu", __func__,
+                           pktsz);
+                       goto drop_packet;
+               }
+               pkt = malloc(pktsz);
+               if (pkt == NULL) {
+                       log_warn("%s: error allocating packet buffer",
+                           __func__);
+                       goto out;
+               }
+
+               ofs = 0;
+               pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
+               pkt_desc = &desc[pkt_desc_idx];
+
+               while (pkt_desc->flags & VRING_DESC_F_NEXT) {
+                       /* must be not writable */
+                       if (pkt_desc->flags & VRING_DESC_F_WRITE) {
+                               log_warnx("unexpected writable tx desc "
+                                   "%d", pkt_desc_idx);
+                               goto out;
+                       }
+
+                       /* Check we don't read beyond allocated pktsz */
+                       if (pkt_desc->len > pktsz - ofs) {
+                               log_warnx("%s: descriptor len past pkt len",
+                                   __func__);
+                               chunk_size = pktsz - ofs;
+                       } else
+                               chunk_size = pkt_desc->len;
+
+                       /* Read packet from descriptor ring */
+                       if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
+                               log_warnx("vionet: packet read_mem error "
+                                   "@ 0x%llx", pkt_desc->addr);
+                               goto out;
+                       }
+
+                       ofs += pkt_desc->len;
+                       pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK;
+                       pkt_desc = &desc[pkt_desc_idx];
+               }
+
+               /* Now handle tail descriptor - must be not writable */
+               if (pkt_desc->flags & VRING_DESC_F_WRITE) {
+                       log_warnx("unexpected writable tx descriptor %d",
+                           pkt_desc_idx);
+                       goto out;
+               }
+
+               /* Check we don't read beyond allocated pktsz */
+               if (pkt_desc->len > pktsz - ofs) {
+                       log_warnx("%s: descriptor len past pkt len", __func__);
+                       chunk_size = pktsz - ofs;
+               } else
+                       chunk_size = pkt_desc->len;
+
+               /* Read packet from descriptor ring */
+               if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
+                       log_warnx("vionet: packet read_mem error @ "
+                           "0x%llx", pkt_desc->addr);
+                       goto out;
+               }
+
+               /* reject other source addresses */
+               if (vionet->lockedmac && pktsz >= ETHER_HDR_LEN &&
+                   (eh = (struct ether_header *)pkt) &&
+                   memcmp(eh->ether_shost, vionet->mac,
+                   sizeof(eh->ether_shost)) != 0)
+                       log_debug("vionet: wrong source address %s for vm %d",
+                           ether_ntoa((struct ether_addr *)
+                           eh->ether_shost), dev->vm_id);
+               else if (vionet->local &&
+                   (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) {
+                       log_debug("vionet: dhcp request,"
+                           " local response size %zd", dhcpsz);
+
+               /* XXX signed vs unsigned here, funky cast */
+               } else if (write(vionet->data_fd, pkt, pktsz) != (int)pktsz) {
+                       log_warn("vionet: tx failed writing to tap");
+                       goto out;
+               }
+
+       drop_packet:
+               vionet->cfg.isr_status = 1;
+               used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx;
+               used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len;
+               __sync_synchronize();
+               used->idx++;
+
+               vq_info->last_avail = avail->idx & VIONET_QUEUE_MASK;
+               idx = (idx + 1) & VIONET_QUEUE_MASK;
+
+               num_enq++;
+
+               free(pkt);
+               pkt = NULL;
+       }
+
+       if (dhcpsz > 0)
+               vionet_enq_rx(vionet, dhcppkt, dhcpsz, &spc);
+
+out:
+       free(pkt);
+       free(dhcppkt);
+
+       return (1);
+}
+
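+/*
+ * dev_dispatch_vm
+ *
+ * Handle asynchronous imsgs from the vm process, e.g. host mac
+ * updates and pause/unpause requests.
+ */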
+static void
+dev_dispatch_vm(int fd, short event, void *arg)
+{
+       struct virtio_dev       *dev = arg;
+       struct vionet_dev       *vionet;
+       struct imsgev           *iev;
+       struct imsgbuf          *ibuf;
+       struct imsg              imsg;
+       ssize_t                  n = 0;
+
+       if (dev == NULL)
+               fatalx("%s: missing vionet pointer", __func__);
+
+       vionet = &dev->vionet;
+       iev = &dev->async_iev;
+       ibuf = &iev->ibuf;
+
+       if (event & EV_READ) {
+               if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
+                       fatal("%s: imsg_read", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_READ)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       if (event & EV_WRITE) {
+               if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
+                       fatal("%s: msgbuf_write", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_WRITE)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       for (;;) {
+               if ((n = imsg_get(ibuf, &imsg)) == -1)
+                       fatal("%s: imsg_get", __func__);
+               if (n == 0)
+                       break;
+
+               switch (imsg.hdr.type) {
+               case IMSG_DEVOP_HOSTMAC:
+                       IMSG_SIZE_CHECK(&imsg, vionet->hostmac);
+                       memcpy(vionet->hostmac, imsg.data,
+                           sizeof(vionet->hostmac));
+                       log_debug("%s: set hostmac", __func__);
+                       break;
+               case IMSG_VMDOP_PAUSE_VM:
+                       log_debug("%s: pausing", __func__);
+                       event_del(&ev_tap);
+                       break;
+               case IMSG_VMDOP_UNPAUSE_VM:
+                       log_debug("%s: unpausing", __func__);
+                       if (vionet->cfg.device_status
+                           & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)
+                               event_add(&ev_tap, NULL);
+                       break;
+               }
+               imsg_free(&imsg);
+       }
+       imsg_event_add(iev);
+}
+
+/*
+ * handle_sync_io
+ *
+ * Synchronous I/O handler: services virtio register reads and writes
+ * relayed from a vcpu over the device's sync imsg channel.
+ */
+static void
+handle_sync_io(int fd, short event, void *arg)
+{
+       struct virtio_dev *dev = (struct virtio_dev *)arg;
+       struct imsgev *iev = &dev->sync_iev;
+       struct imsgbuf *ibuf = &iev->ibuf;
+       struct viodev_msg msg;
+       struct imsg imsg;
+       ssize_t n;
+
+       if (event & EV_READ) {
+               if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
+                       fatal("%s: imsg_read", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_READ)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       if (event & EV_WRITE) {
+               if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
+                       fatal("%s: msgbuf_write", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_WRITE)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       for (;;) {
+               if ((n = imsg_get(ibuf, &imsg)) == -1)
+                       fatalx("%s: imsg_get (n=%zd)", __func__, n);
+               if (n == 0)
+                       break;
+
+               /* Unpack our message. They ALL should be dev messages! */
+               IMSG_SIZE_CHECK(&imsg, &msg);
+               memcpy(&msg, imsg.data, sizeof(msg));
+               imsg_free(&imsg);
+
+               switch (msg.type) {
+               case VIODEV_MSG_DUMP:
+                       /* Dump device */
+                       n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
+                       if (n != sizeof(*dev))
+                               log_warnx("%s: failed to dump vionet device",
+                                   __func__);
+                       break;
+               case VIODEV_MSG_IO_READ:
+                       /* Read IO: make sure to send a reply */
+                       msg.data = handle_io_read(&msg, dev);
+                       msg.data_valid = 1;
+                       imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+                           sizeof(msg));
+                       break;
+               case VIODEV_MSG_IO_WRITE:
+                       /* Write IO: no reply needed */
+                       if (handle_io_write(&msg, dev) == 1)
+                               virtio_assert_pic_irq(dev, 0);
+                       break;
+               case VIODEV_MSG_SHUTDOWN:
+                       event_del(&dev->sync_iev.ev);
+                       event_del(&ev_tap);
+                       event_loopbreak();
+                       return;
+               default:
+                       fatalx("%s: invalid msg type %d", __func__, msg.type);
+               }
+       }
+       imsg_event_add(iev);
+}
+
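+/*
+ * handle_io_write
+ *
+ * Emulate a write to one of the virtio pci config registers. Returns
+ * 1 if the caller should assert an irq, 0 otherwise.
+ */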
+static int
+handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
+{
+       struct vionet_dev *vionet = &dev->vionet;
+       uint32_t data = msg->data;
+       int intr = 0;
+
+       switch (msg->reg) {
+       case VIRTIO_CONFIG_DEVICE_FEATURES:
+       case VIRTIO_CONFIG_QUEUE_SIZE:
+       case VIRTIO_CONFIG_ISR_STATUS:
+               log_warnx("%s: illegal write %x to %s", __progname, data,
+                   virtio_reg_name(msg->reg));
+               break;
+       case VIRTIO_CONFIG_GUEST_FEATURES:
+               vionet->cfg.guest_feature = data;
+               break;
+       case VIRTIO_CONFIG_QUEUE_PFN:
+               vionet->cfg.queue_pfn = data;
+               vionet_update_qa(vionet);
+               break;
+       case VIRTIO_CONFIG_QUEUE_SELECT:
+               vionet->cfg.queue_select = data;
+               vionet_update_qs(vionet);
+               break;
+       case VIRTIO_CONFIG_QUEUE_NOTIFY:
+               vionet->cfg.queue_notify = data;
+               if (vionet_notifyq(dev))
+                       intr = 1;
+               break;
+       case VIRTIO_CONFIG_DEVICE_STATUS:
+               vionet->cfg.device_status = data;
+               if (vionet->cfg.device_status == 0) {
+                       vionet->cfg.guest_feature = 0;
+
+                       vionet->cfg.queue_pfn = 0;
+                       vionet_update_qa(vionet);
+
+                       vionet->cfg.queue_size = 0;
+                       vionet_update_qs(vionet);
+
+                       vionet->cfg.queue_select = 0;
+                       vionet->cfg.queue_notify = 0;
+                       vionet->cfg.isr_status = 0;
+                       vionet->vq[RXQ].last_avail = 0;
+                       vionet->vq[RXQ].notified_avail = 0;
+                       vionet->vq[TXQ].last_avail = 0;
+                       vionet->vq[TXQ].notified_avail = 0;
+                       virtio_deassert_pic_irq(dev, msg->vcpu);
+               }
+               event_del(&ev_tap);
+               if (vionet->cfg.device_status
+                   & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
+                       if (event_add(&ev_tap, NULL))
+                               log_warn("%s: could not initialize virtio tap "
+                                   "event handler", __func__);
+               }
+               break;
+       default:
+               break;
+       }
+       return (intr);
+}
+
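+/*
+ * handle_io_read
+ *
+ * Emulate a read of one of the virtio pci config registers. Reading
+ * the isr status register clears it and deasserts the irq.
+ */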
+static uint32_t
+handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev)
+{
+       struct vionet_dev *vionet = &dev->vionet;
+       uint32_t data;
+
+       switch (msg->reg) {
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
+               data = vionet->mac[msg->reg -
+                   VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
+               break;
+       case VIRTIO_CONFIG_DEVICE_FEATURES:
+               data = vionet->cfg.device_feature;
+               break;
+       case VIRTIO_CONFIG_GUEST_FEATURES:
+               data = vionet->cfg.guest_feature;
+               break;
+       case VIRTIO_CONFIG_QUEUE_PFN:
+               data = vionet->cfg.queue_pfn;
+               break;
+       case VIRTIO_CONFIG_QUEUE_SIZE:
+               data = vionet->cfg.queue_size;
+               break;
+       case VIRTIO_CONFIG_QUEUE_SELECT:
+               data = vionet->cfg.queue_select;
+               break;
+       case VIRTIO_CONFIG_QUEUE_NOTIFY:
+               data = vionet->cfg.queue_notify;
+               break;
+       case VIRTIO_CONFIG_DEVICE_STATUS:
+               data = vionet->cfg.device_status;
+               break;
+       case VIRTIO_CONFIG_ISR_STATUS:
+               data = vionet->cfg.isr_status;
+               vionet->cfg.isr_status = 0;
+               virtio_deassert_pic_irq(dev, 0);
+               break;
+       default:
+               return (0xFFFFFFFF);
+       }
+
+       return (data);
+}
index 3880543..035f199 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: vioqcow2.c,v 1.20 2022/05/20 22:06:47 dv Exp $        */
+/*     $OpenBSD: vioqcow2.c,v 1.21 2023/04/27 22:47:27 dv Exp $        */
 
 /*
  * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
@@ -110,9 +110,8 @@ static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
 static void qc2_close(void *, int);
 
 /*
- * Initializes a raw disk image backing file from an fd.
- * Stores the number of 512 byte sectors in *szp,
- * returning -1 for error, 0 for success.
+ * Initializes a qcow2 disk image backing file from an fd. Stores the
+ * number of 512-byte sectors in *szp, returning -1 for error, 0 for
+ * success.
  *
  * May open snapshot base images.
  */
@@ -132,7 +131,7 @@ virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
        file->pread = qc2_pread;
        file->pwrite = qc2_pwrite;
        file->close = qc2_close;
-       *szp = diskp->disksz;
+       *szp = diskp->disksz / 512;
        return 0;
 }
 
index 174bf40..4050efd 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: vioraw.c,v 1.8 2023/04/16 12:52:54 dv Exp $   */
+/*     $OpenBSD: vioraw.c,v 1.9 2023/04/27 22:47:27 dv Exp $   */
 /*
  * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
  *
@@ -47,7 +47,7 @@ raw_close(void *file, int stayopen)
 
 /*
  * Initializes a raw disk image backing file from an fd.  Stores the
- * number of bytes in *szp, returning -1 for error, 0 for success.
+ * number of 512-byte sectors in *szp, returning -1 for error, 0 for success.
  */
 int
 virtio_raw_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
@@ -57,6 +57,7 @@ virtio_raw_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 
        if (nfd != 1)
                return (-1);
+
        sz = lseek(fd[0], 0, SEEK_END);
        if (sz == -1)
                return (-1);
@@ -69,7 +70,7 @@ virtio_raw_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
        file->pread = raw_pread;
        file->pwrite = raw_pwrite;
        file->close = raw_close;
-       *szp = sz;
+       *szp = sz / 512;
        return (0);
 }
 
index 6261695..92e77b8 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: virtio.c,v 1.101 2023/04/25 12:46:13 dv Exp $ */
+/*     $OpenBSD: virtio.c,v 1.102 2023/04/27 22:47:27 dv Exp $ */
 
 /*
  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -18,6 +18,7 @@
 
 #include <sys/param.h> /* PAGE_SIZE */
 #include <sys/socket.h>
+#include <sys/wait.h>
 
 #include <machine/vmmvar.h>
 #include <dev/pci/pcireg.h>
@@ -34,6 +35,7 @@
 
 #include <errno.h>
 #include <event.h>
+#include <fcntl.h>
 #include <poll.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include "vmd.h"
 #include "vmm.h"
 
+extern struct vmd *env;
 extern char *__progname;
+
 struct viornd_dev viornd;
-struct vioblk_dev *vioblk;
-struct vionet_dev *vionet;
 struct vioscsi_dev *vioscsi;
 struct vmmci_dev vmmci;
 
-int nr_vionet;
-int nr_vioblk;
+/* Devices emulated in subprocesses are inserted into this list. */
+SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;
 
 #define MAXPHYS        (64 * 1024)     /* max raw I/O transfer size */
 
@@ -68,22 +70,11 @@ int nr_vioblk;
 #define RXQ    0
 #define TXQ    1
 
-const char *
-vioblk_cmd_name(uint32_t type)
-{
-       switch (type) {
-       case VIRTIO_BLK_T_IN: return "read";
-       case VIRTIO_BLK_T_OUT: return "write";
-       case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
-       case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
-       case VIRTIO_BLK_T_FLUSH: return "flush";
-       case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
-       case VIRTIO_BLK_T_GET_ID: return "get id";
-       default: return "unknown";
-       }
-}
+static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
+static void virtio_dispatch_dev(int, short, void *);
+static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
 
-static const char *
+const char *
 virtio_reg_name(uint8_t reg)
 {
        switch (reg) {
@@ -95,8 +86,11 @@ virtio_reg_name(uint8_t reg)
        case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
        case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
        case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
-       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: return "device config 0";
-       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: return "device config 1";
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI ...
+           VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
+               return "device config 0";
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
+       case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
+               return "device config 1";
        case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
        case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
        case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
@@ -154,7 +148,7 @@ viornd_update_qa(void)
 
        hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
        if (hva == NULL)
-               fatal("viornd_update_qa");
+               fatalx("viornd_update_qa");
        vq_info->q_hva = hva;
 }
 
@@ -286,427 +280,123 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
        return (0);
 }
 
-void
-vioblk_update_qa(struct vioblk_dev *dev)
-{
-       struct virtio_vq_info *vq_info;
-       void *hva = NULL;
-
-       /* Invalid queue? */
-       if (dev->cfg.queue_select > 0)
-               return;
-
-       vq_info = &dev->vq[dev->cfg.queue_select];
-       vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
-
-       hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE));
-       if (hva == NULL)
-               fatal("vioblk_update_qa");
-       vq_info->q_hva = hva;
-}
-
-void
-vioblk_update_qs(struct vioblk_dev *dev)
-{
-       struct virtio_vq_info *vq_info;
-
-       /* Invalid queue? */
-       if (dev->cfg.queue_select > 0) {
-               dev->cfg.queue_size = 0;
-               return;
-       }
-
-       vq_info = &dev->vq[dev->cfg.queue_select];
-
-       /* Update queue pfn/size based on queue select */
-       dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
-       dev->cfg.queue_size = vq_info->qs;
-}
-
-static void
-vioblk_free_info(struct ioinfo *info)
-{
-       if (!info)
-               return;
-       free(info->buf);
-       free(info);
-}
-
-static struct ioinfo *
-vioblk_start_read(struct vioblk_dev *dev, off_t sector, size_t sz)
-{
-       struct ioinfo *info;
-
-       /* Limit to 64M for now */
-       if (sz > (1 << 26)) {
-               log_warnx("%s: read size exceeded 64M", __func__);
-               return (NULL);
-       }
-
-       info = calloc(1, sizeof(*info));
-       if (!info)
-               goto nomem;
-       info->buf = malloc(sz);
-       if (info->buf == NULL)
-               goto nomem;
-       info->len = sz;
-       info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
-       info->file = &dev->file;
-
-       return info;
-
-nomem:
-       free(info);
-       log_warn("malloc error vioblk read");
-       return (NULL);
-}
-
-
-static const uint8_t *
-vioblk_finish_read(struct ioinfo *info)
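+/*
+ * vmmci_ctl
+ *
+ * Send a command (shutdown, reboot, rtc sync) to the guest over the
+ * vmm control interface device, arming the ack timeout as needed.
+ */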
+int
+vmmci_ctl(unsigned int cmd)
 {
-       struct virtio_backing *file;
+       struct timeval tv = { 0, 0 };
 
-       file = info->file;
-       if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) {
-               info->error = errno;
-               log_warn("vioblk read error");
-               return NULL;
-       }
+       if ((vmmci.cfg.device_status &
+           VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
+               return (-1);
 
-       return info->buf;
-}
+       if (cmd == vmmci.cmd)
+               return (0);
 
-static struct ioinfo *
-vioblk_start_write(struct vioblk_dev *dev, off_t sector,
-    paddr_t addr, size_t len)
-{
-       struct ioinfo *info;
+       switch (cmd) {
+       case VMMCI_NONE:
+               break;
+       case VMMCI_SHUTDOWN:
+       case VMMCI_REBOOT:
+               /* Update command */
+               vmmci.cmd = cmd;
 
-       /* Limit to 64M for now */
-       if (len > (1 << 26)) {
-               log_warnx("%s: write size exceeded 64M", __func__);
-               return (NULL);
-       }
+               /*
+                * vmm VMs do not support powerdown, send a reboot request
+                * instead and turn it off after the triple fault.
+                */
+               if (cmd == VMMCI_SHUTDOWN)
+                       cmd = VMMCI_REBOOT;
 
-       info = calloc(1, sizeof(*info));
-       if (!info)
-               goto nomem;
+               /* Trigger interrupt */
+               vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
+               vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
 
-       info->buf = malloc(len);
-       if (info->buf == NULL)
-               goto nomem;
-       info->len = len;
-       info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
-       info->file = &dev->file;
+               /* Add ACK timeout */
+               tv.tv_sec = VMMCI_TIMEOUT;
+               evtimer_add(&vmmci.timeout, &tv);
+               break;
+       case VMMCI_SYNCRTC:
+               if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
+                       /* RTC updated, request guest VM resync of its RTC */
+                       vmmci.cmd = cmd;
 
-       if (read_mem(addr, info->buf, info->len)) {
-               vioblk_free_info(info);
-               return NULL;
+                       vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
+                       vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
+               } else {
+                       log_debug("%s: RTC sync skipped (guest does not "
+                           "support RTC sync)", __func__);
+               }
+               break;
+       default:
+               fatalx("invalid vmmci command: %d", cmd);
        }
 
-       return info;
-
-nomem:
-       free(info);
-       log_warn("malloc error vioblk write");
-       return (NULL);
+       return (0);
 }
 
-static int
-vioblk_finish_write(struct ioinfo *info)
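+/*
+ * vmmci_ack
+ *
+ * Handle the guest's acknowledgement of a pending vmmci command,
+ * extending the shutdown/reboot timeout to let it exit gracefully.
+ */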
+void
+vmmci_ack(unsigned int cmd)
 {
-       struct virtio_backing *file;
+       struct timeval   tv = { 0, 0 };
 
-       file = info->file;
-       if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) {
-               log_warn("vioblk write error");
-               return EIO;
+       switch (cmd) {
+       case VMMCI_NONE:
+               break;
+       case VMMCI_SHUTDOWN:
+               /*
+                * The shutdown was requested by the VM if we don't have
+                * a pending shutdown request.  In this case add a short
+                * timeout to give the VM a chance to reboot before the
+                * timer is expired.
+                */
+               if (vmmci.cmd == 0) {
+                       log_debug("%s: vm %u requested shutdown", __func__,
+                           vmmci.vm_id);
+                       tv.tv_sec = VMMCI_TIMEOUT;
+                       evtimer_add(&vmmci.timeout, &tv);
+                       return;
+               }
+               /* FALLTHROUGH */
+       case VMMCI_REBOOT:
+               /*
+                * If the VM acknowledged our shutdown request, give it
+                * enough time to shutdown or reboot gracefully.  This
+                * might take a considerable amount of time (running
+                * rc.shutdown on the VM), so increase the timeout before
+                * killing it forcefully.
+                */
+               if (cmd == vmmci.cmd &&
+                   evtimer_pending(&vmmci.timeout, NULL)) {
+                       log_debug("%s: vm %u acknowledged shutdown request",
+                           __func__, vmmci.vm_id);
+                       tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
+                       evtimer_add(&vmmci.timeout, &tv);
+               }
+               break;
+       case VMMCI_SYNCRTC:
+               log_debug("%s: vm %u acknowledged RTC sync request",
+                   __func__, vmmci.vm_id);
+               vmmci.cmd = VMMCI_NONE;
+               break;
+       default:
+               log_warnx("%s: illegal request %u", __func__, cmd);
+               break;
        }
-       return 0;
 }
 
-/*
- * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can
- */
-int
-vioblk_notifyq(struct vioblk_dev *dev)
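+/*
+ * vmmci_timeout
+ *
+ * The guest failed to acknowledge a shutdown/reboot request in time,
+ * so shut the vm down from our side.
+ */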
+void
+vmmci_timeout(int fd, short type, void *arg)
 {
-       uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx;
-       uint8_t ds;
-       int cnt;
-       off_t secbias;
-       char *vr;
-       struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc;
-       struct vring_avail *avail;
-       struct vring_used *used;
-       struct virtio_blk_req_hdr cmd;
-       struct virtio_vq_info *vq_info;
-
-       /* Invalid queue? */
-       if (dev->cfg.queue_notify > 0)
-               return (0);
-
-       vq_info = &dev->vq[dev->cfg.queue_notify];
-       vr = vq_info->q_hva;
-       if (vr == NULL)
-               fatalx("%s: null vring", __func__);
-
-       /* Compute offsets in ring of descriptors, avail ring, and used ring */
-       desc = (struct vring_desc *)(vr);
-       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
-       used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
-
-       idx = vq_info->last_avail & VIOBLK_QUEUE_MASK;
-
-       if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) {
-               log_debug("%s - nothing to do?", __func__);
-               return (0);
-       }
-
-       while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) {
-
-               ds = VIRTIO_BLK_S_IOERR;
-               cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK;
-               cmd_desc = &desc[cmd_desc_idx];
-
-               if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) {
-                       log_warnx("unchained vioblk cmd descriptor received "
-                           "(idx %d)", cmd_desc_idx);
-                       goto out;
-               }
-
-               /* Read command from descriptor ring */
-               if (cmd_desc->flags & VRING_DESC_F_WRITE) {
-                       log_warnx("vioblk: unexpected writable cmd descriptor "
-                           "%d", cmd_desc_idx);
-                       goto out;
-               }
-               if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) {
-                       log_warnx("vioblk: command read_mem error @ 0x%llx",
-                           cmd_desc->addr);
-                       goto out;
-               }
-
-               switch (cmd.type) {
-               case VIRTIO_BLK_T_IN:
-                       /* first descriptor */
-                       secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
-                       secdata_desc = &desc[secdata_desc_idx];
-
-                       if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
-                               log_warnx("unchained vioblk data descriptor "
-                                   "received (idx %d)", cmd_desc_idx);
-                               goto out;
-                       }
-
-                       cnt = 0;
-                       secbias = 0;
-                       do {
-                               struct ioinfo *info;
-                               const uint8_t *secdata;
-
-                               if ((secdata_desc->flags & VRING_DESC_F_WRITE)
-                                   == 0) {
-                                       log_warnx("vioblk: unwritable data "
-                                           "descriptor %d", secdata_desc_idx);
-                                       goto out;
-                               }
-
-                               info = vioblk_start_read(dev,
-                                   cmd.sector + secbias, secdata_desc->len);
-
-                               if (info == NULL) {
-                                       log_warnx("vioblk: can't start read");
-                                       goto out;
-                               }
-
-                               /* read the data, use current data descriptor */
-                               secdata = vioblk_finish_read(info);
-                               if (secdata == NULL) {
-                                       vioblk_free_info(info);
-                                       log_warnx("vioblk: block read error, "
-                                           "sector %lld", cmd.sector);
-                                       goto out;
-                               }
-
-                               if (write_mem(secdata_desc->addr, secdata,
-                                       secdata_desc->len)) {
-                                       log_warnx("can't write sector "
-                                           "data to gpa @ 0x%llx",
-                                           secdata_desc->addr);
-                                       vioblk_free_info(info);
-                                       goto out;
-                               }
-
-                               vioblk_free_info(info);
-
-                               secbias += (secdata_desc->len /
-                                   VIRTIO_BLK_SECTOR_SIZE);
-                               secdata_desc_idx = secdata_desc->next &
-                                   VIOBLK_QUEUE_MASK;
-                               secdata_desc = &desc[secdata_desc_idx];
-
-                               /* Guard against infinite chains */
-                               if (++cnt >= VIOBLK_QUEUE_SIZE) {
-                                       log_warnx("%s: descriptor table "
-                                           "invalid", __func__);
-                                       goto out;
-                               }
-                       } while (secdata_desc->flags & VRING_DESC_F_NEXT);
-
-                       ds_desc_idx = secdata_desc_idx;
-                       ds_desc = secdata_desc;
-
-                       ds = VIRTIO_BLK_S_OK;
-                       break;
-               case VIRTIO_BLK_T_OUT:
-                       secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
-                       secdata_desc = &desc[secdata_desc_idx];
-
-                       if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
-                               log_warnx("wr vioblk: unchained vioblk data "
-                                   "descriptor received (idx %d)",
-                                   cmd_desc_idx);
-                               goto out;
-                       }
-
-                       if (secdata_desc->len > dev->max_xfer) {
-                               log_warnx("%s: invalid read size %d requested",
-                                   __func__, secdata_desc->len);
-                               goto out;
-                       }
-
-                       cnt = 0;
-                       secbias = 0;
-                       do {
-                               struct ioinfo *info;
-
-                               if (secdata_desc->flags & VRING_DESC_F_WRITE) {
-                                       log_warnx("wr vioblk: unexpected "
-                                           "writable data descriptor %d",
-                                           secdata_desc_idx);
-                                       goto out;
-                               }
-
-                               info = vioblk_start_write(dev,
-                                   cmd.sector + secbias,
-                                   secdata_desc->addr, secdata_desc->len);
-
-                               if (info == NULL) {
-                                       log_warnx("wr vioblk: can't read "
-                                           "sector data @ 0x%llx",
-                                           secdata_desc->addr);
-                                       goto out;
-                               }
-
-                               if (vioblk_finish_write(info)) {
-                                       log_warnx("wr vioblk: disk write "
-                                           "error");
-                                       vioblk_free_info(info);
-                                       goto out;
-                               }
-
-                               vioblk_free_info(info);
-
-                               secbias += secdata_desc->len /
-                                   VIRTIO_BLK_SECTOR_SIZE;
-
-                               secdata_desc_idx = secdata_desc->next &
-                                   VIOBLK_QUEUE_MASK;
-                               secdata_desc = &desc[secdata_desc_idx];
-
-                               /* Guard against infinite chains */
-                               if (++cnt >= VIOBLK_QUEUE_SIZE) {
-                                       log_warnx("%s: descriptor table "
-                                           "invalid", __func__);
-                                       goto out;
-                               }
-                       } while (secdata_desc->flags & VRING_DESC_F_NEXT);
-
-                       ds_desc_idx = secdata_desc_idx;
-                       ds_desc = secdata_desc;
-
-                       ds = VIRTIO_BLK_S_OK;
-                       break;
-               case VIRTIO_BLK_T_FLUSH:
-               case VIRTIO_BLK_T_FLUSH_OUT:
-                       ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
-                       ds_desc = &desc[ds_desc_idx];
-
-                       ds = VIRTIO_BLK_S_UNSUPP;
-                       break;
-               case VIRTIO_BLK_T_GET_ID:
-                       secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
-                       secdata_desc = &desc[secdata_desc_idx];
-
-                       /*
-                        * We don't support this command yet. While it's not
-                        * officially part of the virtio spec (will be in v1.2)
-                        * there's no feature to negotiate. Linux drivers will
-                        * often send this command regardless.
-                        *
-                        * When the command is received, it should appear as a
-                        * chain of 3 descriptors, similar to the IN/OUT
-                        * commands. The middle descriptor should have have a
-                        * length of VIRTIO_BLK_ID_BYTES bytes.
-                        */
-                       if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
-                               log_warnx("id vioblk: unchained vioblk data "
-                                   "descriptor received (idx %d)",
-                                   cmd_desc_idx);
-                               goto out;
-                       }
-
-                       /* Skip the data descriptor. */
-                       ds_desc_idx = secdata_desc->next & VIOBLK_QUEUE_MASK;
-                       ds_desc = &desc[ds_desc_idx];
-
-                       ds = VIRTIO_BLK_S_UNSUPP;
-                       break;
-               default:
-                       log_warnx("%s: unsupported command 0x%x", __func__,
-                           cmd.type);
-                       ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
-                       ds_desc = &desc[ds_desc_idx];
-
-                       ds = VIRTIO_BLK_S_UNSUPP;
-                       break;
-               }
-
-               if ((ds_desc->flags & VRING_DESC_F_WRITE) == 0) {
-                       log_warnx("%s: ds descriptor %d unwritable", __func__,
-                           ds_desc_idx);
-                       goto out;
-               }
-               if (write_mem(ds_desc->addr, &ds, sizeof(ds))) {
-                       log_warnx("%s: can't write device status data @ 0x%llx",
-                           __func__, ds_desc->addr);
-                       goto out;
-               }
-
-               dev->cfg.isr_status = 1;
-               used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx;
-               used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len;
-               __sync_synchronize();
-               used->idx++;
-
-               vq_info->last_avail = avail->idx & VIOBLK_QUEUE_MASK;
-               idx = (idx + 1) & VIOBLK_QUEUE_MASK;
-       }
-out:
-       return (1);
+       log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
+       vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
 }
 
 int
-virtio_blk_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
-    void *cookie, uint8_t sz)
+vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
+    void *unused, uint8_t sz)
 {
-       struct vioblk_dev *dev = (struct vioblk_dev *)cookie;
-
        *intr = 0xFF;
 
-
        if (dir == 0) {
                switch (reg) {
                case VIRTIO_CONFIG_DEVICE_FEATURES:
@@ -716,938 +406,57 @@ virtio_blk_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
                            __progname, *data, virtio_reg_name(reg));
                        break;
                case VIRTIO_CONFIG_GUEST_FEATURES:
-                       dev->cfg.guest_feature = *data;
+                       vmmci.cfg.guest_feature = *data;
                        break;
                case VIRTIO_CONFIG_QUEUE_PFN:
-                       dev->cfg.queue_pfn = *data;
-                       vioblk_update_qa(dev);
+                       vmmci.cfg.queue_pfn = *data;
                        break;
                case VIRTIO_CONFIG_QUEUE_SELECT:
-                       dev->cfg.queue_select = *data;
-                       vioblk_update_qs(dev);
+                       vmmci.cfg.queue_select = *data;
                        break;
                case VIRTIO_CONFIG_QUEUE_NOTIFY:
-                       dev->cfg.queue_notify = *data;
-                       if (vioblk_notifyq(dev))
-                               *intr = 1;
+                       vmmci.cfg.queue_notify = *data;
                        break;
                case VIRTIO_CONFIG_DEVICE_STATUS:
-                       dev->cfg.device_status = *data;
-                       if (dev->cfg.device_status == 0) {
-                               log_debug("%s: device reset", __func__);
-                               dev->cfg.guest_feature = 0;
-                               dev->cfg.queue_pfn = 0;
-                               vioblk_update_qa(dev);
-                               dev->cfg.queue_size = 0;
-                               vioblk_update_qs(dev);
-                               dev->cfg.queue_select = 0;
-                               dev->cfg.queue_notify = 0;
-                               dev->cfg.isr_status = 0;
-                               dev->vq[0].last_avail = 0;
-                               vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
-                       }
+                       vmmci.cfg.device_status = *data;
                        break;
-               default:
+               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
+                       vmmci_ack(*data);
                        break;
                }
        } else {
                switch (reg) {
                case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
-                       switch (sz) {
-                       case 4:
-                               *data = (uint32_t)(dev->sz);
-                               break;
-                       case 2:
-                               *data &= 0xFFFF0000;
-                               *data |= (uint32_t)(dev->sz) & 0xFFFF;
-                               break;
-                       case 1:
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz) & 0xFF;
-                               break;
-                       }
-                       /* XXX handle invalid sz */
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz >> 8) & 0xFF;
-                       }
-                       /* XXX handle invalid sz */
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz >> 16) & 0xFF;
-                       } else if (sz == 2) {
-                               *data &= 0xFFFF0000;
-                               *data |= (uint32_t)(dev->sz >> 16) & 0xFFFF;
-                       }
-                       /* XXX handle invalid sz */
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz >> 24) & 0xFF;
-                       }
-                       /* XXX handle invalid sz */
+                       *data = vmmci.cmd;
                        break;
                case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
-                       switch (sz) {
-                       case 4:
-                               *data = (uint32_t)(dev->sz >> 32);
-                               break;
-                       case 2:
-                               *data &= 0xFFFF0000;
-                               *data |= (uint32_t)(dev->sz >> 32) & 0xFFFF;
-                               break;
-                       case 1:
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz >> 32) & 0xFF;
-                               break;
-                       }
-                       /* XXX handle invalid sz */
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz >> 40) & 0xFF;
-                       }
-                       /* XXX handle invalid sz */
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz >> 48) & 0xFF;
-                       } else if (sz == 2) {
-                               *data &= 0xFFFF0000;
-                               *data |= (uint32_t)(dev->sz >> 48) & 0xFFFF;
-                       }
-                       /* XXX handle invalid sz */
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->sz >> 56) & 0xFF;
-                       }
-                       /* XXX handle invalid sz */
+                       /* Update time once when reading the first register */
+                       gettimeofday(&vmmci.time, NULL);
+                       *data = (uint64_t)vmmci.time.tv_sec;
                        break;
                case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
-                       switch (sz) {
-                       case 4:
-                               *data = (uint32_t)(dev->max_xfer);
-                               break;
-                       case 2:
-                               *data &= 0xFFFF0000;
-                               *data |= (uint32_t)(dev->max_xfer) & 0xFFFF;
-                               break;
-                       case 1:
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->max_xfer) & 0xFF;
-                               break;
-                       }
-                       /* XXX handle invalid sz */
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->max_xfer >> 8) & 0xFF;
-                       }
-                       /* XXX handle invalid sz */
+                       *data = (uint64_t)vmmci.time.tv_sec >> 32;
                        break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->max_xfer >> 16) & 0xFF;
-                       } else if (sz == 2) {
-                               *data &= 0xFFFF0000;
-                               *data |= (uint32_t)(dev->max_xfer >> 16)
-                                   & 0xFFFF;
-                       }
-                       /* XXX handle invalid sz */
+               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
+                       *data = (uint64_t)vmmci.time.tv_usec;
                        break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
-                       if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint32_t)(dev->max_xfer >> 24) & 0xFF;
-                       }
-                       /* XXX handle invalid sz */
+               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
+                       *data = (uint64_t)vmmci.time.tv_usec >> 32;
                        break;
                case VIRTIO_CONFIG_DEVICE_FEATURES:
-                       *data = dev->cfg.device_feature;
+                       *data = vmmci.cfg.device_feature;
                        break;
                case VIRTIO_CONFIG_GUEST_FEATURES:
-                       *data = dev->cfg.guest_feature;
+                       *data = vmmci.cfg.guest_feature;
                        break;
                case VIRTIO_CONFIG_QUEUE_PFN:
-                       *data = dev->cfg.queue_pfn;
+                       *data = vmmci.cfg.queue_pfn;
                        break;
                case VIRTIO_CONFIG_QUEUE_SIZE:
-                       if (sz == 4)
-                               *data = dev->cfg.queue_size;
-                       else if (sz == 2) {
-                               *data &= 0xFFFF0000;
-                               *data |= (uint16_t)dev->cfg.queue_size;
-                       } else if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint8_t)dev->cfg.queue_size;
-                       }
+                       *data = vmmci.cfg.queue_size;
                        break;
                case VIRTIO_CONFIG_QUEUE_SELECT:
-                       *data = dev->cfg.queue_select;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_NOTIFY:
-                       *data = dev->cfg.queue_notify;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_STATUS:
-                       if (sz == 4)
-                               *data = dev->cfg.device_status;
-                       else if (sz == 2) {
-                               *data &= 0xFFFF0000;
-                               *data |= (uint16_t)dev->cfg.device_status;
-                       } else if (sz == 1) {
-                               *data &= 0xFFFFFF00;
-                               *data |= (uint8_t)dev->cfg.device_status;
-                       }
-                       break;
-               case VIRTIO_CONFIG_ISR_STATUS:
-                       *data = dev->cfg.isr_status;
-                       dev->cfg.isr_status = 0;
-                       vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
-                       break;
-               }
-       }
-       return (0);
-}
-
-int
-virtio_net_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
-    void *cookie, uint8_t sz)
-{
-       struct vionet_dev *dev = (struct vionet_dev *)cookie;
-
-       *intr = 0xFF;
-       mutex_lock(&dev->mutex);
-
-       if (dir == 0) {
-               switch (reg) {
-               case VIRTIO_CONFIG_DEVICE_FEATURES:
-               case VIRTIO_CONFIG_QUEUE_SIZE:
-               case VIRTIO_CONFIG_ISR_STATUS:
-                       log_warnx("%s: illegal write %x to %s",
-                           __progname, *data, virtio_reg_name(reg));
-                       break;
-               case VIRTIO_CONFIG_GUEST_FEATURES:
-                       dev->cfg.guest_feature = *data;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_PFN:
-                       dev->cfg.queue_pfn = *data;
-                       vionet_update_qa(dev);
-                       break;
-               case VIRTIO_CONFIG_QUEUE_SELECT:
-                       dev->cfg.queue_select = *data;
-                       vionet_update_qs(dev);
-                       break;
-               case VIRTIO_CONFIG_QUEUE_NOTIFY:
-                       dev->cfg.queue_notify = *data;
-                       if (vionet_notifyq(dev))
-                               *intr = 1;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_STATUS:
-                       dev->cfg.device_status = *data;
-                       if (dev->cfg.device_status == 0) {
-                               log_debug("%s: device reset", __func__);
-                               dev->cfg.guest_feature = 0;
-                               dev->cfg.queue_pfn = 0;
-                               vionet_update_qa(dev);
-                               dev->cfg.queue_size = 0;
-                               vionet_update_qs(dev);
-                               dev->cfg.queue_select = 0;
-                               dev->cfg.queue_notify = 0;
-                               dev->cfg.isr_status = 0;
-                               dev->vq[RXQ].last_avail = 0;
-                               dev->vq[RXQ].notified_avail = 0;
-                               dev->vq[TXQ].last_avail = 0;
-                               dev->vq[TXQ].notified_avail = 0;
-                               vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
-                       }
-                       break;
-               default:
-                       break;
-               }
-       } else {
-               switch (reg) {
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
-                       *data = dev->mac[reg -
-                           VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
-                       break;
-               case VIRTIO_CONFIG_DEVICE_FEATURES:
-                       *data = dev->cfg.device_feature;
-                       break;
-               case VIRTIO_CONFIG_GUEST_FEATURES:
-                       *data = dev->cfg.guest_feature;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_PFN:
-                       *data = dev->cfg.queue_pfn;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_SIZE:
-                       *data = dev->cfg.queue_size;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_SELECT:
-                       *data = dev->cfg.queue_select;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_NOTIFY:
-                       *data = dev->cfg.queue_notify;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_STATUS:
-                       *data = dev->cfg.device_status;
-                       break;
-               case VIRTIO_CONFIG_ISR_STATUS:
-                       *data = dev->cfg.isr_status;
-                       dev->cfg.isr_status = 0;
-                       vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
-                       break;
-               }
-       }
-
-       mutex_unlock(&dev->mutex);
-       return (0);
-}
-
-/*
- * Must be called with dev->mutex acquired.
- */
-void
-vionet_update_qa(struct vionet_dev *dev)
-{
-       struct virtio_vq_info *vq_info;
-       void *hva = NULL;
-
-       /* Invalid queue? */
-       if (dev->cfg.queue_select > 1)
-               return;
-
-       vq_info = &dev->vq[dev->cfg.queue_select];
-       vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
-
-       hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE));
-       if (hva == NULL)
-               fatal("vionet_update_qa");
-       vq_info->q_hva = hva;
-}
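
/*
 * vionet_update_qa() relies on the legacy virtio contract: the guest
 * programs a queue as a page frame number, and the device converts it
 * to a guest-physical address (pfn * VIRTIO_PAGE_SIZE) and then to a
 * host virtual address before touching the ring. A sketch of that
 * translation, reusing hvaddr_mem() and vring_size() from this
 * codebase (the helper name is illustrative):
 */
static void *
vq_map(uint32_t queue_pfn, unsigned int qs)
{
        uint64_t q_gpa = (uint64_t)queue_pfn * VIRTIO_PAGE_SIZE;

        return (hvaddr_mem(q_gpa, vring_size(qs)));
}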
-
-/*
- * Must be called with dev->mutex acquired.
- */
-void
-vionet_update_qs(struct vionet_dev *dev)
-{
-       struct virtio_vq_info *vq_info;
-
-       /* Invalid queue? */
-       if (dev->cfg.queue_select > 1) {
-               dev->cfg.queue_size = 0;
-               return;
-       }
-
-       vq_info = &dev->vq[dev->cfg.queue_select];
-
-       /* Update queue pfn/size based on queue select */
-       dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
-       dev->cfg.queue_size = vq_info->qs;
-}
-
-/*
- * vionet_enq_rx
- *
- * Take a given packet from the host-side tap and copy it into the guest's
- * buffers utilizing the rx virtio ring. If the packet length is invalid
- * (too small or too large) or if there are not enough buffers available,
- * the packet is dropped.
- *
- * Must be called with dev->mutex acquired.
- */
-int
-vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc)
-{
-       uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx;
-       char *vr = NULL;
-       size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0;
-       size_t chain_len = 0;
-       struct vring_desc *desc, *pkt_desc, *hdr_desc;
-       struct vring_avail *avail;
-       struct vring_used *used;
-       struct virtio_vq_info *vq_info;
-       struct virtio_net_hdr hdr;
-       size_t hdr_sz;
-
-       if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
-               log_warn("%s: invalid packet size", __func__);
-               return (0);
-       }
-
-       hdr_sz = sizeof(hdr);
-
-       if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK))
-               return (0);
-
-       vq_info = &dev->vq[RXQ];
-       vr = vq_info->q_hva;
-       if (vr == NULL)
-               fatalx("%s: null vring", __func__);
-
-       /* Compute offsets in ring of descriptors, avail ring, and used ring */
-       desc = (struct vring_desc *)(vr);
-       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
-       used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
-
-       idx = vq_info->last_avail & VIONET_QUEUE_MASK;
-       if ((vq_info->notified_avail & VIONET_QUEUE_MASK) == idx) {
-               log_debug("%s: insufficient available buffer capacity, "
-                   "dropping packet.", __func__);
-               return (0);
-       }
-
-       hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
-       hdr_desc = &desc[hdr_desc_idx];
-
-       dxx = hdr_desc_idx;
-       chain_hdr_idx = dxx;
-       chain_len = 0;
-
-       /* Process the descriptor and walk any potential chain. */
-       do {
-               off = 0;
-               pkt_desc = &desc[dxx];
-               if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) {
-                       log_warnx("%s: invalid descriptor, not writable",
-                           __func__);
-                       return (0);
-               }
-
-               /* How much data do we get to write? */
-               if (sz - bufsz > pkt_desc->len)
-                       chunk_size = pkt_desc->len;
-               else
-                       chunk_size = sz - bufsz;
-
-               if (chain_len == 0) {
-                       off = hdr_sz;
-                       if (chunk_size == pkt_desc->len)
-                               chunk_size -= off;
-               }
-
-               /* Write a chunk of data if we need to */
-               if (chunk_size && write_mem(pkt_desc->addr + off,
-                       pkt + pkt_offset, chunk_size)) {
-                       log_warnx("%s: failed to write to buffer 0x%llx",
-                           __func__, pkt_desc->addr);
-                       return (0);
-               }
-
-               chain_len += chunk_size + off;
-               bufsz += chunk_size;
-               pkt_offset += chunk_size;
-
-               dxx = pkt_desc->next & VIONET_QUEUE_MASK;
-       } while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT);
-
-       /* Move our marker in the ring...*/
-       vq_info->last_avail = (vq_info->last_avail + 1) &
-           VIONET_QUEUE_MASK;
-
-       /* Prepend the virtio net header in the first buffer. */
-       memset(&hdr, 0, sizeof(hdr));
-       hdr.hdr_len = hdr_sz;
-       if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) {
-           log_warnx("vionet: rx enq header write_mem error @ 0x%llx",
-               hdr_desc->addr);
-           return (0);
-       }
-
-       /* Update the index field in the used ring. This must be done last. */
-       dev->cfg.isr_status = 1;
-       *spc = (vq_info->notified_avail - vq_info->last_avail)
-           & VIONET_QUEUE_MASK;
-
-       /* Update the list of used buffers. */
-       used->ring[used->idx & VIONET_QUEUE_MASK].id = chain_hdr_idx;
-       used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len;
-       __sync_synchronize();
-       used->idx++;
-
-       return (1);
-}
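
/*
 * The used-ring update in vionet_enq_rx() follows the standard legacy
 * virtio publish order: fill in the used element, issue a full memory
 * barrier, and only then expose the element by bumping used->idx. The
 * same pattern as a stand-alone sketch (helper name is illustrative):
 */
static void
vq_push_used(struct vring_used *used, uint16_t mask, uint32_t id,
    uint32_t len)
{
        used->ring[used->idx & mask].id = id;
        used->ring[used->idx & mask].len = len;
        __sync_synchronize();   /* the element must land before the index */
        used->idx++;
}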
-
-/*
- * vionet_rx
- *
- * Enqueue data that was received on a tap file descriptor
- * to the vionet device queue.
- *
- * Must be called with dev->mutex acquired.
- */
-static int
-vionet_rx(struct vionet_dev *dev)
-{
-       char buf[PAGE_SIZE];
-       int num_enq = 0, spc = 0;
-       struct ether_header *eh;
-       ssize_t sz;
-
-       do {
-               sz = read(dev->fd, buf, sizeof(buf));
-               if (sz == -1) {
-                       /*
-                        * If we get EAGAIN, no data is currently available.
-                        * Do not treat this as an error.
-                        */
-                       if (errno != EAGAIN)
-                               log_warn("unexpected read error on vionet "
-                                   "device");
-               } else if (sz > 0) {
-                       eh = (struct ether_header *)buf;
-                       if (!dev->lockedmac ||
-                           ETHER_IS_MULTICAST(eh->ether_dhost) ||
-                           memcmp(eh->ether_dhost, dev->mac,
-                           sizeof(eh->ether_dhost)) == 0)
-                               num_enq += vionet_enq_rx(dev, buf, sz, &spc);
-               } else if (sz == 0) {
-                       log_debug("process_rx: no data");
-                       break;
-               }
-       } while (spc > 0 && sz > 0);
-
-       return (num_enq);
-}
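
/*
 * The EAGAIN handling in vionet_rx() assumes the tap fd was opened
 * non-blocking, so a drained queue returns immediately instead of
 * stalling the rx path. A sketch of that setup (illustrative helper;
 * needs <fcntl.h>):
 */
static int
fd_set_nonblock(int fd)
{
        int flags;

        if ((flags = fcntl(fd, F_GETFL)) == -1)
                return (-1);
        return (fcntl(fd, F_SETFL, flags | O_NONBLOCK));
}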
-
-/*
- * vionet_rx_event
- *
- * Called from the event handling thread when new data can be
- * received on the tap fd of a vionet device.
- */
-static void
-vionet_rx_event(int fd, short kind, void *arg)
-{
-       struct vionet_dev *dev = arg;
-
-       mutex_lock(&dev->mutex);
-
-       if (vionet_rx(dev) > 0) {
-               /* XXX: vcpu_id */
-               vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq);
-       }
-
-       mutex_unlock(&dev->mutex);
-}
-
-/*
- * Must be called with dev->mutex acquired.
- */
-void
-vionet_notify_rx(struct vionet_dev *dev)
-{
-       char *vr;
-       struct vring_avail *avail;
-       struct virtio_vq_info *vq_info;
-
-       vq_info = &dev->vq[RXQ];
-       vr = vq_info->q_hva;
-       if (vr == NULL)
-               fatalx("%s: null vring", __func__);
-
-       /* Compute offset into avail ring */
-       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
-       vq_info->notified_avail = avail->idx - 1;
-}
-
-/*
- * Must be called with dev->mutex acquired.
- */
-int
-vionet_notifyq(struct vionet_dev *dev)
-{
-       int ret = 0;
-
-       switch (dev->cfg.queue_notify) {
-       case RXQ:
-               vionet_notify_rx(dev);
-               break;
-       case TXQ:
-               ret = vionet_notify_tx(dev);
-               break;
-       default:
-               /*
-                * Catch the unimplemented queue ID 2 (control queue) as
-                * well as any bogus queue IDs.
-                */
-               log_debug("%s: notify for unimplemented queue ID %d",
-                   __func__, dev->cfg.queue_notify);
-               break;
-       }
-
-       return (ret);
-}
-
-/*
- * Must be called with dev->mutex acquired.
- */
-int
-vionet_notify_tx(struct vionet_dev *dev)
-{
-       uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt;
-       size_t pktsz, chunk_size = 0;
-       ssize_t dhcpsz = 0;
-       int num_enq, ofs, spc = 0;
-       char *vr = NULL, *pkt = NULL, *dhcppkt = NULL;
-       struct vring_desc *desc, *pkt_desc, *hdr_desc;
-       struct vring_avail *avail;
-       struct vring_used *used;
-       struct virtio_vq_info *vq_info;
-       struct ether_header *eh;
-
-       vq_info = &dev->vq[TXQ];
-       vr = vq_info->q_hva;
-       if (vr == NULL)
-               fatalx("%s: null vring", __func__);
-
-       /* Compute offsets in ring of descriptors, avail ring, and used ring */
-       desc = (struct vring_desc *)(vr);
-       avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
-       used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
-
-       num_enq = 0;
-
-       idx = vq_info->last_avail & VIONET_QUEUE_MASK;
-
-       if ((avail->idx & VIONET_QUEUE_MASK) == idx) {
-               log_debug("%s - nothing to do?", __func__);
-               return (0);
-       }
-
-       while ((avail->idx & VIONET_QUEUE_MASK) != idx) {
-               hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
-               hdr_desc = &desc[hdr_desc_idx];
-               pktsz = 0;
-
-               cnt = 0;
-               dxx = hdr_desc_idx;
-               do {
-                       pktsz += desc[dxx].len;
-                       dxx = desc[dxx].next & VIONET_QUEUE_MASK;
-
-                       /*
-                        * Virtio 1.0, cs04, section 2.4.5:
-                        *  "The number of descriptors in the table is defined
-                        *   by the queue size for this virtqueue: this is the
-                        *   maximum possible descriptor chain length."
-                        */
-                       if (++cnt >= VIONET_QUEUE_SIZE) {
-                               log_warnx("%s: descriptor table invalid",
-                                   __func__);
-                               goto out;
-                       }
-               } while (desc[dxx].flags & VRING_DESC_F_NEXT);
-
-               pktsz += desc[dxx].len;
-
-               /* Remove virtio header descriptor len */
-               pktsz -= hdr_desc->len;
-
-               /* Drop packets violating device MTU-based limits */
-               if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) {
-                       log_warnx("%s: invalid packet size %lu", __func__,
-                           pktsz);
-                       goto drop_packet;
-               }
-               pkt = malloc(pktsz);
-               if (pkt == NULL) {
-                       log_warn("malloc error alloc packet buf");
-                       goto out;
-               }
-
-               ofs = 0;
-               pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
-               pkt_desc = &desc[pkt_desc_idx];
-
-               while (pkt_desc->flags & VRING_DESC_F_NEXT) {
-                       /* must be not writable */
-                       if (pkt_desc->flags & VRING_DESC_F_WRITE) {
-                               log_warnx("unexpected writable tx desc "
-                                   "%d", pkt_desc_idx);
-                               goto out;
-                       }
-
-                       /* Check we don't read beyond allocated pktsz */
-                       if (pkt_desc->len > pktsz - ofs) {
-                               log_warnx("%s: descriptor len past pkt len",
-                                   __func__);
-                               chunk_size = pktsz - ofs;
-                       } else
-                               chunk_size = pkt_desc->len;
-
-                       /* Read packet from descriptor ring */
-                       if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
-                               log_warnx("vionet: packet read_mem error "
-                                   "@ 0x%llx", pkt_desc->addr);
-                               goto out;
-                       }
-
-                       ofs += pkt_desc->len;
-                       pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK;
-                       pkt_desc = &desc[pkt_desc_idx];
-               }
-
-               /* Now handle tail descriptor - must be not writable */
-               if (pkt_desc->flags & VRING_DESC_F_WRITE) {
-                       log_warnx("unexpected writable tx descriptor %d",
-                           pkt_desc_idx);
-                       goto out;
-               }
-
-               /* Check we don't read beyond allocated pktsz */
-               if (pkt_desc->len > pktsz - ofs) {
-                       log_warnx("%s: descriptor len past pkt len", __func__);
-                       chunk_size = pktsz - ofs - pkt_desc->len;
-               } else
-                       chunk_size = pkt_desc->len;
-
-               /* Read packet from descriptor ring */
-               if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
-                       log_warnx("vionet: packet read_mem error @ "
-                           "0x%llx", pkt_desc->addr);
-                       goto out;
-               }
-
-               /* reject other source addresses */
-               if (dev->lockedmac && pktsz >= ETHER_HDR_LEN &&
-                   (eh = (struct ether_header *)pkt) &&
-                   memcmp(eh->ether_shost, dev->mac,
-                   sizeof(eh->ether_shost)) != 0)
-                       log_debug("vionet: wrong source address %s for vm %d",
-                           ether_ntoa((struct ether_addr *)
-                           eh->ether_shost), dev->vm_id);
-               else if (dev->local &&
-                   (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) {
-                       log_debug("vionet: dhcp request,"
-                           " local response size %zd", dhcpsz);
-
-               /* XXX signed vs unsigned here, funky cast */
-               } else if (write(dev->fd, pkt, pktsz) != (int)pktsz) {
-                       log_warnx("vionet: tx failed writing to tap: "
-                           "%d", errno);
-                       goto out;
-               }
-
-       drop_packet:
-               dev->cfg.isr_status = 1;
-               used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx;
-               used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len;
-               __sync_synchronize();
-               used->idx++;
-
-               vq_info->last_avail = avail->idx & VIONET_QUEUE_MASK;
-               idx = (idx + 1) & VIONET_QUEUE_MASK;
-
-               num_enq++;
-
-               free(pkt);
-               pkt = NULL;
-       }
-
-       if (dhcpsz > 0)
-               vionet_enq_rx(dev, dhcppkt, dhcpsz, &spc);
-
-out:
-       free(pkt);
-       free(dhcppkt);
-
-       return (1);
-}
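
/*
 * The cnt guard in vionet_notify_tx() enforces the Virtio 1.0 (cs04,
 * section 2.4.5) rule quoted above: a chain can never contain more
 * descriptors than the queue size, which also breaks cycles in a
 * corrupted table. The same guard as a stand-alone walk (illustrative
 * helper, not part of this diff):
 */
static int
vq_chain_size(struct vring_desc *desc, uint16_t start, uint16_t mask,
    uint16_t qs, size_t *sz)
{
        uint16_t i = start, cnt = 0;

        *sz = 0;
        for (;;) {
                *sz += desc[i].len;
                if ((desc[i].flags & VRING_DESC_F_NEXT) == 0)
                        return (0);
                if (++cnt >= qs)
                        return (-1);    /* over-long chain: invalid table */
                i = desc[i].next & mask;
        }
}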
-
-int
-vmmci_ctl(unsigned int cmd)
-{
-       struct timeval tv = { 0, 0 };
-
-       if ((vmmci.cfg.device_status &
-           VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
-               return (-1);
-
-       if (cmd == vmmci.cmd)
-               return (0);
-
-       switch (cmd) {
-       case VMMCI_NONE:
-               break;
-       case VMMCI_SHUTDOWN:
-       case VMMCI_REBOOT:
-               /* Update command */
-               vmmci.cmd = cmd;
-
-               /*
-                * vmm VMs do not support powerdown, send a reboot request
-                * instead and turn it off after the triple fault.
-                */
-               if (cmd == VMMCI_SHUTDOWN)
-                       cmd = VMMCI_REBOOT;
-
-               /* Trigger interrupt */
-               vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
-               vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
-
-               /* Add ACK timeout */
-               tv.tv_sec = VMMCI_TIMEOUT;
-               evtimer_add(&vmmci.timeout, &tv);
-               break;
-       case VMMCI_SYNCRTC:
-               if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
-                       /* RTC updated, request guest VM resync of its RTC */
-                       vmmci.cmd = cmd;
-
-                       vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
-                       vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
-               } else {
-                       log_debug("%s: RTC sync skipped (guest does not "
-                           "support RTC sync)\n", __func__);
-               }
-               break;
-       default:
-               fatalx("invalid vmmci command: %d", cmd);
-       }
-
-       return (0);
-}
-
-void
-vmmci_ack(unsigned int cmd)
-{
-       struct timeval   tv = { 0, 0 };
-
-       switch (cmd) {
-       case VMMCI_NONE:
-               break;
-       case VMMCI_SHUTDOWN:
-               /*
-                * The shutdown was requested by the VM if we don't have
-                * a pending shutdown request.  In this case add a short
-                * timeout to give the VM a chance to reboot before the
-                * timer is expired.
-                */
-               if (vmmci.cmd == 0) {
-                       log_debug("%s: vm %u requested shutdown", __func__,
-                           vmmci.vm_id);
-                       tv.tv_sec = VMMCI_TIMEOUT;
-                       evtimer_add(&vmmci.timeout, &tv);
-                       return;
-               }
-               /* FALLTHROUGH */
-       case VMMCI_REBOOT:
-               /*
-                * If the VM acknowledged our shutdown request, give it
-                * enough time to shutdown or reboot gracefully.  This
-                * might take a considerable amount of time (running
-                * rc.shutdown on the VM), so increase the timeout before
-                * killing it forcefully.
-                */
-               if (cmd == vmmci.cmd &&
-                   evtimer_pending(&vmmci.timeout, NULL)) {
-                       log_debug("%s: vm %u acknowledged shutdown request",
-                           __func__, vmmci.vm_id);
-                       tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
-                       evtimer_add(&vmmci.timeout, &tv);
-               }
-               break;
-       case VMMCI_SYNCRTC:
-               log_debug("%s: vm %u acknowledged RTC sync request",
-                   __func__, vmmci.vm_id);
-               vmmci.cmd = VMMCI_NONE;
-               break;
-       default:
-               log_warnx("%s: illegal request %u", __func__, cmd);
-               break;
-       }
-}
-
-void
-vmmci_timeout(int fd, short type, void *arg)
-{
-       log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
-       vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
-}
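
/*
 * The shutdown handshake above is driven by a single libevent timer
 * armed in two stages: VMMCI_TIMEOUT when the request is posted, then
 * the longer VMMCI_SHUTDOWN_TIMEOUT once the guest acks, giving
 * rc.shutdown time to run before the VM is killed. Re-adding a pending
 * evtimer simply pushes the deadline out (illustrative helper):
 */
static void
vmmci_rearm(struct event *ev, time_t secs)
{
        struct timeval tv = { secs, 0 };

        evtimer_add(ev, &tv);   /* resets the deadline if already pending */
}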
-
-int
-vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
-    void *unused, uint8_t sz)
-{
-       *intr = 0xFF;
-
-       if (dir == 0) {
-               switch (reg) {
-               case VIRTIO_CONFIG_DEVICE_FEATURES:
-               case VIRTIO_CONFIG_QUEUE_SIZE:
-               case VIRTIO_CONFIG_ISR_STATUS:
-                       log_warnx("%s: illegal write %x to %s",
-                           __progname, *data, virtio_reg_name(reg));
-                       break;
-               case VIRTIO_CONFIG_GUEST_FEATURES:
-                       vmmci.cfg.guest_feature = *data;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_PFN:
-                       vmmci.cfg.queue_pfn = *data;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_SELECT:
-                       vmmci.cfg.queue_select = *data;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_NOTIFY:
-                       vmmci.cfg.queue_notify = *data;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_STATUS:
-                       vmmci.cfg.device_status = *data;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
-                       vmmci_ack(*data);
-                       break;
-               }
-       } else {
-               switch (reg) {
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
-                       *data = vmmci.cmd;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
-                       /* Update time once when reading the first register */
-                       gettimeofday(&vmmci.time, NULL);
-                       *data = (uint64_t)vmmci.time.tv_sec;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
-                       *data = (uint64_t)vmmci.time.tv_sec << 32;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
-                       *data = (uint64_t)vmmci.time.tv_usec;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
-                       *data = (uint64_t)vmmci.time.tv_usec << 32;
-                       break;
-               case VIRTIO_CONFIG_DEVICE_FEATURES:
-                       *data = vmmci.cfg.device_feature;
-                       break;
-               case VIRTIO_CONFIG_GUEST_FEATURES:
-                       *data = vmmci.cfg.guest_feature;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_PFN:
-                       *data = vmmci.cfg.queue_pfn;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_SIZE:
-                       *data = vmmci.cfg.queue_size;
-                       break;
-               case VIRTIO_CONFIG_QUEUE_SELECT:
-                       *data = vmmci.cfg.queue_select;
+                       *data = vmmci.cfg.queue_select;
                        break;
                case VIRTIO_CONFIG_QUEUE_NOTIFY:
                        *data = vmmci.cfg.queue_notify;
@@ -1678,36 +487,15 @@ virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
        return -1;
 }
 
-/*
- * Initializes a struct virtio_backing using the list of fds.
- */
-static int
-virtio_init_disk(struct virtio_backing *file, off_t *sz,
-    int *fd, size_t nfd, int type)
-{
-       /*
-        * probe disk types in order of preference, first one to work wins.
-        * TODO: provide a way of specifying the type and options.
-        */
-       switch (type) {
-       case VMDF_RAW:
-               return virtio_raw_init(file, sz, fd, nfd);
-       case VMDF_QCOW2:
-               return virtio_qcow2_init(file, sz, fd, nfd);
-       }
-       log_warnx("%s: invalid disk format", __func__);
-       return -1;
-}
-
 void
 virtio_init(struct vmd_vm *vm, int child_cdrom,
     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
        struct vmop_create_params *vmc = &vm->vm_params;
        struct vm_create_params *vcp = &vmc->vmc_params;
+       struct virtio_dev *dev;
        uint8_t id;
-       uint8_t i;
-       int ret;
+       uint8_t i, j;
 
        /* Virtio entropy device */
        if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
@@ -1737,105 +525,98 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
        viornd.irq = pci_get_dev_irq(id);
        viornd.vm_id = vcp->vcp_id;
 
-       if (vmc->vmc_nnics > 0) {
-               vionet = calloc(vmc->vmc_nnics, sizeof(struct vionet_dev));
-               if (vionet == NULL) {
-                       log_warn("%s: calloc failure allocating vionets",
-                           __progname);
-                       return;
-               }
+       SLIST_INIT(&virtio_devs);
 
-               nr_vionet = vmc->vmc_nnics;
-               /* Virtio network */
+       if (vmc->vmc_nnics > 0) {
                for (i = 0; i < vmc->vmc_nnics; i++) {
+                       dev = calloc(1, sizeof(struct virtio_dev));
+                       if (dev == NULL) {
+                               log_warn("%s: calloc failure allocating vionet",
+                                   __progname);
+                               return;
+                       }
+                       /* Virtio network */
+                       dev->dev_type = VMD_DEVTYPE_NET;
+
                        if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
-                           PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
-                           PCI_SUBCLASS_SYSTEM_MISC,
-                           PCI_VENDOR_OPENBSD,
-                           PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
+                               PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
+                               PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
+                               PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
                                log_warnx("%s: can't add PCI virtio net device",
                                    __progname);
                                return;
                        }
-
-                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_net_io,
-                           &vionet[i])) {
+                       dev->pci_id = id;
+                       dev->sync_fd = -1;
+                       dev->async_fd = -1;
+                       dev->vm_id = vcp->vcp_id;
+                       dev->vm_vmid = vm->vm_vmid;
+                       dev->irq = pci_get_dev_irq(id);
+
+                       /* The vionet pci bar function is called by the vcpu. */
+                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
+                           dev)) {
                                log_warnx("%s: can't add bar for virtio net "
                                    "device", __progname);
                                return;
                        }
 
-                       ret = pthread_mutex_init(&vionet[i].mutex, NULL);
-                       if (ret) {
-                               errno = ret;
-                               log_warn("%s: could not initialize mutex "
-                                   "for vionet device", __progname);
-                               return;
-                       }
-
-                       vionet[i].vq[RXQ].qs = VIONET_QUEUE_SIZE;
-                       vionet[i].vq[RXQ].vq_availoffset =
+                       dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
+                       dev->vionet.vq[RXQ].vq_availoffset =
                            sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
-                       vionet[i].vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
-                           sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
-                           + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
-                       vionet[i].vq[RXQ].last_avail = 0;
-                       vionet[i].vq[RXQ].notified_avail = 0;
-
-                       vionet[i].vq[TXQ].qs = VIONET_QUEUE_SIZE;
-                       vionet[i].vq[TXQ].vq_availoffset =
+                       dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
+                               sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
+                               + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
+                       dev->vionet.vq[RXQ].last_avail = 0;
+                       dev->vionet.vq[RXQ].notified_avail = 0;
+
+                       dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
+                       dev->vionet.vq[TXQ].vq_availoffset =
                            sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
-                       vionet[i].vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
-                           sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
-                           + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
-                       vionet[i].vq[TXQ].last_avail = 0;
-                       vionet[i].vq[TXQ].notified_avail = 0;
-                       vionet[i].fd = child_taps[i];
-                       vionet[i].vm_id = vcp->vcp_id;
-                       vionet[i].vm_vmid = vm->vm_vmid;
-                       vionet[i].irq = pci_get_dev_irq(id);
-
-                       event_set(&vionet[i].event, vionet[i].fd,
-                           EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
-                       if (event_add(&vionet[i].event, NULL)) {
-                               log_warn("could not initialize vionet event "
-                                   "handler");
-                               return;
-                       }
+                       dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
+                               sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
+                               + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
+                       dev->vionet.vq[TXQ].last_avail = 0;
+                       dev->vionet.vq[TXQ].notified_avail = 0;
+
+                       dev->vionet.data_fd = child_taps[i];
 
                        /* MAC address has been assigned by the parent */
-                       memcpy(&vionet[i].mac, &vmc->vmc_macs[i], 6);
-                       vionet[i].cfg.device_feature = VIRTIO_NET_F_MAC;
+                       memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
+                       dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
 
-                       vionet[i].lockedmac =
+                       dev->vionet.lockedmac =
                            vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
-                       vionet[i].local =
+                       dev->vionet.local =
                            vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
                        if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
-                               vionet[i].pxeboot = 1;
-                       vionet[i].idx = i;
-                       vionet[i].pci_id = id;
+                               dev->vionet.pxeboot = 1;
 
                        log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
                            __func__, vcp->vcp_name, i,
-                           ether_ntoa((void *)vionet[i].mac),
-                           vionet[i].lockedmac ? ", locked" : "",
-                           vionet[i].local ? ", local" : "",
-                           vionet[i].pxeboot ? ", pxeboot" : "");
+                           ether_ntoa((void *)dev->vionet.mac),
+                           dev->vionet.lockedmac ? ", locked" : "",
+                           dev->vionet.local ? ", local" : "",
+                           dev->vionet.pxeboot ? ", pxeboot" : "");
+
+                       /* Add the vionet to our device list. */
+                       dev->vionet.idx = i;
+                       SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
                }
        }
 
        if (vmc->vmc_ndisks > 0) {
-               nr_vioblk = vmc->vmc_ndisks;
-               vioblk = calloc(vmc->vmc_ndisks, sizeof(struct vioblk_dev));
-               if (vioblk == NULL) {
-                       log_warn("%s: calloc failure allocating vioblks",
-                           __progname);
-                       return;
-               }
-
-               /* One virtio block device for each disk defined in vcp */
                for (i = 0; i < vmc->vmc_ndisks; i++) {
+                       dev = calloc(1, sizeof(struct virtio_dev));
+                       if (dev == NULL) {
+                               log_warn("%s: calloc failure allocating vioblk",
+                                   __progname);
+                               return;
+                       }
+
+                       /* One vioblk device for each disk defined in vcp */
+                       dev->dev_type = VMD_DEVTYPE_DISK;
+
                        if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
                            PCI_PRODUCT_QUMRANET_VIO_BLOCK,
                            PCI_CLASS_MASS_STORAGE,
@@ -1846,35 +627,53 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
                                    "device", __progname);
                                return;
                        }
-                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_blk_io,
-                           &vioblk[i])) {
+                       dev->pci_id = id;
+                       dev->sync_fd = -1;
+                       dev->async_fd = -1;
+                       dev->vm_id = vcp->vcp_id;
+                       dev->vm_vmid = vm->vm_vmid;
+                       dev->irq = pci_get_dev_irq(id);
+
+                       if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
+                           &dev->vioblk)) {
                                log_warnx("%s: can't add bar for virtio block "
                                    "device", __progname);
                                return;
                        }
-                       vioblk[i].vq[0].qs = VIOBLK_QUEUE_SIZE;
-                       vioblk[i].vq[0].vq_availoffset =
+                       dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
+                       dev->vioblk.vq[0].vq_availoffset =
                            sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
-                       vioblk[i].vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
+                       dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
                            sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
                            + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
-                       vioblk[i].vq[0].last_avail = 0;
-                       vioblk[i].cfg.device_feature = VIRTIO_BLK_F_SIZE_MAX;
-                       vioblk[i].max_xfer = 1048576;
-                       vioblk[i].pci_id = id;
-                       vioblk[i].vm_id = vcp->vcp_id;
-                       vioblk[i].irq = pci_get_dev_irq(id);
-                       if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-                           child_disks[i], vmc->vmc_diskbases[i],
-                           vmc->vmc_disktypes[i]) == -1) {
-                               log_warnx("%s: unable to determine disk format",
-                                   __func__);
-                               return;
-                       }
-                       vioblk[i].sz /= 512;
+                       dev->vioblk.vq[0].last_avail = 0;
+                       dev->vioblk.cfg.device_feature =
+                           VIRTIO_BLK_F_SIZE_MAX;
+                       dev->vioblk.max_xfer = 1048576;
+
+                       /*
+                        * Initialize disk fds to an invalid fd (-1), then
+                        * set any child disk fds.
+                        */
+                       memset(&dev->vioblk.disk_fd, -1,
+                           sizeof(dev->vioblk.disk_fd));
+                       dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
+                       for (j = 0; j < dev->vioblk.ndisk_fd; j++)
+                               dev->vioblk.disk_fd[j] = child_disks[i][j];
+
+                       dev->vioblk.idx = i;
+                       SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
                }
        }
 
+       /*
+        * Launch virtio devices that support subprocess execution.
+        */
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               if (virtio_dev_launch(vm, dev) != 0)
+                       fatalx("failed to launch virtio device");
+       }
+
        /* vioscsi cdrom */
        if (strlen(vmc->vmc_cdrom)) {
                vioscsi = calloc(1, sizeof(struct vioscsi_dev));
@@ -1901,7 +700,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
                        return;
                }
 
-               for ( i = 0; i < VIRTIO_MAX_QUEUES; i++) {
+               for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
                        vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
                        vioscsi->vq[i].vq_availoffset =
                            sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
@@ -1910,15 +709,15 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
                            + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
                        vioscsi->vq[i].last_avail = 0;
                }
-               if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
-                   &child_cdrom, 1, VMDF_RAW) == -1) {
+               if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
+                   1) == -1) {
                        log_warnx("%s: unable to determine iso format",
                            __func__);
                        return;
                }
                vioscsi->locked = 0;
                vioscsi->lba = 0;
-               vioscsi->n_blocks = vioscsi->sz >> 11; /* num of 2048 blocks in file */
+               vioscsi->n_blocks = vioscsi->sz >> 2; /* num of 2048-byte blocks in file */
                vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
                vioscsi->pci_id = id;
                vioscsi->vm_id = vcp->vcp_id;
@@ -1967,27 +766,84 @@ virtio_init(struct vmd_vm *vm, int child_cdrom,
 void
 vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
 {
-       struct vmop_create_params *vmc = &vm->vm_params;
-       struct vionet_dev         *dev;
+       struct vmop_create_params       *vmc = &vm->vm_params;
+       struct virtio_dev               *dev;
+       struct vionet_dev               *vionet = NULL;
+       int ret;
 
        if (idx > vmc->vmc_nnics)
-               fatalx("vionet_set_hostmac");
+               fatalx("%s: invalid vionet index: %u", __func__, idx);
 
-       dev = &vionet[idx];
-       memcpy(dev->hostmac, addr, sizeof(dev->hostmac));
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               if (dev->dev_type == VMD_DEVTYPE_NET
+                   && dev->vionet.idx == idx) {
+                       vionet = &dev->vionet;
+                       break;
+               }
+       }
+       if (vionet == NULL)
+               fatalx("%s: vionet == NULL, idx = %u", __func__, idx);
+
+       /* Set the local vm process copy. */
+       memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));
+
+       /* Send the information to the device process. */
+       ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
+           vionet->hostmac, sizeof(vionet->hostmac));
+       if (ret == -1) {
+               log_warnx("%s: failed to queue hostmac to vionet dev %u",
+                   __func__, idx);
+               return;
+       }
 }
 
 void
 virtio_shutdown(struct vmd_vm *vm)
 {
-       int i;
+       int ret, status;
+       pid_t pid = 0;
+       struct virtio_dev *dev, *tmp;
+       struct viodev_msg msg;
+       struct imsgbuf *ibuf;
 
-       /* ensure that our disks are synced */
+       /* Ensure that our disks are synced. */
        if (vioscsi != NULL)
                vioscsi->file.close(vioscsi->file.p, 0);
 
-       for (i = 0; i < nr_vioblk; i++)
-               vioblk[i].file.close(vioblk[i].file.p, 0);
+       /*
+        * Broadcast shutdown to child devices. We need to do this
+        * synchronously as we have already stopped the async event thread.
+        */
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               memset(&msg, 0, sizeof(msg));
+               msg.type = VIODEV_MSG_SHUTDOWN;
+               ibuf = &dev->sync_iev.ibuf;
+               ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
+                   &msg, sizeof(msg));
+               if (ret == -1)
+                       fatalx("%s: failed to send shutdown to device",
+                           __func__);
+               if (imsg_flush(ibuf) == -1)
+                       fatalx("%s: imsg_flush", __func__);
+       }
+
+       /*
+        * Wait for all children to shut down using a simple approach of
+        * iterating over known child devices and waiting for them to die.
+        */
+       SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
+               log_debug("%s: waiting on device pid %d", __func__,
+                   dev->dev_pid);
+               do {
+                       pid = waitpid(dev->dev_pid, &status, WNOHANG);
+               } while (pid == 0 || (pid == -1 && errno == EINTR));
+               if (pid == dev->dev_pid)
+                       log_debug("%s: device for pid %d is stopped",
+                           __func__, pid);
+               else
+                       log_warnx("%s: unexpected pid %d", __func__, pid);
+               free(dev);
+       }
 }
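
/*
 * The WNOHANG loop above amounts to a blocking wait on each device
 * process. The blocking equivalent under the same assumptions
 * (illustrative helper; needs <sys/wait.h> and <errno.h>):
 */
static pid_t
reap_device(pid_t pid)
{
        pid_t ret;
        int status;

        do {
                ret = waitpid(pid, &status, 0);
        } while (ret == -1 && errno == EINTR);
        return (ret);
}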
 
 int
@@ -2042,67 +898,52 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
 {
        struct vmop_create_params *vmc = &vm->vm_params;
        struct vm_create_params *vcp = &vmc->vmc_params;
+       struct virtio_dev *dev;
        uint8_t i;
-       int ret;
-       void *hva = NULL;
 
-       nr_vionet = vmc->vmc_nnics;
-       if (vmc->vmc_nnics > 0) {
-               vionet = calloc(vmc->vmc_nnics, sizeof(struct vionet_dev));
-               if (vionet == NULL) {
-                       log_warn("%s: calloc failure allocating vionets",
+       if (vmc->vmc_nnics == 0)
+               return (0);
+
+       for (i = 0; i < vmc->vmc_nnics; i++) {
+               dev = calloc(1, sizeof(struct virtio_dev));
+               if (dev == NULL) {
+                       log_warn("%s: calloc failure allocating vionet",
                            __progname);
                        return (-1);
                }
-               log_debug("%s: receiving vionet", __func__);
-               if (atomicio(read, fd, vionet,
-                   vmc->vmc_nnics * sizeof(struct vionet_dev)) !=
-                   vmc->vmc_nnics * sizeof(struct vionet_dev)) {
+
+               log_debug("%s: receiving virtio network device", __func__);
+               if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
+                   != sizeof(struct virtio_dev)) {
                        log_warnx("%s: error reading vionet from fd",
                            __func__);
                        return (-1);
                }
 
                /* Virtio network */
-               for (i = 0; i < vmc->vmc_nnics; i++) {
-                       if (pci_set_bar_fn(vionet[i].pci_id, 0, virtio_net_io,
-                           &vionet[i])) {
-                               log_warnx("%s: can't set bar fn for virtio net "
-                                   "device", __progname);
-                               return (-1);
-                       }
+               if (dev->dev_type != VMD_DEVTYPE_NET) {
+                       log_warnx("%s: invalid device type", __func__);
+                       return (-1);
+               }
 
-                       memset(&vionet[i].mutex, 0, sizeof(pthread_mutex_t));
-                       ret = pthread_mutex_init(&vionet[i].mutex, NULL);
+               dev->sync_fd = -1;
+               dev->async_fd = -1;
+               dev->vm_id = vcp->vcp_id;
+               dev->vm_vmid = vm->vm_vmid;
+               dev->irq = pci_get_dev_irq(dev->pci_id);
 
-                       if (ret) {
-                               errno = ret;
-                               log_warn("%s: could not initialize mutex "
-                                   "for vionet device", __progname);
-                               return (-1);
-                       }
-                       vionet[i].fd = child_taps[i];
-                       vionet[i].vm_id = vcp->vcp_id;
-                       vionet[i].vm_vmid = vm->vm_vmid;
-                       vionet[i].irq = pci_get_dev_irq(vionet[i].pci_id);
-
-                       hva = hvaddr_mem(vionet[i].vq[RXQ].q_gpa,
-                           vring_size(VIONET_QUEUE_SIZE));
-                       if (hva == NULL)
-                               fatal("failed to restore vionet RX virtqueue");
-                       vionet[i].vq[RXQ].q_hva = hva;
-
-                       hva = hvaddr_mem(vionet[i].vq[TXQ].q_gpa,
-                           vring_size(VIONET_QUEUE_SIZE));
-                       if (hva == NULL)
-                               fatal("failed to restore vionet TX virtqueue");
-                       vionet[i].vq[TXQ].q_hva = hva;
-
-                       memset(&vionet[i].event, 0, sizeof(struct event));
-                       event_set(&vionet[i].event, vionet[i].fd,
-                           EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
+               if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
+                       log_warnx("%s: can't set bar fn for virtio net "
+                           "device", __progname);
+                       return (-1);
                }
+
+               dev->vionet.data_fd = child_taps[i];
+               dev->vionet.idx = i;
+
+               SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
        }
+
        return (0);
 }
 
@@ -2110,44 +951,50 @@ int
 vioblk_restore(int fd, struct vmd_vm *vm,
     int child_disks[][VM_MAX_BASE_PER_DISK])
 {
-       uint8_t i;
-       void *hva = NULL;
+       struct vmop_create_params *vmc = &vm->vm_params;
+       struct virtio_dev *dev;
+       uint8_t i, j;
 
-       nr_vioblk = vm->vm_params.vmc_ndisks;
-       vioblk = calloc(vm->vm_params.vmc_ndisks, sizeof(struct vioblk_dev));
-       if (vioblk == NULL) {
-               log_warn("%s: calloc failure allocating vioblks", __progname);
-               return (-1);
-       }
-       log_debug("%s: receiving vioblk", __func__);
-       if (atomicio(read, fd, vioblk,
-           nr_vioblk * sizeof(struct vioblk_dev)) !=
-           nr_vioblk * sizeof(struct vioblk_dev)) {
-               log_warnx("%s: error reading vioblk from fd", __func__);
-               return (-1);
-       }
-       for (i = 0; i < vm->vm_params.vmc_ndisks; i++) {
-               if (pci_set_bar_fn(vioblk[i].pci_id, 0, virtio_blk_io,
-                   &vioblk[i])) {
-                       log_warnx("%s: can't set bar fn for virtio block "
-                           "device", __progname);
+       if (vmc->vmc_ndisks == 0)
+               return (0);
+
+       for (i = 0; i < vmc->vmc_ndisks; i++) {
+               dev = calloc(1, sizeof(struct virtio_dev));
+               if (dev == NULL) {
+                       log_warn("%s: calloc failure allocating vioblk",
+                           __progname);
                        return (-1);
                }
-               if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-                   child_disks[i], vm->vm_params.vmc_diskbases[i],
-                   vm->vm_params.vmc_disktypes[i]) == -1)  {
-                       log_warnx("%s: unable to determine disk format",
-                           __func__);
+
+               log_debug("%s: receiving vioblk", __func__);
+               if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
+                   != sizeof(struct virtio_dev)) {
+                       log_warnx("%s: error reading vioblk from fd", __func__);
+                       return (-1);
+               }
+               if (dev->dev_type != VMD_DEVTYPE_DISK) {
+                       log_warnx("%s: invalid device type", __func__);
                        return (-1);
                }
-               vioblk[i].vm_id = vm->vm_params.vmc_params.vcp_id;
-               vioblk[i].irq = pci_get_dev_irq(vioblk[i].pci_id);
 
-               hva = hvaddr_mem(vioblk[i].vq[0].q_gpa,
-                   vring_size(VIOBLK_QUEUE_SIZE));
-               if (hva == NULL)
-                       fatal("failed to restore vioblk virtqueue");
-               vioblk[i].vq[0].q_hva = hva;
+               dev->sync_fd = -1;
+               dev->async_fd = -1;
+
+               if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
+                       log_warnx("%s: can't set bar fn for virtio block "
+                           "device", __progname);
+                       return (-1);
+               }
+               dev->vm_id = vmc->vmc_params.vcp_id;
+               dev->irq = pci_get_dev_irq(dev->pci_id);
+
+               memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
+               dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
+               for (j = 0; j < dev->vioblk.ndisk_fd; j++)
+                       dev->vioblk.disk_fd[j] = child_disks[i][j];
+
+               dev->vioblk.idx = i;
+               SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
        }
        return (0);
 }
@@ -2181,11 +1028,6 @@ vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
                return (-1);
        }
 
-       if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
-           VMDF_RAW) == -1) {
-               log_warnx("%s: unable to determine iso format", __func__);
-               return (-1);
-       }
        vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
        vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
 
@@ -2205,22 +1047,30 @@ int
 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
+       struct virtio_dev *dev;
        int ret;
 
+       SLIST_INIT(&virtio_devs);
+
        if ((ret = viornd_restore(fd, vm)) == -1)
-               return ret;
+               return (ret);
 
        if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
-               return ret;
+               return (ret);
 
        if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
-               return ret;
+               return (ret);
 
        if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
-               return ret;
+               return (ret);
 
        if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
-               return ret;
+               return (ret);
+
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               if (virtio_dev_launch(vm, dev) != 0)
+                       fatalx("%s: failed to restore virtio dev", __func__);
+       }
 
        return (0);
 }
@@ -2254,40 +1104,114 @@ vmmci_dump(int fd)
 int
 vionet_dump(int fd)
 {
-       int i;
+       struct virtio_dev       *dev, temp;
+       struct viodev_msg        msg;
+       struct imsg              imsg;
+       struct imsgbuf          *ibuf = NULL;
+       size_t                   sz;
+       int                      ret;
+
+       log_debug("%s: dumping vionet", __func__);
+
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               if (dev->dev_type != VMD_DEVTYPE_NET)
+                       continue;
+
+               memset(&msg, 0, sizeof(msg));
+               memset(&imsg, 0, sizeof(imsg));
+
+               ibuf = &dev->sync_iev.ibuf;
+               msg.type = VIODEV_MSG_DUMP;
+
+               ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+                   sizeof(msg));
+               if (ret == -1) {
+                       log_warnx("%s: failed requesting dump of vionet[%d]",
+                           __func__, dev->vionet.idx);
+                       return (-1);
+               }
+               if (imsg_flush(ibuf) == -1) {
+                       log_warnx("%s: imsg_flush", __func__);
+                       return (-1);
+               }
+
+               sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
+               if (sz != sizeof(temp)) {
+                       log_warnx("%s: failed to dump vionet[%d]", __func__,
+                           dev->vionet.idx);
+                       return (-1);
+               }
 
-       log_debug("%s: sending vionet", __func__);
+               temp.vionet.vq[RXQ].q_hva = NULL;
+               temp.vionet.vq[TXQ].q_hva = NULL;
+               temp.async_fd = -1;
+               temp.sync_fd = -1;
+               memset(&temp.async_iev, 0, sizeof(temp.async_iev));
+               memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
 
-       for (i = 0; i < nr_vionet; i++) {
-               vionet[i].vq[RXQ].q_hva = NULL;
-               vionet[i].vq[TXQ].q_hva = NULL;
+               if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
+                       log_warnx("%s: error writing vionet to fd", __func__);
+                       return (-1);
+               }
        }
 
-       if (atomicio(vwrite, fd, vionet,
-           nr_vionet * sizeof(struct vionet_dev)) !=
-           nr_vionet * sizeof(struct vionet_dev)) {
-               log_warnx("%s: error writing vionet to fd", __func__);
-               return (-1);
-       }
        return (0);
 }
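
/*
 * The dump above is a plain request/response over the sync channel:
 * the vm process sends VIODEV_MSG_DUMP, then reads the device's state
 * struct straight off the socket with atomicio. A sketch of the
 * matching reply on the device-process side (handler name is
 * illustrative; the real handler lives in the device process):
 */
static int
dump_reply(struct virtio_dev *d, int fd)
{
        if (atomicio(vwrite, fd, d, sizeof(*d)) != sizeof(*d)) {
                log_warnx("%s: failed to write dump reply", __func__);
                return (-1);
        }
        return (0);
}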
 
 int
 vioblk_dump(int fd)
 {
-       int i;
+       struct virtio_dev       *dev, temp;
+       struct viodev_msg        msg;
+       struct imsg              imsg;
+       struct imsgbuf          *ibuf = NULL;
+       size_t                   sz;
+       int                      ret;
+
+       log_debug("%s: dumping vioblk", __func__);
+
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               if (dev->dev_type != VMD_DEVTYPE_DISK)
+                       continue;
+
+               memset(&msg, 0, sizeof(msg));
+               memset(&imsg, 0, sizeof(imsg));
+
+               ibuf = &dev->sync_iev.ibuf;
+               msg.type = VIODEV_MSG_DUMP;
+
+               ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+                   sizeof(msg));
+               if (ret == -1) {
+                       log_warnx("%s: failed requesting dump of vioblk[%d]",
+                           __func__, dev->vioblk.idx);
+                       return (-1);
+               }
+               if (imsg_flush(ibuf) == -1) {
+                       log_warnx("%s: imsg_flush", __func__);
+                       return (-1);
+               }
 
-       log_debug("%s: sending vioblk", __func__);
 
-       for (i = 0; i < nr_vioblk; i++)
-               vioblk[i].vq[0].q_hva = NULL;
+               sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
+               if (sz != sizeof(temp)) {
+                       log_warnx("%s: failed to dump vioblk[%d]", __func__,
+                           dev->vioblk.idx);
+                       return (-1);
+               }
 
-       if (atomicio(vwrite, fd, vioblk,
-           nr_vioblk * sizeof(struct vioblk_dev)) !=
-           nr_vioblk * sizeof(struct vioblk_dev)) {
-               log_warnx("%s: error writing vioblk to fd", __func__);
-               return (-1);
+               temp.vioblk.vq[0].q_hva = NULL;
+               temp.async_fd = -1;
+               temp.sync_fd = -1;
+               memset(&temp.async_iev, 0, sizeof(temp.async_iev));
+               memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
+
+               if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
+                       log_warnx("%s: error writing vioblk to fd", __func__);
+                       return (-1);
+               }
        }
+
        return (0);
 }
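Both dump routines above follow the same synchronous handshake: compose a
VIODEV_MSG_DUMP on the device's sync channel, flush it, read a raw copy of
the child's virtio_dev back with atomicio(3), then scrub host-only state
before writing the struct to the dump fd. A condensed sketch of that shared
pattern (the helper name is hypothetical; the real scrubbing, including the
per-type vq q_hva pointers, is done inline above):

	/* Hypothetical helper illustrating the dump handshake. */
	static int
	dump_virtio_dev(struct virtio_dev *dev, int fd)
	{
		struct viodev_msg msg;
		struct virtio_dev temp;

		memset(&msg, 0, sizeof(msg));
		msg.type = VIODEV_MSG_DUMP;

		if (imsg_compose(&dev->sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0,
		    -1, &msg, sizeof(msg)) == -1 ||
		    imsg_flush(&dev->sync_iev.ibuf) == -1)
			return (-1);

		/* The child replies with a raw copy of its virtio_dev. */
		if (atomicio(read, dev->sync_fd, &temp, sizeof(temp))
		    != sizeof(temp))
			return (-1);

		/* Host-only state (fds, imsg buffers) must not be dumped. */
		temp.sync_fd = temp.async_fd = -1;
		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
		memset(&temp.async_iev, 0, sizeof(temp.async_iev));

		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp))
			return (-1);
		return (0);
	}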
 
@@ -2338,12 +1262,15 @@ virtio_dump(int fd)
 void
 virtio_stop(struct vmd_vm *vm)
 {
-       uint8_t i;
-       for (i = 0; i < vm->vm_params.vmc_nnics; i++) {
-               if (event_del(&vionet[i].event)) {
-                       log_warn("could not initialize vionet event "
-                           "handler");
-                       return;
+       struct virtio_dev *dev;
+       int ret;
+
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_PAUSE_VM,
+                   0, 0, -1, NULL, 0);
+               if (ret == -1) {
+                       log_warnx("%s: failed to compose pause msg to device",
+                               __func__);
                }
        }
 }
@@ -2351,12 +1278,498 @@ virtio_stop(struct vmd_vm *vm)
 void
 virtio_start(struct vmd_vm *vm)
 {
-       uint8_t i;
-       for (i = 0; i < vm->vm_params.vmc_nnics; i++) {
-               if (event_add(&vionet[i].event, NULL)) {
-                       log_warn("could not initialize vionet event "
-                           "handler");
+       struct virtio_dev *dev;
+       int ret;
+
+       SLIST_FOREACH(dev, &virtio_devs, dev_next) {
+               ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_UNPAUSE_VM,
+                   0, 0, -1, NULL, 0);
+               if (ret == -1) {
+                       log_warnx("%s: failed to compose start msg to device",
+                           __func__);
+               }
+       }
+}
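virtio_stop() and virtio_start() only queue IMSG_VMDOP_PAUSE_VM and
IMSG_VMDOP_UNPAUSE_VM on each device's async channel; the handlers live in
the new vionet.c and vioblk.c, which are not part of this excerpt. A minimal
sketch of what the child-side dispatch might look like (hypothetical, for
illustration; queue_ev is an assumed event):

	/* Hypothetical child-side handling of pause/unpause. */
	switch (imsg.hdr.type) {
	case IMSG_VMDOP_PAUSE_VM:
		/* Stop servicing virtqueue notifications. */
		event_del(&queue_ev);
		break;
	case IMSG_VMDOP_UNPAUSE_VM:
		/* Resume servicing virtqueue notifications. */
		event_add(&queue_ev, NULL);
		break;
	}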
+
+/*
+ * Fork+exec a child virtio device. Returns 0 on success.
+ */
+static int
+virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
+{
+       char *nargv[8], num[32], t[2];
+       pid_t dev_pid;
+       int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
+       size_t i, j, data_fds_sz, sz = 0;
+       struct virtio_dev *d = NULL;
+       struct viodev_msg msg;
+       struct imsg imsg;
+       struct imsgev *iev = &dev->sync_iev;
+
+       switch (dev->dev_type) {
+       case VMD_DEVTYPE_NET:
+               data_fds[0] = dev->vionet.data_fd;
+               data_fds_sz = 1;
+               log_debug("%s: launching vionet[%d]",
+                   vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
+               break;
+       case VMD_DEVTYPE_DISK:
+               memcpy(&data_fds, dev->vioblk.disk_fd, sizeof(data_fds));
+               data_fds_sz = dev->vioblk.ndisk_fd;
+               log_debug("%s: launching vioblk[%d]",
+                   vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
+               break;
+       default:
+               log_warnx("%s: invalid device type", __func__);
+               return (EINVAL);
+       }
+
+       /* We need two channels: one synchronous (IO reads) and one async. */
+       if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sync_fds) == -1) {
+               log_warn("failed to create socketpair");
+               return (errno);
+       }
+       if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, async_fds) == -1) {
+               log_warn("failed to create async socketpair");
+               return (errno);
+       }
+
+       /* Keep communication channels open after exec. */
+       if (fcntl(sync_fds[1], F_SETFD, 0)) {
+               ret = errno;
+               log_warn("%s: fcntl", __func__);
+               goto err;
+       }
+       if (fcntl(async_fds[1], F_SETFD, 0)) {
+               ret = errno;
+               log_warn("%s: fcntl", __func__);
+               goto err;
+       }
+
+       /* Keep data file descriptors open after exec. */
+       for (i = 0; i < data_fds_sz; i++) {
+               log_debug("%s: marking fd %d !close-on-exec", __func__,
+                   data_fds[i]);
+               if (fcntl(data_fds[i], F_SETFD, 0)) {
+                       ret = errno;
+                       log_warn("%s: fcntl", __func__);
+                       goto err;
+               }
+       }
+
+       /* Fork... */
+       dev_pid = fork();
+       if (dev_pid == -1) {
+               ret = errno;
+               log_warn("%s: fork failed", __func__);
+               goto err;
+       }
+
+       if (dev_pid > 0) {
+               /* Parent */
+               close_fd(sync_fds[1]);
+               close_fd(async_fds[1]);
+
+               /* Save the child's pid to help with cleanup. */
+               dev->dev_pid = dev_pid;
+
+               /* Set the channel fds to the child's before sending. */
+               dev->sync_fd = sync_fds[1];
+               dev->async_fd = async_fds[1];
+
+               /* Close data fds. Only the child device needs them now. */
+               for (i = 0; i < data_fds_sz; i++)
+                       close_fd(data_fds[i]);
+
+               /* Set our synchronous channel to non-blocking. */
+               if (fcntl(sync_fds[0], F_SETFL, O_NONBLOCK) == -1) {
+                       ret = errno;
+                       log_warn("%s: fcntl", __func__);
+                       goto err;
+               }
+
+               /* 1. Send over our configured device. */
+               log_debug("%s: sending '%c' type device struct", __func__,
+                       dev->dev_type);
+               sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
+               if (sz != sizeof(*dev)) {
+                       log_warnx("%s: failed to send device", __func__);
+                       ret = EIO;
+                       goto err;
+               }
+
+               /* 2. Send over details on the VM (including memory fds). */
+               log_debug("%s: sending vm message for '%s'", __func__,
+                       vm->vm_params.vmc_params.vcp_name);
+               sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
+               if (sz != sizeof(*vm)) {
+                       log_warnx("%s: failed to send vm details", __func__);
+                       ret = EIO;
+                       goto err;
+               }
+
+               /*
+                * Initialize our imsg channel to the child device. The initial
+                * communication will be synchronous. We expect the child to
+                * report itself "ready" to confirm the launch was a success.
+                */
+               imsg_init(&iev->ibuf, sync_fds[0]);
+               do
+                       ret = imsg_read(&iev->ibuf);
+               while (ret == -1 && errno == EAGAIN);
+               if (ret == 0 || ret == -1) {
+                       log_warnx("%s: failed to receive ready message from "
+                           "'%c' type device", __func__, dev->dev_type);
+                       ret = EIO;
+                       goto err;
+               }
+               ret = 0;
+
+               log_debug("%s: receiving reply", __func__);
+               if (imsg_get(&iev->ibuf, &imsg) < 1) {
+                       log_warnx("%s: imsg_get", __func__);
+                       ret = EIO;
+                       goto err;
+               }
+               IMSG_SIZE_CHECK(&imsg, &msg);
+               memcpy(&msg, imsg.data, sizeof(msg));
+               imsg_free(&imsg);
+
+               if (msg.type != VIODEV_MSG_READY) {
+                       log_warnx("%s: expected ready message, got type %d",
+                           __func__, msg.type);
+                       ret = EINVAL;
+                       goto err;
+               }
+               log_debug("%s: device reports ready via sync channel",
+                   __func__);
+
+               /*
+                * Wire in the async event handling, but after reverting back
+                * to the parent's fd's.
+                */
+               dev->sync_fd = sync_fds[0];
+               dev->async_fd = async_fds[0];
+               vm_device_pipe(dev, virtio_dispatch_dev);
+       } else {
+               /* Child */
+               close_fd(async_fds[0]);
+               close_fd(sync_fds[0]);
+
+               /*
+                * Close any other device fd's we know aren't
+                * ours. This releases any exclusive locks held on
+                * things like disk images.
+                */
+               SLIST_FOREACH(d, &virtio_devs, dev_next) {
+                       if (d == dev)
+                               continue;
+
+                       switch (d->dev_type) {
+                       case VMD_DEVTYPE_DISK:
+                               for (j = 0; j < d->vioblk.ndisk_fd; j++)
+                                       close_fd(d->vioblk.disk_fd[j]);
+                               break;
+                       case VMD_DEVTYPE_NET:
+                               close_fd(d->vionet.data_fd);
+                               break;
+                       default:
+                               fatalx("%s: invalid device type '%c'",
+                                   __func__, d->dev_type);
+                       }
+               }
+
+               memset(&nargv, 0, sizeof(nargv));
+               memset(num, 0, sizeof(num));
+               snprintf(num, sizeof(num), "%d", sync_fds[1]);
+
+               t[0] = dev->dev_type;
+               t[1] = '\0';
+
+               nargv[0] = env->argv0;
+               nargv[1] = "-X";
+               nargv[2] = num;
+               nargv[3] = "-t";
+               nargv[4] = t;
+               nargv[5] = "-n";
+
+               if (env->vmd_verbose) {
+                       nargv[6] = "-v";
+                       nargv[7] = NULL;
+               } else
+                       nargv[6] = NULL;
+
+               /* Control resumes in vmd.c:main(). */
+               execvp(nargv[0], nargv);
+
+               ret = errno;
+               log_warn("%s: failed to exec device", __func__);
+               _exit(ret);
+               /* NOTREACHED */
+       }
+
+       return (ret);
+
+err:
+       close_fd(sync_fds[0]);
+       close_fd(sync_fds[1]);
+       close_fd(async_fds[0]);
+       close_fd(async_fds[1]);
+       return (ret);
+}
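This launch dance is the heart of the multi-process model: both ends of a
socketpair(2) are created before fork(2), the child's end is made to survive
execve(2), and the child re-execs vmd with the fd number on its command
line. Boiled down to a standalone sketch (names and the single-channel shape
are illustrative, not vmd's):

	#include <sys/socket.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Sketch: fork+exec a child that finds its channel by fd number. */
	int
	launch_child(const char *argv0)
	{
		char num[32];
		int sv[2];

		if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sv) == -1)
			return (-1);
		/* Ensure the child's end is not close-on-exec. */
		if (fcntl(sv[1], F_SETFD, 0) == -1)
			return (-1);

		switch (fork()) {
		case -1:
			return (-1);
		case 0:
			/* Child: re-exec self, handing over the fd number. */
			snprintf(num, sizeof(num), "%d", sv[1]);
			execlp(argv0, argv0, "-X", num, (char *)NULL);
			_exit(1);
		default:
			/* Parent keeps sv[0] as its end of the channel. */
			close(sv[1]);
			return (sv[0]);
		}
	}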
+
+/*
+ * Initialize an async imsg channel for a virtio device.
+ */
+int
+vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *))
+{
+       struct imsgev *iev = &dev->async_iev;
+       int fd = dev->async_fd;
+
+       log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
+           dev->dev_type, fd);
+
+       if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
+               log_warn("failed to set nonblocking mode on vm device pipe");
+               return (-1);
+       }
+
+       imsg_init(&iev->ibuf, fd);
+       iev->handler = cb;
+       iev->data = dev;
+       iev->events = EV_READ;
+       imsg_event_add(iev);
+
+       return (0);
+}
+
+void
+virtio_dispatch_dev(int fd, short event, void *arg)
+{
+       struct virtio_dev       *dev = (struct virtio_dev*)arg;
+       struct imsgev           *iev = &dev->async_iev;
+       struct imsgbuf          *ibuf = &iev->ibuf;
+       struct imsg              imsg;
+       struct viodev_msg        msg;
+       ssize_t                  n = 0;
+
+       if (event & EV_READ) {
+               if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
+                       fatal("%s: imsg_read", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_READ)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
                        return;
                }
        }
+
+       if (event & EV_WRITE) {
+               if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
+                       fatal("%s: msgbuf_write", __func__);
+               if (n == 0) {
+                       /* this pipe is dead, so remove the event handler */
+                       log_debug("%s: pipe dead (EV_WRITE)", __func__);
+                       event_del(&iev->ev);
+                       event_loopexit(NULL);
+                       return;
+               }
+       }
+
+       for (;;) {
+               if ((n = imsg_get(ibuf, &imsg)) == -1)
+                       fatal("%s: imsg_get", __func__);
+               if (n == 0)
+                       break;
+
+               switch (imsg.hdr.type) {
+               case IMSG_DEVOP_MSG:
+                       IMSG_SIZE_CHECK(&imsg, &msg);
+                       memcpy(&msg, imsg.data, sizeof(msg));
+                       handle_dev_msg(&msg, dev);
+                       break;
+               default:
+                       log_warnx("%s: got non devop imsg %d", __func__,
+                           imsg.hdr.type);
+                       break;
+               }
+               imsg_free(&imsg);
+       }
+       imsg_event_add(iev);
+}
+
+static int
+handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
+{
+       uint32_t vm_id = gdev->vm_id;
+       int irq = gdev->irq;
+
+       switch (msg->type) {
+       case VIODEV_MSG_KICK:
+               if (msg->state == INTR_STATE_ASSERT)
+                       vcpu_assert_pic_irq(vm_id, msg->vcpu, irq);
+               else if (msg->state == INTR_STATE_DEASSERT)
+                       vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq);
+               break;
+       case VIODEV_MSG_READY:
+               log_debug("%s: device reports ready", __func__);
+               break;
+       case VIODEV_MSG_ERROR:
+               log_warnx("%s: device reported error", __func__);
+               break;
+       case VIODEV_MSG_INVALID:
+       case VIODEV_MSG_IO_READ:
+       case VIODEV_MSG_IO_WRITE:
+               /* FALLTHROUGH */
+       default:
+               log_warnx("%s: unsupported device message type %d", __func__,
+                   msg->type);
+               return (1);
+       }
+
+       return (0);
+}
+
+/*
+ * Called by the VM process while processing IO from the VCPU thread.
+ *
+ * N.b. Since the VCPU thread calls this function, we cannot mutate the event
+ * system. All ipc messages must be sent manually and cannot be queued for
+ * the event loop to push them. (We need to perform a synchronous read, so
+ * this isn't really a big deal.)
+ */
+int
+virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
+    void *cookie, uint8_t sz)
+{
+       struct virtio_dev *dev = (struct virtio_dev *)cookie;
+       struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
+       struct imsg imsg;
+       struct viodev_msg msg;
+       ssize_t n;
+       int ret = 0;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.reg = reg;
+       msg.io_sz = sz;
+
+       if (dir == 0) {
+               msg.type = VIODEV_MSG_IO_WRITE;
+               msg.data = *data;
+               msg.data_valid = 1;
+       } else
+               msg.type = VIODEV_MSG_IO_READ;
+
+       if (msg.type == VIODEV_MSG_IO_WRITE) {
+               /*
+                * Write request. No reply expected.
+                */
+               ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+                   sizeof(msg));
+               if (ret == -1) {
+                       log_warn("%s: failed to send async io event to"
+                           " device", __func__);
+                       return (ret);
+               }
+               if (imsg_flush(ibuf) == -1) {
+                       log_warnx("%s: imsg_flush (write)", __func__);
+                       return (-1);
+               }
+       } else {
+               /*
+                * Read request. Requires waiting for a reply.
+                */
+               ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
+                   sizeof(msg));
+               if (ret == -1) {
+                       log_warnx("%s: failed to send sync io event to"
+                           " device", __func__);
+                       return (ret);
+               }
+               if (imsg_flush(ibuf) == -1) {
+                       log_warnx("%s: imsg_flush (read)", __func__);
+                       return (-1);
+               }
+
+               /* Read our reply. */
+               do
+                       n = imsg_read(ibuf);
+               while (n == -1 && errno == EAGAIN);
+               if (n == 0 || n == -1) {
+                       log_warn("%s: imsg_read (n=%ld)", __func__, n);
+                       return (-1);
+               }
+               if ((n = imsg_get(ibuf, &imsg)) == -1) {
+                       log_warn("%s: imsg_get (n=%ld)", __func__, n);
+                       return (-1);
+               }
+               if (n == 0) {
+                       log_warnx("%s: invalid imsg", __func__);
+                       return (-1);
+               }
+
+               IMSG_SIZE_CHECK(&imsg, &msg);
+               memcpy(&msg, imsg.data, sizeof(msg));
+               imsg_free(&imsg);
+
+               if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
+                       log_debug("%s: got sync read response (reg=%s)",
+                           __func__, virtio_reg_name(msg.reg));
+                       *data = msg.data;
+                       /*
+                        * It's possible we're asked to {de,}assert after the
+                        * device performs a register read.
+                        */
+                       if (msg.state == INTR_STATE_ASSERT)
+                               vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
+                       else if (msg.state == INTR_STATE_DEASSERT)
+                               vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
+               } else {
+                       log_warnx("%s: expected IO_READ, got %d", __func__,
+                           msg.type);
+                       return (-1);
+               }
+       }
+
+       return (0);
+}
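On the other end, the device child is expected to answer a
VIODEV_MSG_IO_READ with the same viodev_msg carrying data, data_valid set,
and optionally a piggybacked interrupt-state change, which the read path
above consumes. A sketch of such a reply (the helper is hypothetical; the
real handlers are in vioblk.c/vionet.c, not shown here):

	/* Hypothetical device-side reply to VIODEV_MSG_IO_READ. */
	static void
	reply_io_read(struct imsgbuf *ibuf, struct viodev_msg *req,
	    uint32_t val, int8_t intr_state)
	{
		struct viodev_msg reply = *req;

		reply.type = VIODEV_MSG_IO_READ;
		reply.data = val;
		reply.data_valid = 1;
		reply.state = intr_state;	/* INTR_STATE_NOOP if unchanged */

		if (imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &reply,
		    sizeof(reply)) != -1)
			imsg_flush(ibuf);
	}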
+
+void
+virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu)
+{
+       struct viodev_msg msg;
+       int ret;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.irq = dev->irq;
+       msg.vcpu = vcpu;
+       msg.type = VIODEV_MSG_KICK;
+       msg.state = INTR_STATE_ASSERT;
+
+       ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
+           &msg, sizeof(msg));
+       if (ret == -1)
+               log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
+}
+
+void
+virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu)
+{
+       struct viodev_msg msg;
+       int ret;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.irq = dev->irq;
+       msg.vcpu = vcpu;
+       msg.type = VIODEV_MSG_KICK;
+       msg.state = INTR_STATE_DEASSERT;
+
+       ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
+           &msg, sizeof(msg));
+       if (ret == -1)
+               log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
 }
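These two helpers run in the device child; handle_dev_msg() above is their
peer in the vm process, turning each VIODEV_MSG_KICK into the actual
vcpu_assert_pic_irq()/vcpu_deassert_pic_irq() call. A typical call site in a
device after completing queue work might look like this (assumed usage; the
real callers are in vioblk.c/vionet.c):

	/* Assumed device-side usage after placing used descriptors. */
	if (processed > 0)
		virtio_assert_pic_irq(dev, 0);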
index f090ed5..285c116 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: virtio.h,v 1.44 2023/04/25 12:46:13 dv Exp $  */
+/*     $OpenBSD: virtio.h,v 1.45 2023/04/27 22:47:27 dv Exp $  */
 
 /*
  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
  */
 #define VIRTIO_MAX_QUEUES      3
 
+#define MAXPHYS        (64 * 1024)     /* max raw I/O transfer size */
+
 /*
  * Rename the address config register to be more descriptive.
  */
 #define VIRTIO_CONFIG_QUEUE_PFN        VIRTIO_CONFIG_QUEUE_ADDRESS
 
+/*
+ * VM <-> Device messaging.
+ */
+struct viodev_msg {
+       uint8_t type;
+#define VIODEV_MSG_INVALID     0
+#define VIODEV_MSG_READY       1
+#define VIODEV_MSG_ERROR       2
+#define VIODEV_MSG_KICK                3
+#define VIODEV_MSG_IO_READ     4
+#define VIODEV_MSG_IO_WRITE    5
+#define VIODEV_MSG_DUMP                6
+#define VIODEV_MSG_SHUTDOWN    7
+
+       uint16_t reg;           /* VirtIO register */
+       uint8_t io_sz;          /* IO instruction size */
+       uint8_t vcpu;           /* VCPU id */
+       uint8_t irq;            /* IRQ number */
+
+       int8_t state;           /* Interrupt state toggle (if any) */
+#define INTR_STATE_ASSERT       1
+#define INTR_STATE_NOOP                 0
+#define INTR_STATE_DEASSERT    -1
+
+       uint32_t data;          /* Data (if any) */
+       uint8_t data_valid;     /* 1 if data field is populated. */
+} __packed;
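viodev_msg is a fixed-layout wire format shared by both ends of the channel,
so a compile-time size guard is cheap insurance; summing the fields above
gives 12 bytes once packed. A sketch of such a guard (not in the commit):

	/* Sketch: keep both channel ends honest about the layout. */
	_Static_assert(sizeof(struct viodev_msg) == 12,
	    "viodev_msg wire format changed");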
+
 /*
  * This struct stores notifications from a virtio driver. There is
  * one such struct per virtio device.
@@ -177,16 +207,15 @@ struct viornd_dev {
 
 struct vioblk_dev {
        struct virtio_io_cfg cfg;
-
        struct virtio_vq_info vq[VIRTIO_MAX_QUEUES];
        struct virtio_backing file;
 
-       uint64_t sz;
+       int disk_fd[VM_MAX_BASE_PER_DISK];      /* fds for disk image(s) */
+       uint8_t ndisk_fd;       /* number of valid disk fds */
+       uint64_t sz;            /* size in 512 byte sectors */
        uint32_t max_xfer;
 
-       uint8_t pci_id;
-       int irq;
-       uint32_t vm_id;
+       unsigned int idx;
 };
 
 /* vioscsi will use at least 3 queues - 5.6.2 Virtqueues
@@ -218,26 +247,40 @@ struct vioscsi_dev {
 };
 
 struct vionet_dev {
-       pthread_mutex_t mutex;
-       struct event event;
-
        struct virtio_io_cfg cfg;
-
        struct virtio_vq_info vq[VIRTIO_MAX_QUEUES];
 
-       int fd;
-       uint32_t vm_id;
-       uint32_t vm_vmid;
-       int irq;
+       int data_fd;            /* fd for our tap device */
+
        uint8_t mac[6];
        uint8_t hostmac[6];
-
-       int idx;
        int lockedmac;
        int local;
        int pxeboot;
 
+       unsigned int idx;
+};
+
+struct virtio_dev {
+       union {
+               struct vioblk_dev vioblk;
+               struct vionet_dev vionet;
+       };
+
+       struct imsgev async_iev;
+       struct imsgev sync_iev;
+
+       int sync_fd;            /* fd for synchronous channel */
+       int async_fd;           /* fd for async channel */
+
        uint8_t pci_id;
+       uint32_t vm_id;
+       uint32_t vm_vmid;
+       int irq;
+
+       pid_t dev_pid;
+       char dev_type;
+       SLIST_ENTRY(virtio_dev) dev_next;
 };
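dev_type discriminates the anonymous union, so any code shared between
device types has to check it before touching the vioblk/vionet members, as
the launch and dump paths in virtio.c do. A hypothetical accessor in that
style:

	/* Hypothetical helper; not part of the commit. */
	static unsigned int
	virtio_dev_idx(struct virtio_dev *dev)
	{
		switch (dev->dev_type) {
		case VMD_DEVTYPE_NET:
			return (dev->vionet.idx);
		case VMD_DEVTYPE_DISK:
			return (dev->vioblk.idx);
		default:
			fatalx("%s: invalid device type '%c'", __func__,
			    dev->dev_type);
		}
	}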
 
 struct virtio_net_hdr {
@@ -290,7 +333,12 @@ void virtio_shutdown(struct vmd_vm *);
 int virtio_dump(int);
 int virtio_restore(int, struct vmd_vm *, int, int[][VM_MAX_BASE_PER_DISK],
     int *);
+const char *virtio_reg_name(uint8_t);
 uint32_t vring_size(uint32_t);
+int vm_device_pipe(struct virtio_dev *, void (*)(int, short, void *));
+int virtio_pci_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
+void virtio_assert_pic_irq(struct virtio_dev *, int);
+void virtio_deassert_pic_irq(struct virtio_dev *, int);
 
 int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int viornd_dump(int);
@@ -305,21 +353,19 @@ int virtio_qcow2_init(struct virtio_backing *, off_t *, int*, size_t);
 int virtio_raw_create(const char *, uint64_t);
 int virtio_raw_init(struct virtio_backing *, off_t *, int*, size_t);
 
-int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int vioblk_dump(int);
 int vioblk_restore(int, struct vmd_vm *, int[][VM_MAX_BASE_PER_DISK]);
 void vioblk_update_qs(struct vioblk_dev *);
 void vioblk_update_qa(struct vioblk_dev *);
 int vioblk_notifyq(struct vioblk_dev *);
 
-int virtio_net_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int vionet_dump(int);
 int vionet_restore(int, struct vmd_vm *, int *);
 void vionet_update_qs(struct vionet_dev *);
 void vionet_update_qa(struct vionet_dev *);
-int vionet_notifyq(struct vionet_dev *);
-void vionet_notify_rx(struct vionet_dev *);
-int vionet_notify_tx(struct vionet_dev *);
+int vionet_notifyq(struct virtio_dev *);
+void vionet_notify_rx(struct virtio_dev *);
+int vionet_notify_tx(struct virtio_dev *);
 void vionet_process_rx(uint32_t);
 int vionet_enq_rx(struct vionet_dev *, char *, size_t, int *);
 void vionet_set_hostmac(struct vmd_vm *, unsigned int, uint8_t *);
@@ -336,7 +382,7 @@ int vioscsi_dump(int);
 int vioscsi_restore(int, struct vmd_vm *, int);
 
 /* dhcp.c */
-ssize_t dhcp_request(struct vionet_dev *, char *, size_t, char **);
+ssize_t dhcp_request(struct virtio_dev *, char *, size_t, char **);
 
 /* vioscsi.c */
 int vioscsi_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
index 319cee8..39f5544 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: vm.c,v 1.86 2023/04/25 12:46:13 dv Exp $      */
+/*     $OpenBSD: vm.c,v 1.87 2023/04/27 22:47:27 dv Exp $      */
 
 /*
  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -81,8 +81,8 @@ void *vcpu_run_loop(void *);
 int vcpu_exit(struct vm_run_params *);
 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
 void create_memory_map(struct vm_create_params *);
-int alloc_guest_mem(struct vm_create_params *);
 static int vmm_create_vm(struct vmd_vm *);
+int alloc_guest_mem(struct vmd_vm *);
 void init_emulated_hw(struct vmop_create_params *, int,
     int[][VM_MAX_BASE_PER_DISK], int *);
 void restore_emulated_hw(struct vm_create_params *, int, int *,
@@ -230,8 +230,8 @@ vm_main(int fd)
        /*
         * We aren't root, so we can't chroot(2). Use unveil(2) instead.
         */
-       if (unveil("/var/empty", "") == -1)
-               fatal("unveil /var/empty");
+       if (unveil(env->argv0, "x") == -1)
+               fatal("unveil %s", env->argv0);
        if (unveil(NULL, NULL) == -1)
                fatal("unveil lock");
 
@@ -239,10 +239,11 @@ vm_main(int fd)
         * pledge in the vm processes:
         * stdio - for malloc and basic I/O including events.
         * vmm - for the vmm ioctls and operations.
+        * proc exec - fork/exec for launching devices.
         * recvfd - for vm send/recv and sending fd to devices.
-        * proc - required for vmm(4) VMM_IOC_CREATE ioctl
+        * tmppath/rpath - for shm_mkstemp, ftruncate, unlink
         */
-       if (pledge("stdio vmm recvfd proc", NULL) == -1)
+       if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1)
                fatal("pledge");
 
        /* Receive our vm configuration. */
@@ -372,7 +373,7 @@ start_vm(struct vmd_vm *vm, int fd)
        if (!(vm->vm_state & VM_STATE_RECEIVED))
                create_memory_map(vcp);
 
-       ret = alloc_guest_mem(&vm->vm_params.vmc_params);
+       ret = alloc_guest_mem(vm);
        if (ret) {
                struct rlimit lim;
                char buf[FMT_SCALED_STRSIZE];
@@ -395,10 +396,6 @@ start_vm(struct vmd_vm *vm, int fd)
                return (ret);
        }
 
-       /* Tighten pledge now that we've called VMM_IOC_CREATE ioctl. */
-       if (pledge("stdio vmm recvfd", NULL) == -1)
-               fatal("pledge");
-
        /*
         * Some of vmd currently relies on global state (current_vm, con_fd).
         */
@@ -487,15 +484,19 @@ start_vm(struct vmd_vm *vm, int fd)
                nicfds[i] = vm->vm_ifs[i].vif_fd;
 
        if (vm->vm_state & VM_STATE_RECEIVED) {
+               restore_mem(vm->vm_receive_fd, vcp);
                restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
                    vm->vm_disks, vm->vm_cdrom);
-               restore_mem(vm->vm_receive_fd, vcp);
                if (restore_vm_params(vm->vm_receive_fd, vcp))
                        fatal("restore vm params failed");
                unpause_vm(vm);
        } else
                init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);
 
+       /* Drop privileges further before starting the vcpu run loop(s). */
+       if (pledge("stdio vmm recvfd", NULL) == -1)
+               fatal("pledge");
+
        /*
         * Execute the vcpu run loop(s) for this VM.
         */
@@ -653,7 +654,7 @@ send_vm(int fd, struct vmd_vm *vm)
        size_t                     sz;
 
        if (dump_send_header(fd)) {
-               log_info("%s: failed to send vm dump header", __func__);
+               log_warnx("%s: failed to send vm dump header", __func__);
                goto err;
        }
 
@@ -697,6 +698,9 @@ send_vm(int fd, struct vmd_vm *vm)
                }
        }
 
+       /* Dump memory before devices to aid in restoration. */
+       if ((ret = dump_mem(fd, vm)))
+               goto err;
        if ((ret = i8253_dump(fd)))
                goto err;
        if ((ret = i8259_dump(fd)))
@@ -711,8 +715,6 @@ send_vm(int fd, struct vmd_vm *vm)
                goto err;
        if ((ret = virtio_dump(fd)))
                goto err;
-       if ((ret = dump_mem(fd, vm)))
-               goto err;
 
        for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
                vpp.vpp_vcpu_id = i;
@@ -1086,31 +1088,67 @@ create_memory_map(struct vm_create_params *vcp)
  *  !0: failure - errno indicating the source of the failure
  */
 int
-alloc_guest_mem(struct vm_create_params *vcp)
+alloc_guest_mem(struct vmd_vm *vm)
 {
        void *p;
-       int ret;
+       char *tmp;
+       int fd, ret = 0;
        size_t i, j;
+       struct vm_create_params *vcp = &vm->vm_params.vmc_params;
        struct vm_mem_range *vmr;
 
+       tmp = calloc(32, sizeof(char));
+       if (tmp == NULL) {
+               ret = errno;
+               log_warn("%s: calloc", __func__);
+               return (ret);
+       }
+       strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
+
+       vm->vm_nmemfds = vcp->vcp_nmemranges;
+
        for (i = 0; i < vcp->vcp_nmemranges; i++) {
                vmr = &vcp->vcp_memranges[i];
+
+               fd = shm_mkstemp(tmp);
+               if (fd < 0) {
+                       ret = errno;
+                       log_warn("%s: shm_mkstemp", __func__);
+                       goto out;
+               }
+               if (ftruncate(fd, vmr->vmr_size) == -1) {
+                       ret = errno;
+                       log_warn("%s: ftruncate", __func__);
+                       goto out;
+               }
+               if (fcntl(fd, F_SETFD, 0) == -1) {
+                       ret = errno;
+                       log_warn("%s: fcntl", __func__);
+                       goto out;
+               }
+               if (shm_unlink(tmp) == -1) {
+                       ret = errno;
+                       log_warn("%s: shm_unlink", __func__);
+                       goto out;
+               }
+               strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
+
                p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
-                   MAP_PRIVATE | MAP_ANON, -1, 0);
+                   MAP_SHARED | MAP_CONCEAL, fd, 0);
                if (p == MAP_FAILED) {
                        ret = errno;
                        for (j = 0; j < i; j++) {
                                vmr = &vcp->vcp_memranges[j];
                                munmap((void *)vmr->vmr_va, vmr->vmr_size);
                        }
-
-                       return (ret);
+                       goto out;
                }
-
+               vm->vm_memfds[i] = fd;
                vmr->vmr_va = (vaddr_t)p;
        }
-
-       return (0);
+out:
+       free(tmp);
+       return (ret);
 }
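Guest memory now comes from shm_mkstemp(3)-backed, immediately unlinked
shared objects instead of anonymous private mappings, which is what lets the
backing fds be handed to device children later. The pattern, reduced to a
standalone sketch (names are illustrative):

	#include <sys/mman.h>
	#include <fcntl.h>
	#include <unistd.h>

	/* Sketch: one shared, unlinked memory object per guest range. */
	void *
	alloc_shared_range(size_t len, int *out_fd)
	{
		char tmpl[] = "/tmp/vmd.XXXXXXXXXX";
		void *p;
		int fd;

		if ((fd = shm_mkstemp(tmpl)) == -1)
			return (NULL);
		if (ftruncate(fd, len) == -1 || shm_unlink(tmpl) == -1 ||
		    fcntl(fd, F_SETFD, 0) == -1) {
			close(fd);
			return (NULL);
		}
		/* MAP_SHARED: children mapping this fd see the same pages. */
		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_CONCEAL, fd, 0);
		if (p == MAP_FAILED) {
			close(fd);
			return (NULL);
		}
		*out_fd = fd;
		return (p);
	}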
 
 /*
@@ -2499,3 +2537,60 @@ vm_pipe_recv(struct vm_dev_pipe *p)
 
        return msg;
 }
+
+/*
+ * Re-map the guest address space using the shared memory file descriptor.
+ *
+ * Returns 0 on success, non-zero in event of failure.
+ */
+int
+remap_guest_mem(struct vmd_vm *vm)
+{
+       struct vm_create_params *vcp;
+       struct vm_mem_range     *vmr;
+       size_t                   i, j;
+       void                    *p = NULL;
+       int                      ret;
+
+       if (vm == NULL)
+               return (1);
+
+       vcp = &vm->vm_params.vmc_params;
+
+       /*
+        * We've execve'd, so we need to re-map the guest VM memory. Iterate
+        * over all possible vm_mem_range entries so we can initialize all
+        * file descriptors to a value.
+        */
+       for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
+               if (i < vcp->vcp_nmemranges) {
+                       vmr = &vcp->vcp_memranges[i];
+                       /* Skip ranges we know we don't need right now. */
+                       if (vmr->vmr_type == VM_MEM_MMIO) {
+                               log_debug("%s: skipping range i=%zu, type=%d",
+                                   __func__, i, vmr->vmr_type);
+                               vm->vm_memfds[i] = -1;
+                               continue;
+                       }
+                       /* Re-mmap the memrange. */
+                       p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
+                           MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0);
+                       if (p == MAP_FAILED) {
+                               ret = errno;
+                               log_warn("%s: mmap", __func__);
+                               for (j = 0; j < i; j++) {
+                                       vmr = &vcp->vcp_memranges[j];
+                                       munmap((void *)vmr->vmr_va,
+                                           vmr->vmr_size);
+                               }
+                               return (ret);
+                       }
+                       vmr->vmr_va = (vaddr_t)p;
+               } else {
+                       /* Initialize with an invalid fd. */
+                       vm->vm_memfds[i] = -1;
+               }
+       }
+
+       return (0);
+}
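remap_guest_mem() assumes vm_memfds[] already holds the shared-memory fds in
the re-exec'd process; descriptors cross process boundaries through the imsg
channel's fd-passing (SCM_RIGHTS under the hood), where imsg takes ownership
of the fd it is given. A sketch of how a sender might hand them over (the
message type is made up; the actual transfer in vm.c is outside this
excerpt):

	/* Sketch: send each guest memory fd to a device child. */
	size_t i;

	for (i = 0; i < vm->vm_nmemfds; i++) {
		int fd = dup(vm->vm_memfds[i]);	/* imsg closes its copy */

		if (fd == -1 || imsg_compose(ibuf, IMSG_HYPOTHETICAL_MEMFD,
		    0, 0, fd, &i, sizeof(i)) == -1)
			fatal("%s: send memfd", __func__);
	}
	imsg_flush(ibuf);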
index 1246c91..a9124f9 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: vmd.c,v 1.144 2023/04/25 12:46:13 dv Exp $    */
+/*     $OpenBSD: vmd.c,v 1.145 2023/04/27 22:47:27 dv Exp $    */
 
 /*
  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
@@ -774,13 +774,14 @@ main(int argc, char **argv)
        int                      proc_instance = 0, vm_launch = 0, vm_fd = -1;
        const char              *errp, *title = NULL;
        int                      argc0 = argc;
+       char                     dev_type = '\0';
 
        log_init(0, LOG_DAEMON);
 
        if ((env = calloc(1, sizeof(*env))) == NULL)
                fatal("calloc: env");
 
-       while ((ch = getopt(argc, argv, "D:P:I:V:df:vn")) != -1) {
+       while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) {
                switch (ch) {
                case 'D':
                        if (cmdline_symset(optarg) < 0)
@@ -812,13 +813,28 @@ main(int argc, char **argv)
                        if (errp)
                                fatalx("invalid process instance");
                        break;
-               /* child vm fork/exec */
+               /* child vm and device fork/exec */
                case 'V':
                        vm_launch = VMD_LAUNCH_VM;
                        vm_fd = strtonum(optarg, 0, 128, &errp);
                        if (errp)
                                fatalx("invalid vm fd");
                        break;
+               case 'X':
+                       vm_launch = VMD_LAUNCH_DEV;
+                       vm_fd = strtonum(optarg, 0, 128, &errp);
+                       if (errp)
+                               fatalx("invalid device fd");
+                       break;
+               case 't':
+                       dev_type = *optarg;
+                       switch (dev_type) {
+                       case VMD_DEVTYPE_NET:
+                       case VMD_DEVTYPE_DISK:
+                               break;
+                       default:
+                               fatalx("invalid device type");
+                       }
+                       break;
                default:
                        usage();
                }
@@ -865,6 +881,15 @@ main(int argc, char **argv)
        if (vm_launch == VMD_LAUNCH_VM) {
                vm_main(vm_fd);
                /* NOTREACHED */
+       } else if (vm_launch == VMD_LAUNCH_DEV) {
+               if (dev_type == VMD_DEVTYPE_NET) {
+                       vionet_main(vm_fd);
+                       /* NOTREACHED */
+               } else if (dev_type == VMD_DEVTYPE_DISK) {
+                       vioblk_main(vm_fd);
+                       /* NOTREACHED */
+               }
+               fatalx("unsupported device type '%c'", dev_type);
        }
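For reference, virtio_dev_launch() in virtio.c builds the child's command
line from these flags, so a verbose vionet child re-execs itself roughly as:

	vmd -X 3 -t n -n -v

with 3 standing in for whatever number the child's end of the synchronous
socketpair landed on.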
 
        /* Open /dev/vmm early. */
index ffa7c01..cb77fda 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: vmd.h,v 1.119 2023/04/26 16:12:21 mlarkin Exp $       */
+/*     $OpenBSD: vmd.h,v 1.120 2023/04/27 22:47:27 dv Exp $    */
 
 /*
  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 
 /* Launch mode identifiers for when a vm fork+exec's. */
 #define VMD_LAUNCH_VM          1
+#define VMD_LAUNCH_DEV         2
+
+#define VMD_DEVTYPE_NET                'n'
+#define VMD_DEVTYPE_DISK       'd'
 
 /* Rate-limit fast reboots */
 #define VM_START_RATE_SEC      6       /* min. seconds since last reboot */
@@ -137,7 +141,10 @@ enum imsg_type {
        IMSG_VMDOP_VM_SHUTDOWN,
        IMSG_VMDOP_VM_REBOOT,
        IMSG_VMDOP_CONFIG,
-       IMSG_VMDOP_DONE
+       IMSG_VMDOP_DONE,
+       /* Device Operation Messages */
+       IMSG_DEVOP_HOSTMAC,
+       IMSG_DEVOP_MSG,
 };
 
 struct vmop_result {
@@ -319,6 +326,9 @@ struct vmd_vm {
        struct timeval           vm_start_tv;
        int                      vm_start_limit;
 
+       int                      vm_memfds[VMM_MAX_MEM_RANGES];
+       size_t                   vm_nmemfds;
+
        TAILQ_ENTRY(vmd_vm)      vm_entry;
 };
 TAILQ_HEAD(vmlist, vmd_vm);
@@ -486,6 +496,7 @@ void         vm_pipe_send(struct vm_dev_pipe *, enum pipe_msg_type);
 enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *);
 int     write_mem(paddr_t, const void *buf, size_t);
 void*   hvaddr_mem(paddr_t, size_t);
+int     remap_guest_mem(struct vmd_vm *);
 
 /* config.c */
 int     config_init(struct vmd *);
@@ -512,4 +523,10 @@ int         host(const char *, struct address *);
 /* virtio.c */
 int     virtio_get_base(int, char *, size_t, int, const char *);
 
+/* vionet.c */
+__dead void vionet_main(int);
+
+/* vioblk.c */
+__dead void vioblk_main(int);
+
 #endif /* VMD_H */
index df4ae5a..3511967 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: vmm.c,v 1.110 2023/04/25 12:46:13 dv Exp $    */
+/*     $OpenBSD: vmm.c,v 1.111 2023/04/27 22:47:27 dv Exp $    */
 
 /*
  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
 {
        struct vm_create_params *vcp;
        struct vmd_vm           *vm;
-       char                    *nargv[5], num[32];
+       char                    *nargv[6], num[32];
        int                      fd, ret = EINVAL;
        int                      fds[2];
        pid_t                    vm_pid;
@@ -770,6 +770,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
                 * Prepare our new argv for execvp(2) with the fd of our open
                 * pipe to the parent/vmm process as an argument.
                 */
+               memset(&nargv, 0, sizeof(nargv));
                memset(num, 0, sizeof(num));
                snprintf(num, sizeof(num), "%d", fds[1]);
 
@@ -777,7 +778,12 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
                nargv[1] = "-V";
                nargv[2] = num;
                nargv[3] = "-n";
-               nargv[4] = NULL;
+
+               if (env->vmd_verbose) {
+                       nargv[4] = "-v";
+                       nargv[5] = NULL;
+               } else
+                       nargv[4] = NULL;
 
                /* Control resumes in vmd main(). */
                execvp(nargv[0], nargv);