From 3c817da7c9b8ff92fe5ef180a7138775dcfc6671 Mon Sep 17 00:00:00 2001 From: dv Date: Sat, 13 May 2023 23:15:28 +0000 Subject: [PATCH] vmm(4)/vmd(8): switch to anonymous shared mappings. While splitting out emulated virtio network and block devices into separate processes, I originally used named mappings via shm_mkstemp(3). While this functionally achieved the desired result, it had two unintended consequences: 1) tearing down a vm process and its child processes required excessive locking as the guest memory was tied into the VFS layer. 2) it was observed by mlarkin@ that actions in other parts of the VFS layer could cause some of the guest memory to flush to storage, possibly filling /tmp. This commit adds a new vmm(4) ioctl dedicated to allowing a process to request that the kernel share a mapping of guest memory into its own vm space. This requires an open fd to /dev/vmm (requiring root) and both the "vmm" and "proc" pledge(2) promises. In addition, the caller must know enough about the original memory ranges to reconstruct them to make the vm's ranges. Tested with help from Mischa Peters. 
ok mlarkin@ --- regress/sys/arch/amd64/vmm/vcpu.c | 81 ++++++++++++++++--- sys/dev/vmm/vmm.c | 85 +++++++++++++++++++- sys/dev/vmm/vmm.h | 12 ++- usr.sbin/vmd/vioblk.c | 25 ++++-- usr.sbin/vmd/vionet.c | 29 +++++-- usr.sbin/vmd/virtio.c | 16 ++-- usr.sbin/vmd/vm.c | 126 ++++++++++++++---------------- usr.sbin/vmd/vmd.c | 20 +++-- usr.sbin/vmd/vmd.h | 13 ++- usr.sbin/vmd/vmm.c | 24 +++--- 10 files changed, 299 insertions(+), 132 deletions(-) diff --git a/regress/sys/arch/amd64/vmm/vcpu.c b/regress/sys/arch/amd64/vmm/vcpu.c index f221b58f75c..84bd9492a01 100644 --- a/regress/sys/arch/amd64/vmm/vcpu.c +++ b/regress/sys/arch/amd64/vmm/vcpu.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vcpu.c,v 1.5 2023/04/27 05:42:44 anton Exp $ */ +/* $OpenBSD: vcpu.c,v 1.6 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2022 Dave Voutila @@ -83,6 +83,7 @@ main(int argc, char **argv) struct vm_resetcpu_params vresetp; struct vm_run_params vrunp; struct vm_terminate_params vtp; + struct vm_sharemem_params vsp; struct vm_mem_range *vmr; int fd, ret = 1; @@ -127,8 +128,9 @@ main(int argc, char **argv) ((uint8_t*)p)[j + 1] = PCKBC_AUX; } vmr->vmr_va = (vaddr_t)p; - printf("mapped region %zu: { gpa: 0x%08lx, size: %lu }\n", - i, vmr->vmr_gpa, vmr->vmr_size); + printf("created mapped region %zu: { gpa: 0x%08lx, size: %lu," + " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size, + vmr->vmr_va); } if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1) @@ -136,7 +138,54 @@ main(int argc, char **argv) printf("created vm %d named \"%s\"\n", vcp.vcp_id, vcp.vcp_name); /* - * 2. Check that our VM exists. + * 2. Check we can create shared memory mappings. + */ + memset(&vsp, 0, sizeof(vsp)); + vsp.vsp_nmemranges = vcp.vcp_nmemranges; + memcpy(&vsp.vsp_memranges, &vcp.vcp_memranges, + sizeof(vsp.vsp_memranges)); + vsp.vsp_vm_id = vcp.vcp_id; + + /* Find some new va ranges... 
*/ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (p == MAP_FAILED) + err(1, "mmap"); + vmr->vmr_va = (vaddr_t)p; + } + + /* Release our mappings so vmm can replace them. */ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + munmap((void*)vmr->vmr_va, vmr->vmr_size); + } + + /* Perform the shared mapping. */ + if (ioctl(fd, VMM_IOC_SHAREMEM, &vsp) == -1) + err(1, "VMM_IOC_SHAREMEM"); + printf("created shared memory mappings\n"); + + /* We should see our reset vector instructions in the new mappings. */ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + p = (void*)vmr->vmr_va; + + for (j = 0; j < vmr->vmr_size; j += 2) { + if (((uint8_t*)p)[j + 0] != 0xE4) + errx(1, "bad byte"); + if (((uint8_t*)p)[j + 1] != PCKBC_AUX) + errx(1, "bad byte"); + } + printf("checked shared region %zu: { gpa: 0x%08lx, size: %lu," + " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size, + vmr->vmr_va); + } + printf("validated shared memory mappings\n"); + + /* + * 3. Check that our VM exists. */ memset(&vip, 0, sizeof(vip)); vip.vip_size = 0; @@ -189,7 +238,7 @@ main(int argc, char **argv) ours = NULL; /* - * 3. Reset our VCPU and initialize register state. + * 4. Reset our VCPU and initialize register state. */ memset(&vresetp, 0, sizeof(vresetp)); vresetp.vrp_vm_id = vcp.vcp_id; @@ -205,7 +254,7 @@ main(int argc, char **argv) vresetp.vrp_vm_id); /* - * 4. Run the vcpu, expecting an immediate exit for IO assist. + * 5. Run the vcpu, expecting an immediate exit for IO assist. */ exit = malloc(sizeof(*exit)); if (exit == NULL) { @@ -258,7 +307,7 @@ main(int argc, char **argv) out: /* - * 5. Terminate our VM and clean up. + * 6. Terminate our VM and clean up. 
*/ memset(&vtp, 0, sizeof(vtp)); vtp.vtp_vm_id = vcp.vcp_id; @@ -277,12 +326,22 @@ out: vmr = &vcp.vcp_memranges[i]; if (vmr->vmr_va) { if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) { - warn("failed to unmap region %zu at 0x%08lx", - i, vmr->vmr_va); + warn("failed to unmap orginal region %zu @ hva " + "0x%lx", i, vmr->vmr_va); + ret = 1; + } else + printf("unmapped origin region %zu @ hva " + "0x%lx\n", i, vmr->vmr_va); + } + vmr = &vsp.vsp_memranges[i]; + if (vmr->vmr_va) { + if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) { + warn("failed to unmap shared region %zu @ hva " + "0x%lx", i, vmr->vmr_va); ret = 1; } else - printf("unmapped region %zu @ gpa 0x%08lx\n", - i, vmr->vmr_gpa); + printf("unmapped shared region %zu @ hva " + "0x%lx\n", i, vmr->vmr_va); } } diff --git a/sys/dev/vmm/vmm.c b/sys/dev/vmm/vmm.c index d46b3431081..4d4866f70dc 100644 --- a/sys/dev/vmm/vmm.c +++ b/sys/dev/vmm/vmm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.c,v 1.1 2023/04/26 15:11:21 mlarkin Exp $ */ +/* $OpenBSD: vmm.c,v 1.2 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2014-2023 Mike Larkin * @@ -262,6 +262,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) case VMM_IOC_WRITEVMPARAMS: ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 1); break; + case VMM_IOC_SHAREMEM: + ret = vm_share_mem((struct vm_sharemem_params *)data, p); + break; default: ret = vmmioctl_machdep(dev, cmd, data, flag, p); break; @@ -286,6 +289,7 @@ pledge_ioctl_vmm(struct proc *p, long com) switch (com) { case VMM_IOC_CREATE: case VMM_IOC_INFO: + case VMM_IOC_SHAREMEM: /* The "parent" process in vmd forks and manages VMs */ if (p->p_p->ps_pledge & PLEDGE_PROC) return (0); @@ -780,3 +784,82 @@ vcpu_must_stop(struct vcpu *vcpu) return (1); return (0); } + +/* + * vm_share_mem + * + * Share a uvm mapping for the vm guest memory ranges into the calling process. 
+ * + * Return values: + * 0: if successful + * ENOENT: if the vm cannot be found by vm_find + * EPERM: if the vm cannot be accessed by the current process + * EINVAL: if the provide memory ranges fail checks + * ENOMEM: if uvm_share fails to find available memory in the destination map + */ +int +vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p) +{ + int ret = EINVAL; + size_t i, n; + struct vm *vm; + struct vm_mem_range *src, *dst; + + ret = vm_find(vsp->vsp_vm_id, &vm); + if (ret) + return (ret); + + /* Check we have the expected number of ranges. */ + if (vm->vm_nmemranges != vsp->vsp_nmemranges) + goto out; + n = vm->vm_nmemranges; + + /* Check their types, sizes, and gpa's (implying page alignment). */ + for (i = 0; i < n; i++) { + src = &vm->vm_memranges[i]; + dst = &vsp->vsp_memranges[i]; + + /* + * The vm memranges were already checked during creation, so + * compare to them to confirm validity of mapping request. + */ + if (src->vmr_type != dst->vmr_type) + goto out; + if (src->vmr_gpa != dst->vmr_gpa) + goto out; + if (src->vmr_size != dst->vmr_size) + goto out; + + /* Check our intended destination is page-aligned. */ + if (dst->vmr_va & PAGE_MASK) + goto out; + } + + /* + * Share each range individually with the calling process. We do + * not need PROC_EXEC as the emulated devices do not need to execute + * instructions from guest memory. + */ + for (i = 0; i < n; i++) { + src = &vm->vm_memranges[i]; + dst = &vsp->vsp_memranges[i]; + + /* Skip MMIO range. 
*/ + if (src->vmr_type == VM_MEM_MMIO) + continue; + + DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n", + src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va); + ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va, + PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa, + src->vmr_size); + if (ret) { + printf("%s: uvm_share failed (%d)\n", __func__, ret); + break; + } + } + ret = 0; +out: + refcnt_rele_wake(&vm->vm_refcnt); + return (ret); +} diff --git a/sys/dev/vmm/vmm.h b/sys/dev/vmm/vmm.h index d2355d42b44..38b4a3f85f1 100644 --- a/sys/dev/vmm/vmm.h +++ b/sys/dev/vmm/vmm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.h,v 1.2 2023/04/26 16:13:19 mlarkin Exp $ */ +/* $OpenBSD: vmm.h,v 1.3 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2014-2023 Mike Larkin * @@ -76,6 +76,13 @@ struct vm_resetcpu_params { struct vcpu_reg_state vrp_init_state; }; +struct vm_sharemem_params { + /* Input parameters to VMM_IOC_SHAREMEM */ + uint32_t vsp_vm_id; + size_t vsp_nmemranges; + struct vm_mem_range vsp_memranges[VMM_MAX_MEM_RANGES]; +}; + /* IOCTL definitions */ #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */ #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */ @@ -88,7 +95,7 @@ struct vm_resetcpu_params { #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params) /* Set VM params */ #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params) - +#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params) #ifdef _KERNEL @@ -194,6 +201,7 @@ int vm_get_info(struct vm_info_params *); int vm_terminate(struct vm_terminate_params *); int vm_resetcpu(struct vm_resetcpu_params *); int vcpu_must_stop(struct vcpu *); +int vm_share_mem(struct vm_sharemem_params *, struct proc *); #endif /* _KERNEL */ #endif /* DEV_VMM_H */ diff --git a/usr.sbin/vmd/vioblk.c b/usr.sbin/vmd/vioblk.c index 9373a135aa8..33d447a9438 100644 --- a/usr.sbin/vmd/vioblk.c +++ b/usr.sbin/vmd/vioblk.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vioblk.c,v 1.2 2023/04/28 
18:52:22 dv Exp $ */ +/* $OpenBSD: vioblk.c,v 1.3 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2023 Dave Voutila @@ -58,7 +58,7 @@ disk_type(int type) } __dead void -vioblk_main(int fd) +vioblk_main(int fd, int fd_vmm) { struct virtio_dev dev; struct vioblk_dev *vioblk; @@ -71,8 +71,11 @@ vioblk_main(int fd) log_procinit("vioblk"); - /* stdio - needed for read/write to disk fds and channels to the vm. */ - if (pledge("stdio", NULL) == -1) + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) fatal("pledge"); /* Receive our virtio_dev, mostly preconfigured. */ @@ -92,8 +95,9 @@ vioblk_main(int fd) vioblk = &dev.vioblk; log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, " - "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd, - dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer); + "async fd = %d, sz = %lld maxfer = %d, vmm fd = %d", __func__, + vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz, + vioblk->max_xfer, fd_vmm); /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); @@ -108,12 +112,19 @@ vioblk_main(int fd) setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx); /* Now that we have our vm information, we can remap memory. */ - ret = remap_guest_mem(&vm); + ret = remap_guest_mem(&vm, fd_vmm); if (ret) { log_warnx("failed to remap guest memory"); goto fail; } + /* + * We no longer need /dev/vmm access. + */ + close_fd(fd_vmm); + if (pledge("stdio", NULL) == -1) + fatal("pledge2"); + /* Initialize the virtio block abstractions. 
*/ type = vm.vm_params.vmc_disktypes[vioblk->idx]; switch (type) { diff --git a/usr.sbin/vmd/vionet.c b/usr.sbin/vmd/vionet.c index 6ce905fdccf..c16ad2635ea 100644 --- a/usr.sbin/vmd/vionet.c +++ b/usr.sbin/vmd/vionet.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vionet.c,v 1.2 2023/04/28 18:52:22 dv Exp $ */ +/* $OpenBSD: vionet.c,v 1.3 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2023 Dave Voutila @@ -61,7 +61,7 @@ static void dev_dispatch_vm(int, short, void *); static void handle_sync_io(int, short, void *); __dead void -vionet_main(int fd) +vionet_main(int fd, int fd_vmm) { struct virtio_dev dev; struct vionet_dev *vionet = NULL; @@ -73,8 +73,11 @@ vionet_main(int fd) log_procinit("vionet"); - /* stdio - needed for read/write to tap fd and channels to the vm. */ - if (pledge("stdio", NULL) == -1) + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) fatal("pledge"); /* Receive our vionet_dev, mostly preconfigured. */ @@ -92,8 +95,9 @@ vionet_main(int fd) dev.sync_fd = fd; vionet = &dev.vionet; - log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d", - __func__, vionet->data_fd, dev.sync_fd, dev.async_fd); + log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d" + ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd, + dev.async_fd, fd_vmm); /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); @@ -108,9 +112,18 @@ vionet_main(int fd) setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx); /* Now that we have our vm information, we can remap memory. */ - ret = remap_guest_mem(&vm); - if (ret) + ret = remap_guest_mem(&vm, fd_vmm); + if (ret) { + fatal("%s: failed to remap", __func__); goto fail; + } + + /* + * We no longer need /dev/vmm access. 
+ */ + close_fd(fd_vmm); + if (pledge("stdio", NULL) == -1) + fatal("pledge2"); /* If we're restoring hardware, re-initialize virtqueue hva's. */ if (vm.vm_state & VM_STATE_RECEIVED) { diff --git a/usr.sbin/vmd/virtio.c b/usr.sbin/vmd/virtio.c index 92e77b8f834..d29b9e7b883 100644 --- a/usr.sbin/vmd/virtio.c +++ b/usr.sbin/vmd/virtio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: virtio.c,v 1.102 2023/04/27 22:47:27 dv Exp $ */ +/* $OpenBSD: virtio.c,v 1.103 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -1297,7 +1297,7 @@ virtio_start(struct vmd_vm *vm) static int virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) { - char *nargv[8], num[32], t[2]; + char *nargv[10], num[32], vmm_fd[32], t[2]; pid_t dev_pid; int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0; size_t i, j, data_fds_sz, sz = 0; @@ -1483,6 +1483,8 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) memset(&nargv, 0, sizeof(nargv)); memset(num, 0, sizeof(num)); snprintf(num, sizeof(num), "%d", sync_fds[1]); + memset(vmm_fd, 0, sizeof(vmm_fd)); + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); t[0] = dev->dev_type; t[1] = '\0'; @@ -1492,13 +1494,15 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) nargv[2] = num; nargv[3] = "-t"; nargv[4] = t; - nargv[5] = "-n"; + nargv[5] = "-i"; + nargv[6] = vmm_fd; + nargv[7] = "-n"; if (env->vmd_verbose) { - nargv[6] = "-v"; - nargv[7] = NULL; + nargv[8] = "-v"; + nargv[9] = NULL; } else - nargv[6] = NULL; + nargv[8] = NULL; /* Control resumes in vmd.c:main(). 
*/ execvp(nargv[0], nargv); diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c index d42abb5a834..8ec69d6056e 100644 --- a/usr.sbin/vmd/vm.c +++ b/usr.sbin/vmd/vm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vm.c,v 1.88 2023/04/28 19:46:42 dv Exp $ */ +/* $OpenBSD: vm.c,v 1.89 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -218,9 +218,10 @@ static const struct vcpu_reg_state vcpu_init_flat16 = { * Primary entrypoint for launching a vm. Does not return. * * fd: file descriptor for communicating with vmm process. + * fd_vmm: file descriptor for communicating with vmm(4) device */ void -vm_main(int fd) +vm_main(int fd, int vmm_fd) { struct vm_create_params *vcp = NULL; struct vmd_vm vm; @@ -241,9 +242,8 @@ vm_main(int fd) * vmm - for the vmm ioctls and operations. * proc exec - fork/exec for launching devices. * recvfd - for vm send/recv and sending fd to devices. - * tmppath/rpath - for shm_mkstemp, ftruncate, unlink */ - if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1) + if (pledge("stdio vmm proc exec recvfd", NULL) == -1) fatal("pledge"); /* Receive our vm configuration. */ @@ -254,13 +254,6 @@ vm_main(int fd) _exit(EIO); } - /* Receive the /dev/vmm fd number. */ - sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd)); - if (sz != sizeof(env->vmd_fd)) { - log_warnx("failed to receive /dev/vmm fd"); - _exit(EIO); - } - /* Update process with the vm name. 
*/ vcp = &vm.vm_params.vmc_params; setproctitle("%s", vcp->vcp_name); @@ -1099,63 +1092,34 @@ int alloc_guest_mem(struct vmd_vm *vm) { void *p; - char *tmp; - int fd, ret = 0; + int ret = 0; size_t i, j; struct vm_create_params *vcp = &vm->vm_params.vmc_params; struct vm_mem_range *vmr; - tmp = calloc(32, sizeof(char)); - if (tmp == NULL) { - ret = errno; - log_warn("%s: calloc", __func__); - return (ret); - } - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); - - vm->vm_nmemfds = vcp->vcp_nmemranges; - for (i = 0; i < vcp->vcp_nmemranges; i++) { vmr = &vcp->vcp_memranges[i]; - fd = shm_mkstemp(tmp); - if (fd < 0) { - ret = errno; - log_warn("%s: shm_mkstemp", __func__); - return (ret); - } - if (ftruncate(fd, vmr->vmr_size) == -1) { - ret = errno; - log_warn("%s: ftruncate", __func__); - goto out; - } - if (fcntl(fd, F_SETFD, 0) == -1) { - ret = errno; - log_warn("%s: fcntl", __func__); - goto out; - } - if (shm_unlink(tmp) == -1) { - ret = errno; - log_warn("%s: shm_unlink", __func__); - goto out; - } - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); - + /* + * We only need R/W as userland. vmm(4) will use R/W/X in its + * mapping. + * + * We must use MAP_SHARED so emulated devices will be able + * to generate shared mappings. + */ p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_CONCEAL, fd, 0); + MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0); if (p == MAP_FAILED) { ret = errno; for (j = 0; j < i; j++) { vmr = &vcp->vcp_memranges[j]; munmap((void *)vmr->vmr_va, vmr->vmr_size); } - goto out; + return (ret); } - vm->vm_memfds[i] = fd; vmr->vmr_va = (vaddr_t)p; } -out: - free(tmp); + return (ret); } @@ -2552,10 +2516,11 @@ vm_pipe_recv(struct vm_dev_pipe *p) * Returns 0 on success, non-zero in event of failure. 
*/ int -remap_guest_mem(struct vmd_vm *vm) +remap_guest_mem(struct vmd_vm *vm, int vmm_fd) { struct vm_create_params *vcp; struct vm_mem_range *vmr; + struct vm_sharemem_params vsp; size_t i, j; void *p = NULL; int ret; @@ -2566,23 +2531,32 @@ remap_guest_mem(struct vmd_vm *vm) vcp = &vm->vm_params.vmc_params; /* - * We've execve'd, so we need to re-map the guest VM memory. Iterate - * over all possible vm_mem_range entries so we can initialize all - * file descriptors to a value. + * Initialize our VM shared memory request using our original + * creation parameters. We'll overwrite the va's after mmap(2). + */ + memset(&vsp, 0, sizeof(vsp)); + vsp.vsp_nmemranges = vcp->vcp_nmemranges; + vsp.vsp_vm_id = vcp->vcp_id; + memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges, + sizeof(vsp.vsp_memranges)); + + /* + * Use mmap(2) to identify virtual address space for our mappings. */ for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { - if (i < vcp->vcp_nmemranges) { - vmr = &vcp->vcp_memranges[i]; - /* Skip ranges we know we don't need right now. */ + if (i < vsp.vsp_nmemranges) { + vmr = &vsp.vsp_memranges[i]; + + /* Ignore any MMIO ranges. */ if (vmr->vmr_type == VM_MEM_MMIO) { - log_debug("%s: skipping range i=%ld, type=%d", - __func__, i, vmr->vmr_type); - vm->vm_memfds[i] = -1; + vmr->vmr_va = 0; + vcp->vcp_memranges[i].vmr_va = 0; continue; } - /* Re-mmap the memrange. */ - p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0); + + /* Make initial mappings for the memrange. */ + p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1, + 0); if (p == MAP_FAILED) { ret = errno; log_warn("%s: mmap", __func__); @@ -2594,11 +2568,29 @@ remap_guest_mem(struct vmd_vm *vm) return (ret); } vmr->vmr_va = (vaddr_t)p; - } else { - /* Initialize with an invalid fd. */ - vm->vm_memfds[i] = -1; + vcp->vcp_memranges[i].vmr_va = vmr->vmr_va; } } + /* + * munmap(2) now that we have va's and ranges that don't overlap. 
vmm + * will use the va's and sizes to recreate the mappings for us. + */ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + if (vmr->vmr_type == VM_MEM_MMIO) + continue; + if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1) + fatal("%s: munmap", __func__); + } + + /* + * Ask vmm to enter the shared mappings for us. They'll point + * to the same host physical memory, but will have a randomized + * virtual address for the calling process. + */ + if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1) + return (errno); + return (0); } diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c index b8cc0a09fe3..86a5132fe22 100644 --- a/usr.sbin/vmd/vmd.c +++ b/usr.sbin/vmd/vmd.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.c,v 1.148 2023/05/12 16:18:17 dv Exp $ */ +/* $OpenBSD: vmd.c,v 1.149 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Reyk Floeter @@ -788,7 +788,8 @@ main(int argc, char **argv) struct privsep *ps; int ch; enum privsep_procid proc_id = PROC_PARENT; - int proc_instance = 0, vm_launch = 0, vm_fd = -1; + int proc_instance = 0, vm_launch = 0; + int vmm_fd = -1, vm_fd = -1; const char *errp, *title = NULL; int argc0 = argc; char dev_type = '\0'; @@ -798,7 +799,7 @@ main(int argc, char **argv) if ((env = calloc(1, sizeof(*env))) == NULL) fatal("calloc: env"); - while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) { + while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) { switch (ch) { case 'D': if (cmdline_symset(optarg) < 0) @@ -852,6 +853,11 @@ main(int argc, char **argv) default: fatalx("invalid device type"); } break; + case 'i': + vmm_fd = strtonum(optarg, 0, 128, &errp); + if (errp) + fatalx("invalid vmm fd"); + break; default: usage(); } @@ -880,7 +886,7 @@ main(int argc, char **argv) ps = &env->vmd_ps; ps->ps_env = env; - env->vmd_fd = -1; + env->vmd_fd = vmm_fd; if (config_init(env) == -1) fatal("failed to initialize configuration"); @@ -896,14 +902,14 @@ main(int argc, char **argv) * If we're launching a new vm or 
its device, we short out here. */ if (vm_launch == VMD_LAUNCH_VM) { - vm_main(vm_fd); + vm_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (vm_launch == VMD_LAUNCH_DEV) { if (dev_type == VMD_DEVTYPE_NET) { - vionet_main(vm_fd); + vionet_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (dev_type == VMD_DEVTYPE_DISK) { - vioblk_main(vm_fd); + vioblk_main(vm_fd, vmm_fd); /* NOTREACHED */ } fatalx("unsupported device type '%c'", dev_type); diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h index 68de0544706..9c25b0c92ad 100644 --- a/usr.sbin/vmd/vmd.h +++ b/usr.sbin/vmd/vmd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.h,v 1.121 2023/04/28 19:46:42 dv Exp $ */ +/* $OpenBSD: vmd.h,v 1.122 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -329,9 +329,6 @@ struct vmd_vm { struct timeval vm_start_tv; int vm_start_limit; - int vm_memfds[VMM_MAX_MEM_RANGES]; - size_t vm_nmemfds; - TAILQ_ENTRY(vmd_vm) vm_entry; }; TAILQ_HEAD(vmlist, vmd_vm); @@ -488,7 +485,7 @@ int fd_hasdata(int); int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); /* vm.c */ -void vm_main(int); +void vm_main(int, int); void mutex_lock(pthread_mutex_t *); void mutex_unlock(pthread_mutex_t *); int read_mem(paddr_t, void *buf, size_t); @@ -499,7 +496,7 @@ void vm_pipe_send(struct vm_dev_pipe *, enum pipe_msg_type); enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *); int write_mem(paddr_t, const void *buf, size_t); void* hvaddr_mem(paddr_t, size_t); -int remap_guest_mem(struct vmd_vm *); +int remap_guest_mem(struct vmd_vm *, int); /* config.c */ int config_init(struct vmd *); @@ -527,9 +524,9 @@ int host(const char *, struct address *); int virtio_get_base(int, char *, size_t, int, const char *); /* vionet.c */ -__dead void vionet_main(int); +__dead void vionet_main(int, int); /* vioblk.c */ -__dead void vioblk_main(int); +__dead void vioblk_main(int, int); #endif /* VMD_H */ diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c index 35119673dc3..7f307f99c9c 100644 --- 
a/usr.sbin/vmd/vmm.c +++ b/usr.sbin/vmd/vmm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.c,v 1.111 2023/04/27 22:47:27 dv Exp $ */ +/* $OpenBSD: vmm.c,v 1.112 2023/05/13 23:15:28 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) { struct vm_create_params *vcp; struct vmd_vm *vm; - char *nargv[6], num[32]; + char *nargv[8], num[32], vmm_fd[32]; int fd, ret = EINVAL; int fds[2]; pid_t vm_pid; @@ -701,16 +701,6 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) if (ret == EIO) goto err; - /* Send the fd number for /dev/vmm. */ - sz = atomicio(vwrite, fds[0], &env->vmd_fd, - sizeof(env->vmd_fd)); - if (sz != sizeof(env->vmd_fd)) { - log_warnx("%s: failed to send /dev/vmm fd for vm '%s'", - __func__, vcp->vcp_name); - ret = EIO; - goto err; - } - /* Read back the kernel-generated vm id from the child */ sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)); if (sz != sizeof(vcp->vcp_id)) { @@ -773,17 +763,21 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) memset(&nargv, 0, sizeof(nargv)); memset(num, 0, sizeof(num)); snprintf(num, sizeof(num), "%d", fds[1]); + memset(vmm_fd, 0, sizeof(vmm_fd)); + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); nargv[0] = env->argv0; nargv[1] = "-V"; nargv[2] = num; nargv[3] = "-n"; + nargv[4] = "-i"; + nargv[5] = vmm_fd; if (env->vmd_verbose) { - nargv[4] = "-v"; - nargv[5] = NULL; + nargv[6] = "-v"; + nargv[7] = NULL; } else - nargv[4] = NULL; + nargv[6] = NULL; /* Control resumes in vmd main(). */ execvp(nargv[0], nargv); -- 2.20.1