From 24386e31c854f991d6fdd6ec96c78647f310c98d Mon Sep 17 00:00:00 2001 From: dv Date: Sun, 23 Apr 2023 12:11:37 +0000 Subject: [PATCH] vmd(8): teach vmm process how to exec. Use execvp(2) to launch vm children with new address spaces. Consequently, introduces use of unveil(2) into the vmm and vm processes. This imposes the requirement of launching vmd with absolute paths, similar to sshd(8). ok mlarkin@ --- usr.sbin/vmd/vm.c | 219 ++++++++++++++++++++++++++++----------------- usr.sbin/vmd/vmd.c | 35 ++++++-- usr.sbin/vmd/vmd.h | 7 +- usr.sbin/vmd/vmm.c | 114 ++++++++++++++++++++--- 4 files changed, 274 insertions(+), 101 deletions(-) diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c index 84db4c4e37a..62782a2cc1f 100644 --- a/usr.sbin/vmd/vm.c +++ b/usr.sbin/vmd/vm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vm.c,v 1.84 2023/04/23 05:37:55 anton Exp $ */ +/* $OpenBSD: vm.c,v 1.85 2023/04/23 12:11:37 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -74,8 +74,7 @@ io_fn_t ioports_map[MAX_PORTS]; -int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *, - struct vmop_create_params *, struct vcpu_reg_state *); +static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *); void vm_dispatch_vmm(int, short, void *); void *event_thread(void *); void *vcpu_run_loop(void *); @@ -213,6 +212,72 @@ static const struct vcpu_reg_state vcpu_init_flat16 = { .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 }; +/* + * vm_main + * + * Primary entrypoint for launching a vm. Does not return. + * + * fd: file descriptor for communicating with vmm process. + */ +void +vm_main(int fd) +{ + struct vm_create_params *vcp = NULL; + struct vmd_vm vm; + size_t sz = 0; + int ret = 0; + + /* + * We aren't root, so we can't chroot(2). Use unveil(2) instead. + */ + if (unveil("/var/empty", "") == -1) + fatal("unveil /var/empty"); + if (unveil(NULL, NULL) == -1) + fatal("unveil lock"); + + /* + * pledge in the vm processes: + * stdio - for malloc and basic I/O including events. 
+ * vmm - for the vmm ioctls and operations. + * recvfd - for vm send/recv and sending fd to devices. + * proc - required for vmm(4) VMM_IOC_CREATE ioctl + */ + if (pledge("stdio vmm recvfd proc", NULL) == -1) + fatal("pledge"); + + /* Receive our vm configuration. */ + memset(&vm, 0, sizeof(vm)); + sz = atomicio(read, fd, &vm, sizeof(vm)); + if (sz != sizeof(vm)) { + log_warnx("failed to receive start message"); + _exit(EIO); + } + + /* Receive the /dev/vmm fd number. */ + sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd)); + if (sz != sizeof(env->vmd_fd)) { + log_warnx("failed to receive /dev/vmm fd"); + _exit(EIO); + } + + /* Update process with the vm name. */ + vcp = &vm.vm_params.vmc_params; + setproctitle("%s", vcp->vcp_name); + log_procinit(vcp->vcp_name); + + /* + * We need, at minimum, a vm_kernel fd to boot a vm. This is either a + * kernel or a BIOS image. + */ + if (vm.vm_kernel < 0 && !(vm.vm_state & VM_STATE_RECEIVED)) { + log_warnx("%s: failed to receive boot fd", vcp->vcp_name); + _exit(EINVAL); + } + + ret = start_vm(&vm, fd); + _exit(ret); +} + /* * loadfile_bios * @@ -300,15 +365,14 @@ start_vm(struct vmd_vm *vm, int fd) struct vm_rwregs_params vrp; struct stat sb; - /* Child */ - setproctitle("%s", vcp->vcp_name); - log_procinit(vcp->vcp_name); - + /* + * We first try to initialize and allocate memory before bothering + * vmm(4) with a request to create a new vm. + */ if (!(vm->vm_state & VM_STATE_RECEIVED)) create_memory_map(vcp); - ret = alloc_guest_mem(vcp); - + ret = alloc_guest_mem(&vm->vm_params.vmc_params); if (ret) { struct rlimit lim; char buf[FMT_SCALED_STRSIZE]; @@ -318,31 +382,44 @@ start_vm(struct vmd_vm *vm, int fd) "limit is %s)", buf); } errno = ret; - fatal("could not allocate guest memory"); + log_warn("could not allocate guest memory"); + return (ret); } + /* We've allocated guest memory, so now create the vm in vmm(4). 
*/ ret = vmm_create_vm(vcp); - current_vm = vm; + if (ret) { + /* Let the vmm process know we failed by sending a 0 vm id. */ + vcp->vcp_id = 0; + atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)); + return (ret); + } - /* send back the kernel-generated vm id (0 on error) */ - if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != - sizeof(vcp->vcp_id)) - fatal("failed to send created vm id to vmm process"); + /* Tighten pledge now that we've called VMM_IOC_CREATE ioctl. */ + if (pledge("stdio vmm recvfd", NULL) == -1) + fatal("pledge"); - if (ret) { - errno = ret; - fatal("create vmm ioctl failed - exiting"); + /* + * Some of vmd currently relies on global state (current_vm, con_fd). + */ + current_vm = vm; + con_fd = vm->vm_tty; + if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) { + log_warn("failed to set nonblocking mode on console"); + return (1); } /* - * pledge in the vm processes: - * stdio - for malloc and basic I/O including events. - * recvfd - for send/recv. - * vmm - for the vmm ioctls and operations. + * We now let the vmm process know we were successful by sending it our + * vmm(4) assigned vm id. */ - if (pledge("stdio vmm recvfd", NULL) == -1) - fatal("pledge"); + if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != + sizeof(vcp->vcp_id)) { + log_warn("failed to send created vm id to vmm process"); + return (1); + } + /* Prepare either our boot image or receive an existing vm to launch. */ if (vm->vm_state & VM_STATE_RECEIVED) { ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp)); if (ret != sizeof(vrp)) @@ -377,16 +454,37 @@ start_vm(struct vmd_vm *vm, int fd) } if (vm->vm_kernel != -1) - close(vm->vm_kernel); + close_fd(vm->vm_kernel); - con_fd = vm->vm_tty; - if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) - fatal("failed to set nonblocking mode on console"); + /* Initialize our mutexes. 
*/ + ret = pthread_mutex_init(&threadmutex, NULL); + if (ret) { + log_warn("%s: could not initialize thread state mutex", + __func__); + return (ret); + } + ret = pthread_cond_init(&threadcond, NULL); + if (ret) { + log_warn("%s: could not initialize thread state " + "condition variable", __func__); + return (ret); + } + mutex_lock(&threadmutex); - for (i = 0; i < VM_MAX_NICS_PER_VM; i++) - nicfds[i] = vm->vm_ifs[i].vif_fd; + /* + * Finalize our communication socket with the vmm process. From here + * onwards, communication with the vmm process is event-based. + */ event_init(); + if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) + fatal("setup vm pipe"); + + /* + * Initialize or restore our emulated hardware. + */ + for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) + nicfds[i] = vm->vm_ifs[i].vif_fd; if (vm->vm_state & VM_STATE_RECEIVED) { restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, @@ -395,13 +493,13 @@ start_vm(struct vmd_vm *vm, int fd) if (restore_vm_params(vm->vm_receive_fd, vcp)) fatal("restore vm params failed"); unpause_vm(vcp); - } - - if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) - fatal("setup vm pipe"); + } else + init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds); - /* Execute the vcpu run loop(s) for this VM */ - ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs); + /* + * Execute the vcpu run loop(s) for this VM. 
+ */ + ret = run_vm(&vm->vm_params, &vrs); /* Ensure that any in-flight data is written back */ virtio_shutdown(vm); @@ -1205,10 +1303,8 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd, * 0: the VM exited normally * !0 : the VM exited abnormally or failed to start */ -int -run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], - int *child_taps, struct vmop_create_params *vmc, - struct vcpu_reg_state *vrs) +static int +run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs) { struct vm_create_params *vcp = &vmc->vmc_params; struct vm_rwregs_params vregsp; @@ -1223,24 +1319,6 @@ run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], if (vcp == NULL) return (EINVAL); - if (child_cdrom == -1 && strlen(vcp->vcp_cdrom)) - return (EINVAL); - - if (child_disks == NULL && vcp->vcp_ndisks != 0) - return (EINVAL); - - if (child_taps == NULL && vcp->vcp_nnics != 0) - return (EINVAL); - - if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) - return (EINVAL); - - if (vcp->vcp_ndisks > VM_MAX_DISKS_PER_VM) - return (EINVAL); - - if (vcp->vcp_nnics > VM_MAX_NICS_PER_VM) - return (EINVAL); - if (vcp->vcp_nmemranges == 0 || vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) return (EINVAL); @@ -1253,29 +1331,8 @@ run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], return (ENOMEM); } - log_debug("%s: initializing hardware for vm %s", __func__, - vcp->vcp_name); - - if (!(current_vm->vm_state & VM_STATE_RECEIVED)) - init_emulated_hw(vmc, child_cdrom, child_disks, child_taps); - - ret = pthread_mutex_init(&threadmutex, NULL); - if (ret) { - log_warn("%s: could not initialize thread state mutex", - __func__); - return (ret); - } - ret = pthread_cond_init(&threadcond, NULL); - if (ret) { - log_warn("%s: could not initialize thread state " - "condition variable", __func__); - return (ret); - } - - mutex_lock(&threadmutex); - - log_debug("%s: starting vcpu threads for vm %s", __func__, - vcp->vcp_name); + log_debug("%s: starting %zu vcpu thread(s) for 
vm %s", __func__, + vcp->vcp_ncpus, vcp->vcp_name); /* * Create and launch one thread for each VCPU. These threads may diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c index 97209bf1c8a..f188e55066e 100644 --- a/usr.sbin/vmd/vmd.c +++ b/usr.sbin/vmd/vmd.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.c,v 1.141 2023/04/19 12:58:16 jsg Exp $ */ +/* $OpenBSD: vmd.c,v 1.142 2023/04/23 12:11:37 dv Exp $ */ /* * Copyright (c) 2015 Reyk Floeter @@ -74,8 +74,10 @@ static struct privsep_proc procs[] = { /* Keep "priv" on top as procs[0] */ { "priv", PROC_PRIV, vmd_dispatch_priv, priv }, { "control", PROC_CONTROL, vmd_dispatch_control, control }, - { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, vmm_shutdown }, - { "agentx", PROC_AGENTX, vmd_dispatch_agentx, vm_agentx, vm_agentx_shutdown, "/" } + { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, + vmm_shutdown, "/" }, + { "agentx", PROC_AGENTX, vmd_dispatch_agentx, vm_agentx, + vm_agentx_shutdown, "/" } }; enum privsep_procid privsep_process; @@ -767,7 +769,7 @@ main(int argc, char **argv) int ch; const char *conffile = VMD_CONF; enum privsep_procid proc_id = PROC_PARENT; - int proc_instance = 0; + int proc_instance = 0, vm_launch = 0, vm_fd = -1; const char *errp, *title = NULL; int argc0 = argc; @@ -776,7 +778,7 @@ main(int argc, char **argv) if ((env = calloc(1, sizeof(*env))) == NULL) fatal("calloc: env"); - while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) { + while ((ch = getopt(argc, argv, "D:P:I:V:df:vn")) != -1) { switch (ch) { case 'D': if (cmdline_symset(optarg) < 0) @@ -792,6 +794,7 @@ main(int argc, char **argv) case 'v': env->vmd_verbose++; break; + /* vmd fork/exec */ case 'n': env->vmd_noaction = 1; break; @@ -807,6 +810,13 @@ main(int argc, char **argv) if (errp) fatalx("invalid process instance"); break; + /* child vm fork/exec */ + case 'V': + vm_launch = VMD_LAUNCH_VM; + vm_fd = strtonum(optarg, 0, 128, &errp); + if (errp) + fatalx("invalid vm fd"); + break; default: usage(); } @@ -822,8 +832,13 @@ main(int argc, char 
**argv) log_init(env->vmd_debug, LOG_DAEMON); log_setverbose(env->vmd_verbose); + /* Re-exec from the vmm child process requires an absolute path. */ + if (proc_id == PROC_PARENT && *argv[0] != '/') + fatalx("re-exec requires execution with an absolute path"); + env->argv0 = argv[0]; + /* check for root privileges */ - if (env->vmd_noaction == 0) { + if (env->vmd_noaction == 0 && !vm_launch) { if (geteuid()) fatalx("need root privileges"); } @@ -842,6 +857,14 @@ main(int argc, char **argv) proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */ proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */ + /* + * If we're launching a new vm or its device, we short out here. + */ + if (vm_launch == VMD_LAUNCH_VM) { + vm_main(vm_fd); + /* NOTREACHED */ + } + /* Open /dev/vmm early. */ if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) { env->vmd_fd = open(VMM_NODE, O_RDWR); diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h index 153d4206257..5e0eb8dfe8e 100644 --- a/usr.sbin/vmd/vmd.h +++ b/usr.sbin/vmd/vmd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.h,v 1.116 2023/04/16 12:47:26 dv Exp $ */ +/* $OpenBSD: vmd.h,v 1.117 2023/04/23 12:11:37 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -72,6 +72,9 @@ #define VMD_DEFAULT_STAGGERED_START_DELAY 30 +/* Launch mode identifiers for when a vm fork+exec's. */ +#define VMD_LAUNCH_VM 1 + /* Rate-limit fast reboots */ #define VM_START_RATE_SEC 6 /* min. seconds since last reboot */ #define VM_START_RATE_LIMIT 3 /* max. number of fast reboots */ @@ -355,6 +358,7 @@ struct vmd_config { struct vmd { struct privsep vmd_ps; const char *vmd_conffile; + char *argv0; /* abs. 
path to vmd for exec, unveil */ /* global configuration that is sent to the children */ struct vmd_config vmd_cfg; @@ -464,6 +468,7 @@ int fd_hasdata(int); int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); /* vm.c */ +void vm_main(int); void mutex_lock(pthread_mutex_t *); void mutex_unlock(pthread_mutex_t *); int read_mem(paddr_t, void *buf, size_t); diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c index 36c909e94be..e5d4e3ab73d 100644 --- a/usr.sbin/vmd/vmm.c +++ b/usr.sbin/vmd/vmm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmm.c,v 1.108 2023/04/16 12:47:26 dv Exp $ */ +/* $OpenBSD: vmm.c,v 1.109 2023/04/23 12:11:37 dv Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -80,20 +80,28 @@ vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg) if (config_init(ps->ps_env) == -1) fatal("failed to initialize configuration"); - signal_del(&ps->ps_evsigchld); - signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps); - signal_add(&ps->ps_evsigchld, NULL); + /* + * We aren't root, so we can't chroot(2). Use unveil(2) instead. + */ + if (unveil(env->argv0, "x") == -1) + fatal("unveil %s", env->argv0); + if (unveil(NULL, NULL) == -1) + fatal("unveil lock"); /* * pledge in the vmm process: * stdio - for malloc and basic I/O including events. * vmm - for the vmm ioctls and operations. - * proc - for forking and maitaining vms. + * proc, exec - for forking and execing new vm's. * sendfd - for sending send/recv fds to vm proc. * recvfd - for disks, interfaces and other fds. */ - if (pledge("stdio vmm sendfd recvfd proc", NULL) == -1) + if (pledge("stdio vmm sendfd recvfd proc exec", NULL) == -1) fatal("pledge"); + + signal_del(&ps->ps_evsigchld); + signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps); + signal_add(&ps->ps_evsigchld, NULL); } int @@ -603,7 +611,7 @@ opentap(char *ifname) /* * vmm_start_vm * - * Prepares and forks a new VM process. + * Prepares and fork+execs a new VM process. 
* * Parameters: * imsg: The VM data structure that is including the VM create parameters. @@ -619,7 +627,8 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) { struct vm_create_params *vcp; struct vmd_vm *vm; - int ret = EINVAL; + char *nargv[5], num[32]; + int fd, ret = EINVAL; int fds[2]; pid_t vm_pid; size_t i, j, sz; @@ -641,10 +650,17 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1) fatal("socketpair"); - /* Fork the vmm process to create the vm, inheriting open device fds. */ + /* Keep our channel open after exec. */ + if (fcntl(fds[1], F_SETFD, 0)) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto err; + } + + /* Start a child vmd for this VM: fork here, the child then execs. */ vm_pid = fork(); if (vm_pid == -1) { - log_warn("%s: fork child failed", __func__); + log_warn("%s: start child failed", __func__); ret = EIO; goto err; } @@ -654,6 +670,16 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) vm->vm_pid = vm_pid; close_fd(fds[1]); + /* Send the details over the pipe to the child. */ + sz = atomicio(vwrite, fds[0], vm, sizeof(*vm)); + if (sz != sizeof(*vm)) { + log_warnx("%s: failed to send config for vm '%s'", + __func__, vcp->vcp_name); + ret = EIO; + /* Defer error handling until after fd closing. */ + } + + /* As the parent/vmm process, we no longer need these fds. */ for (i = 0 ; i < vcp->vcp_ndisks; i++) { for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) { if (close_fd(vm->vm_disks[i][j]) == 0) @@ -671,6 +697,20 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) if (close_fd(vm->vm_tty) == 0) vm->vm_tty = -1; + /* Deferred error handling from sending the vm struct. */ + if (ret == EIO) + goto err; + + /* Send the fd number for /dev/vmm. 
*/ + sz = atomicio(vwrite, fds[0], &env->vmd_fd, + sizeof(env->vmd_fd)); + if (sz != sizeof(env->vmd_fd)) { + log_warnx("%s: failed to send /dev/vmm fd for vm '%s'", + __func__, vcp->vcp_name); + ret = EIO; + goto err; + } + /* Read back the kernel-generated vm id from the child */ sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)); if (sz != sizeof(vcp->vcp_id)) { @@ -681,30 +721,78 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) goto err; } + /* Check for an invalid id. This indicates child failure. */ if (vcp->vcp_id == 0) goto err; *id = vcp->vcp_id; *pid = vm->vm_pid; + /* Wire up our pipe into the event handling. */ if (vmm_pipe(vm, fds[0], vmm_dispatch_vm) == -1) fatal("setup vm pipe"); return (0); } else { - /* Child */ + /* Child. Create a new session. */ + if (setsid() == -1) + fatal("setsid"); + close_fd(fds[0]); close_fd(PROC_PARENT_SOCK_FILENO); - ret = start_vm(vm, fds[1]); + /* Detach from terminal. */ + if (!env->vmd_debug && (fd = + open("/dev/null", O_RDWR, 0)) != -1) { + dup2(fd, STDIN_FILENO); + dup2(fd, STDOUT_FILENO); + dup2(fd, STDERR_FILENO); + if (fd > 2) + close(fd); + } + /* Toggle all fds to not close on exec. */ + for (i = 0 ; i < vcp->vcp_ndisks; i++) + for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) + if (vm->vm_disks[i][j] != -1) + fcntl(vm->vm_disks[i][j], F_SETFD, 0); + for (i = 0 ; i < vcp->vcp_nnics; i++) + fcntl(vm->vm_ifs[i].vif_fd, F_SETFD, 0); + if (vm->vm_kernel != -1) + fcntl(vm->vm_kernel, F_SETFD, 0); + if (vm->vm_cdrom != -1) + fcntl(vm->vm_cdrom, F_SETFD, 0); + if (vm->vm_tty != -1) + fcntl(vm->vm_tty, F_SETFD, 0); + fcntl(env->vmd_fd, F_SETFD, 0); /* vmm device fd */ + + /* + * Prepare our new argv for execvp(2) with the fd of our open + * pipe to the parent/vmm process as an argument. 
+ */ + memset(num, 0, sizeof(num)); + snprintf(num, sizeof(num), "%d", fds[1]); + + nargv[0] = env->argv0; + nargv[1] = "-V"; + nargv[2] = num; + nargv[3] = "-n"; + nargv[4] = NULL; + + /* Control resumes in vmd main(). */ + execvp(nargv[0], nargv); + + ret = errno; + log_warn("execvp %s", nargv[0]); _exit(ret); + /* NOTREACHED */ } return (0); err: - vm_remove(vm, __func__); + if (!vm->vm_from_config) + vm_remove(vm, __func__); return (ret); } -- 2.20.1