Use execvp(2) to launch vm children with new address spaces.
Consequently, this introduces the use of unveil(2) in the vmm and vm
processes.
This imposes the requirement of launching vmd with an absolute path,
similar to sshd(8).
ok mlarkin@
-/* $OpenBSD: vm.c,v 1.84 2023/04/23 05:37:55 anton Exp $ */
+/* $OpenBSD: vm.c,v 1.85 2023/04/23 12:11:37 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
io_fn_t ioports_map[MAX_PORTS];
-int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
- struct vmop_create_params *, struct vcpu_reg_state *);
+static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
};
+/*
+ * vm_main
+ *
+ * Primary entrypoint for launching a vm. Does not return.
+ *
+ * Sequence, in order:
+ *  1. unveil(2) "/var/empty" with an empty permission string, then lock
+ *     the unveil list with unveil(NULL, NULL).
+ *  2. pledge(2) to "stdio vmm recvfd proc".
+ *  3. Read a complete struct vmd_vm from fd, followed by the integer
+ *     value stored into env->vmd_fd (env is declared outside this
+ *     function).
+ *  4. Set the process title and logging name to the vm's vcp_name.
+ *  5. Verify a boot fd is present unless VM_STATE_RECEIVED is set.
+ *  6. Call start_vm() and _exit() with its return value.
+ *
+ * Exit status on early failure: EIO if either read from fd comes up
+ * short, EINVAL if no boot fd was supplied for a vm that is not in the
+ * VM_STATE_RECEIVED state. unveil/pledge failures go through fatal().
+ *
+ * fd: file descriptor for communicating with vmm process.
+ */
+void
+vm_main(int fd)
+{
+ struct vm_create_params *vcp = NULL;
+ struct vmd_vm vm;
+ size_t sz = 0;
+ int ret = 0;
+
+ /*
+ * We aren't root, so we can't chroot(2). Use unveil(2) instead.
+ * "/var/empty" is unveiled with no permissions and the list is then
+ * locked, so no further unveil(2) calls can be made afterwards.
+ */
+ if (unveil("/var/empty", "") == -1)
+ fatal("unveil /var/empty");
+ if (unveil(NULL, NULL) == -1)
+ fatal("unveil lock");
+
+ /*
+ * pledge in the vm processes:
+ * stdio - for malloc and basic I/O including events.
+ * vmm - for the vmm ioctls and operations.
+ * recvfd - for vm send/recv and sending fd to devices.
+ * proc - required for vmm(4) VMM_IOC_CREATE ioctl
+ *
+ * NOTE(review): pledge(2) documents "sendfd" as the promise for
+ * passing descriptors; "recvfd" alone would only cover receiving
+ * them -- confirm the recvfd description above.
+ */
+ if (pledge("stdio vmm recvfd proc", NULL) == -1)
+ fatal("pledge");
+
+ /*
+ * Receive our vm configuration. The struct is zeroed first and must
+ * arrive in full; a short read is treated as a failed start.
+ */
+ memset(&vm, 0, sizeof(vm));
+ sz = atomicio(read, fd, &vm, sizeof(vm));
+ if (sz != sizeof(vm)) {
+ log_warnx("failed to receive start message");
+ _exit(EIO);
+ }
+
+ /*
+ * Receive the /dev/vmm fd number. Only the integer value is read
+ * here; presumably the descriptor itself is already open in this
+ * process under that number -- TODO confirm against the sender.
+ */
+ sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd));
+ if (sz != sizeof(env->vmd_fd)) {
+ log_warnx("failed to receive /dev/vmm fd");
+ _exit(EIO);
+ }
+
+ /* Update process with the vm name. */
+ vcp = &vm.vm_params.vmc_params;
+ setproctitle("%s", vcp->vcp_name);
+ log_procinit(vcp->vcp_name);
+
+ /*
+ * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
+ * kernel or a BIOS image.
+ * A vm flagged VM_STATE_RECEIVED is exempt from this check.
+ */
+ if (vm.vm_kernel < 0 && !(vm.vm_state & VM_STATE_RECEIVED)) {
+ log_warnx("%s: failed to receive boot fd", vcp->vcp_name);
+ _exit(EINVAL);
+ }
+
+ ret = start_vm(&vm, fd);
+ _exit(ret);
+}
+
/*
* loadfile_bios
*
struct vm_rwregs_params vrp;
struct stat sb;
- /* Child */
- setproctitle("%s", vcp->vcp_name);
- log_procinit(vcp->vcp_name);
-
+ /*
+ * We first try to initialize and allocate memory before bothering
+ * vmm(4) with a request to create a new vm.
+ */
if (!(vm->vm_state & VM_STATE_RECEIVED))
create_memory_map(vcp);
- ret = alloc_guest_mem(vcp);
-
+ ret = alloc_guest_mem(&vm->vm_params.vmc_params);
if (ret) {
struct rlimit lim;
char buf[FMT_SCALED_STRSIZE];
"limit is %s)", buf);
}
errno = ret;
- fatal("could not allocate guest memory");
+ log_warn("could not allocate guest memory");
+ return (ret);
}
+ /* We've allocated guest memory, so now create the vm in vmm(4). */
ret = vmm_create_vm(vcp);
- current_vm = vm;
+ if (ret) {
+ /* Let the vmm process know we failed by sending a 0 vm id. */
+ vcp->vcp_id = 0;
+ atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
+ return (ret);
+ }
- /* send back the kernel-generated vm id (0 on error) */
- if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
- sizeof(vcp->vcp_id))
- fatal("failed to send created vm id to vmm process");
+ /* Tighten pledge now that we've called VMM_IOC_CREATE ioctl. */
+ if (pledge("stdio vmm recvfd", NULL) == -1)
+ fatal("pledge");
- if (ret) {
- errno = ret;
- fatal("create vmm ioctl failed - exiting");
+ /*
+ * Some of vmd currently relies on global state (current_vm, con_fd).
+ */
+ current_vm = vm;
+ con_fd = vm->vm_tty;
+ if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
+ log_warn("failed to set nonblocking mode on console");
+ return (1);
}
/*
- * pledge in the vm processes:
- * stdio - for malloc and basic I/O including events.
- * recvfd - for send/recv.
- * vmm - for the vmm ioctls and operations.
+ * We now let the vmm process know we were successful by sending it our
+ * vmm(4) assigned vm id.
*/
- if (pledge("stdio vmm recvfd", NULL) == -1)
- fatal("pledge");
+ if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
+ sizeof(vcp->vcp_id)) {
+ log_warn("failed to send created vm id to vmm process");
+ return (1);
+ }
+ /* Prepare either our boot image or receive an existing vm to launch. */
if (vm->vm_state & VM_STATE_RECEIVED) {
ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
if (ret != sizeof(vrp))
}
if (vm->vm_kernel != -1)
- close(vm->vm_kernel);
+ close_fd(vm->vm_kernel);
- con_fd = vm->vm_tty;
- if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
- fatal("failed to set nonblocking mode on console");
+ /* Initialize our mutexes. */
+ ret = pthread_mutex_init(&threadmutex, NULL);
+ if (ret) {
+ log_warn("%s: could not initialize thread state mutex",
+ __func__);
+ return (ret);
+ }
+ ret = pthread_cond_init(&threadcond, NULL);
+ if (ret) {
+ log_warn("%s: could not initialize thread state "
+ "condition variable", __func__);
+ return (ret);
+ }
+ mutex_lock(&threadmutex);
- for (i = 0; i < VM_MAX_NICS_PER_VM; i++)
- nicfds[i] = vm->vm_ifs[i].vif_fd;
+ /*
+ * Finalize our communication socket with the vmm process. From here
+ * onwards, communication with the vmm process is event-based.
+ */
event_init();
+ if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
+ fatal("setup vm pipe");
+
+ /*
+ * Initialize or restore our emulated hardware.
+ */
+ for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
+ nicfds[i] = vm->vm_ifs[i].vif_fd;
if (vm->vm_state & VM_STATE_RECEIVED) {
restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
if (restore_vm_params(vm->vm_receive_fd, vcp))
fatal("restore vm params failed");
unpause_vm(vcp);
- }
-
- if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
- fatal("setup vm pipe");
+ } else
+ init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);
- /* Execute the vcpu run loop(s) for this VM */
- ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
+ /*
+ * Execute the vcpu run loop(s) for this VM.
+ */
+ ret = run_vm(&vm->vm_params, &vrs);
/* Ensure that any in-flight data is written back */
virtio_shutdown(vm);
* 0: the VM exited normally
* !0 : the VM exited abnormally or failed to start
*/
-int
-run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
- int *child_taps, struct vmop_create_params *vmc,
- struct vcpu_reg_state *vrs)
+static int
+run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
{
struct vm_create_params *vcp = &vmc->vmc_params;
struct vm_rwregs_params vregsp;
if (vcp == NULL)
return (EINVAL);
- if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
- return (EINVAL);
-
- if (child_disks == NULL && vcp->vcp_ndisks != 0)
- return (EINVAL);
-
- if (child_taps == NULL && vcp->vcp_nnics != 0)
- return (EINVAL);
-
- if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_ndisks > VM_MAX_DISKS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_nnics > VM_MAX_NICS_PER_VM)
- return (EINVAL);
-
if (vcp->vcp_nmemranges == 0 ||
vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
return (EINVAL);
return (ENOMEM);
}
- log_debug("%s: initializing hardware for vm %s", __func__,
- vcp->vcp_name);
-
- if (!(current_vm->vm_state & VM_STATE_RECEIVED))
- init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
-
- ret = pthread_mutex_init(&threadmutex, NULL);
- if (ret) {
- log_warn("%s: could not initialize thread state mutex",
- __func__);
- return (ret);
- }
- ret = pthread_cond_init(&threadcond, NULL);
- if (ret) {
- log_warn("%s: could not initialize thread state "
- "condition variable", __func__);
- return (ret);
- }
-
- mutex_lock(&threadmutex);
-
- log_debug("%s: starting vcpu threads for vm %s", __func__,
- vcp->vcp_name);
+ log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
+ vcp->vcp_ncpus, vcp->vcp_name);
/*
* Create and launch one thread for each VCPU. These threads may
-/* $OpenBSD: vmd.c,v 1.141 2023/04/19 12:58:16 jsg Exp $ */
+/* $OpenBSD: vmd.c,v 1.142 2023/04/23 12:11:37 dv Exp $ */
/*
* Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
/* Keep "priv" on top as procs[0] */
{ "priv", PROC_PRIV, vmd_dispatch_priv, priv },
{ "control", PROC_CONTROL, vmd_dispatch_control, control },
- { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, vmm_shutdown },
- { "agentx", PROC_AGENTX, vmd_dispatch_agentx, vm_agentx, vm_agentx_shutdown, "/" }
+ { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm,
+ vmm_shutdown, "/" },
+ { "agentx", PROC_AGENTX, vmd_dispatch_agentx, vm_agentx,
+ vm_agentx_shutdown, "/" }
};
enum privsep_procid privsep_process;
int ch;
const char *conffile = VMD_CONF;
enum privsep_procid proc_id = PROC_PARENT;
- int proc_instance = 0;
+ int proc_instance = 0, vm_launch = 0, vm_fd = -1;
const char *errp, *title = NULL;
int argc0 = argc;
if ((env = calloc(1, sizeof(*env))) == NULL)
fatal("calloc: env");
- while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
+ while ((ch = getopt(argc, argv, "D:P:I:V:df:vn")) != -1) {
switch (ch) {
case 'D':
if (cmdline_symset(optarg) < 0)
case 'v':
env->vmd_verbose++;
break;
+ /* vmd fork/exec */
case 'n':
env->vmd_noaction = 1;
break;
if (errp)
fatalx("invalid process instance");
break;
+ /* child vm fork/exec */
+ case 'V':
+ vm_launch = VMD_LAUNCH_VM;
+ vm_fd = strtonum(optarg, 0, 128, &errp);
+ if (errp)
+ fatalx("invalid vm fd");
+ break;
default:
usage();
}
log_init(env->vmd_debug, LOG_DAEMON);
log_setverbose(env->vmd_verbose);
+ /* Re-exec from the vmm child process requires an absolute path. */
+ if (proc_id == PROC_PARENT && *argv[0] != '/')
+ fatalx("re-exec requires execution with an absolute path");
+ env->argv0 = argv[0];
+
/* check for root privileges */
- if (env->vmd_noaction == 0) {
+ if (env->vmd_noaction == 0 && !vm_launch) {
if (geteuid())
fatalx("need root privileges");
}
proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */
+ /*
+ * If we're launching a new vm or its device, we short out here.
+ */
+ if (vm_launch == VMD_LAUNCH_VM) {
+ vm_main(vm_fd);
+ /* NOTREACHED */
+ }
+
/* Open /dev/vmm early. */
if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) {
env->vmd_fd = open(VMM_NODE, O_RDWR);
-/* $OpenBSD: vmd.h,v 1.116 2023/04/16 12:47:26 dv Exp $ */
+/* $OpenBSD: vmd.h,v 1.117 2023/04/23 12:11:37 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
#define VMD_DEFAULT_STAGGERED_START_DELAY 30
+/* Launch mode identifiers for when a vm fork+exec's. */
+#define VMD_LAUNCH_VM 1
+
/* Rate-limit fast reboots */
#define VM_START_RATE_SEC 6 /* min. seconds since last reboot */
#define VM_START_RATE_LIMIT 3 /* max. number of fast reboots */
struct vmd {
struct privsep vmd_ps;
const char *vmd_conffile;
+ char *argv0; /* abs. path to vmd for exec, unveil */
/* global configuration that is sent to the children */
struct vmd_config vmd_cfg;
int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
/* vm.c */
+void vm_main(int);
void mutex_lock(pthread_mutex_t *);
void mutex_unlock(pthread_mutex_t *);
int read_mem(paddr_t, void *buf, size_t);
-/* $OpenBSD: vmm.c,v 1.108 2023/04/16 12:47:26 dv Exp $ */
+/* $OpenBSD: vmm.c,v 1.109 2023/04/23 12:11:37 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
if (config_init(ps->ps_env) == -1)
fatal("failed to initialize configuration");
- signal_del(&ps->ps_evsigchld);
- signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
- signal_add(&ps->ps_evsigchld, NULL);
+ /*
+ * We aren't root, so we can't chroot(2). Use unveil(2) instead.
+ */
+ if (unveil(env->argv0, "x") == -1)
+ fatal("unveil %s", env->argv0);
+ if (unveil(NULL, NULL) == -1)
+ fatal("unveil lock");
/*
* pledge in the vmm process:
* stdio - for malloc and basic I/O including events.
* vmm - for the vmm ioctls and operations.
- * proc - for forking and maitaining vms.
+ * proc, exec - for forking and execing new vm's.
* sendfd - for sending send/recv fds to vm proc.
* recvfd - for disks, interfaces and other fds.
*/
- if (pledge("stdio vmm sendfd recvfd proc", NULL) == -1)
+ if (pledge("stdio vmm sendfd recvfd proc exec", NULL) == -1)
fatal("pledge");
+
+ signal_del(&ps->ps_evsigchld);
+ signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
+ signal_add(&ps->ps_evsigchld, NULL);
}
int
/*
* vmm_start_vm
*
- * Prepares and forks a new VM process.
+ * Prepares and fork+execs a new VM process.
*
* Parameters:
* imsg: The VM data structure that is including the VM create parameters.
{
struct vm_create_params *vcp;
struct vmd_vm *vm;
- int ret = EINVAL;
+ char *nargv[5], num[32];
+ int fd, ret = EINVAL;
int fds[2];
pid_t vm_pid;
size_t i, j, sz;
if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
fatal("socketpair");
- /* Fork the vmm process to create the vm, inheriting open device fds. */
+ /* Keep our channel open after exec. */
+ if (fcntl(fds[1], F_SETFD, 0)) {
+ ret = errno;
+ log_warn("%s: fcntl", __func__);
+ goto err;
+ }
+
+ /* Start child vmd for this VM (fork, then exec in the child). */
vm_pid = fork();
if (vm_pid == -1) {
- log_warn("%s: fork child failed", __func__);
+ log_warn("%s: start child failed", __func__);
ret = EIO;
goto err;
}
vm->vm_pid = vm_pid;
close_fd(fds[1]);
+ /* Send the details over the pipe to the child. */
+ sz = atomicio(vwrite, fds[0], vm, sizeof(*vm));
+ if (sz != sizeof(*vm)) {
+ log_warnx("%s: failed to send config for vm '%s'",
+ __func__, vcp->vcp_name);
+ ret = EIO;
+ /* Defer error handling until after fd closing. */
+ }
+
+ /* As the parent/vmm process, we no longer need these fds. */
for (i = 0 ; i < vcp->vcp_ndisks; i++) {
for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
if (close_fd(vm->vm_disks[i][j]) == 0)
if (close_fd(vm->vm_tty) == 0)
vm->vm_tty = -1;
+ /* Deferred error handling from sending the vm struct. */
+ if (ret == EIO)
+ goto err;
+
+ /* Send the fd number for /dev/vmm. */
+ sz = atomicio(vwrite, fds[0], &env->vmd_fd,
+ sizeof(env->vmd_fd));
+ if (sz != sizeof(env->vmd_fd)) {
+ log_warnx("%s: failed to send /dev/vmm fd for vm '%s'",
+ __func__, vcp->vcp_name);
+ ret = EIO;
+ goto err;
+ }
+
/* Read back the kernel-generated vm id from the child */
sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
if (sz != sizeof(vcp->vcp_id)) {
goto err;
}
+ /* Check for an invalid id. This indicates child failure. */
if (vcp->vcp_id == 0)
goto err;
*id = vcp->vcp_id;
*pid = vm->vm_pid;
+ /* Wire up our pipe into the event handling. */
if (vmm_pipe(vm, fds[0], vmm_dispatch_vm) == -1)
fatal("setup vm pipe");
return (0);
} else {
- /* Child */
+ /* Child. Create a new session. */
+ if (setsid() == -1)
+ fatal("setsid");
+
close_fd(fds[0]);
close_fd(PROC_PARENT_SOCK_FILENO);
- ret = start_vm(vm, fds[1]);
+ /* Detach from terminal. */
+ if (!env->vmd_debug && (fd =
+ open("/dev/null", O_RDWR, 0)) != -1) {
+ dup2(fd, STDIN_FILENO);
+ dup2(fd, STDOUT_FILENO);
+ dup2(fd, STDERR_FILENO);
+ if (fd > 2)
+ close(fd);
+ }
+ /* Toggle all fds to not close on exec. */
+ for (i = 0 ; i < vcp->vcp_ndisks; i++)
+ for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
+ if (vm->vm_disks[i][j] != -1)
+ fcntl(vm->vm_disks[i][j], F_SETFD, 0);
+ for (i = 0 ; i < vcp->vcp_nnics; i++)
+ fcntl(vm->vm_ifs[i].vif_fd, F_SETFD, 0);
+ if (vm->vm_kernel != -1)
+ fcntl(vm->vm_kernel, F_SETFD, 0);
+ if (vm->vm_cdrom != -1)
+ fcntl(vm->vm_cdrom, F_SETFD, 0);
+ if (vm->vm_tty != -1)
+ fcntl(vm->vm_tty, F_SETFD, 0);
+ fcntl(env->vmd_fd, F_SETFD, 0); /* vmm device fd */
+
+ /*
+ * Prepare our new argv for execvp(2) with the fd of our open
+ * pipe to the parent/vmm process as an argument.
+ */
+ memset(num, 0, sizeof(num));
+ snprintf(num, sizeof(num), "%d", fds[1]);
+
+ nargv[0] = env->argv0;
+ nargv[1] = "-V";
+ nargv[2] = num;
+ nargv[3] = "-n";
+ nargv[4] = NULL;
+
+ /* Control resumes in vmd main(). */
+ execvp(nargv[0], nargv);
+
+ ret = errno;
+ log_warn("execvp %s", nargv[0]);
_exit(ret);
+ /* NOTREACHED */
}
return (0);
err:
- vm_remove(vm, __func__);
+ if (!vm->vm_from_config)
+ vm_remove(vm, __func__);
return (ret);
}