From: ccardenas Date: Sun, 9 Sep 2018 04:09:32 +0000 (+0000) Subject: Add initial qcow2 image support. X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=f224f92ab31c90476a30c119e655f312fe004b8a;p=openbsd Add initial qcow2 image support. Users are able to declare disk images as 'raw' or 'qcow2' using either vmctl or vm.conf. The default disk image format is 'raw' if not specified. Examples of using disk format: vmctl start bsd -Lc -r cd64.iso -d qcow2:current.qc2 or vmctl start bsd -Lc -r cd64.iso -d raw:current.raw is equivalent to vmctl start bsd -Lc -r cd64.iso -d current.raw in vm.conf vm "current" { disable memory 2G disk "/home/user/vmm/current.qc2" format "qcow2" interface { switch "external" } } or vm "current" { disable memory 2G disk "/home/user/vmm/current.raw" format "raw" interface { switch "external" } } is equivalent to vm "current" { disable memory 2G disk "/home/user/vmm/current.raw" interface { switch "external" } } Tested by many. Big Thanks to Ori Bernstein. 
--- diff --git a/usr.sbin/vmctl/main.c b/usr.sbin/vmctl/main.c index b7674d0c980..0652490cdb3 100644 --- a/usr.sbin/vmctl/main.c +++ b/usr.sbin/vmctl/main.c @@ -1,4 +1,4 @@ -/* $OpenBSD: main.c,v 1.39 2018/07/12 14:53:37 reyk Exp $ */ +/* $OpenBSD: main.c,v 1.40 2018/09/09 04:09:32 ccardenas Exp $ */ /* * Copyright (c) 2015 Reyk Floeter @@ -205,8 +205,8 @@ vmmaction(struct parse_result *res) switch (res->action) { case CMD_START: ret = vm_start(res->id, res->name, res->size, res->nifs, - res->nets, res->ndisks, res->disks, res->path, - res->isopath, res->instance); + res->nets, res->ndisks, res->disks, res->disktypes, + res->path, res->isopath, res->instance); if (ret) { errno = ret; err(1, "start VM operation failed"); @@ -334,6 +334,7 @@ parse_free(struct parse_result *res) for (i = 0; i < res->ndisks; i++) free(res->disks[i]); free(res->disks); + free(res->disktypes); memset(res, 0, sizeof(*res)); } @@ -398,10 +399,29 @@ parse_size(struct parse_result *res, char *word, long long val) return (0); } +#define RAW_FMT_PREFIX "raw:" +#define QCOW2_FMT_PREFIX "qcow2:" + +int +parse_disktype(char *s, char **ret) +{ + *ret = s; + if (strstr(s, RAW_FMT_PREFIX) == s) { + *ret = s + strlen(RAW_FMT_PREFIX); + return VMDF_RAW; + } + if (strstr(s, QCOW2_FMT_PREFIX) == s) { + *ret = s + strlen(QCOW2_FMT_PREFIX); + return VMDF_QCOW2; + } + return VMDF_RAW; +} + int -parse_disk(struct parse_result *res, char *word) +parse_disk(struct parse_result *res, char *word, int type) { char **disks; + int *disktypes; char *s; if ((disks = reallocarray(res->disks, res->ndisks + 1, @@ -409,12 +429,19 @@ parse_disk(struct parse_result *res, char *word) warn("reallocarray"); return (-1); } + if ((disktypes = reallocarray(res->disktypes, res->ndisks + 1, + sizeof(int))) == NULL) { + warn("reallocarray"); + return -1; + } if ((s = strdup(word)) == NULL) { warn("strdup"); return (-1); } disks[res->ndisks] = s; + disktypes[res->ndisks] = type; res->disks = disks; + res->disktypes = disktypes; 
res->ndisks++; return (0); @@ -580,8 +607,8 @@ ctl_reset(struct parse_result *res, int argc, char *argv[]) int ctl_start(struct parse_result *res, int argc, char *argv[]) { - int ch, i; - char path[PATH_MAX]; + int ch, i, type; + char path[PATH_MAX], *s; if (argc < 2) ctl_usage(res->ctl); @@ -628,9 +655,10 @@ ctl_start(struct parse_result *res, int argc, char *argv[]) errx(1, "invalid network: %s", optarg); break; case 'd': - if (realpath(optarg, path) == NULL) + type = parse_disktype(optarg, &s); + if (realpath(s, path) == NULL) err(1, "invalid disk path"); - if (parse_disk(res, path) != 0) + if (parse_disk(res, path, type) != 0) errx(1, "invalid disk: %s", optarg); break; case 'i': diff --git a/usr.sbin/vmctl/vmctl.8 b/usr.sbin/vmctl/vmctl.8 index 81ecbeb6c1d..0c0ed779e2b 100644 --- a/usr.sbin/vmctl/vmctl.8 +++ b/usr.sbin/vmctl/vmctl.8 @@ -1,4 +1,4 @@ -.\" $OpenBSD: vmctl.8,v 1.44 2018/07/29 14:11:05 anton Exp $ +.\" $OpenBSD: vmctl.8,v 1.45 2018/09/09 04:09:32 ccardenas Exp $ .\" .\" Copyright (c) 2015 Mike Larkin .\" @@ -14,7 +14,7 @@ .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. .\" -.Dd $Mdocdate: July 29 2018 $ +.Dd $Mdocdate: September 9 2018 $ .Dt VMCTL 8 .Os .Sh NAME @@ -55,7 +55,15 @@ Creates a VM disk image file with the specified .Ar path and .Ar size , -rounded to megabytes. +rounded to megabytes. The disk +.Ar format +may be specified as either +.Ar raw +or +.Ar qcow2 , +defaulting to +.Ar raw +if left unspecified. .It Cm load Ar filename Load additional configuration from the specified file. .It Cm log brief @@ -107,6 +115,14 @@ If not specified, the default is to boot using the BIOS image in Automatically connect to the VM console. .It Fl d Ar path Disk image file (may be specified multiple times to add multiple disk images). +The disk +.Ar path +may be prefixed with a format prefix ( +.Pa raw: +or +.Pa qcow2: +) in order to specify the disk format. 
If left unspecified, the default format is +.Pa raw . .It Fl i Ar count Number of network interfaces to add to the VM. .It Fl L diff --git a/usr.sbin/vmctl/vmctl.c b/usr.sbin/vmctl/vmctl.c index 867a0e703e0..3810bd778d1 100644 --- a/usr.sbin/vmctl/vmctl.c +++ b/usr.sbin/vmctl/vmctl.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmctl.c,v 1.55 2018/08/23 06:04:53 reyk Exp $ */ +/* $OpenBSD: vmctl.c,v 1.56 2018/09/09 04:09:32 ccardenas Exp $ */ /* * Copyright (c) 2014 Mike Larkin @@ -70,8 +70,8 @@ int info_console; */ int vm_start(uint32_t start_id, const char *name, int memsize, int nnics, - char **nics, int ndisks, char **disks, char *kernel, char *iso, - char *instance) + char **nics, int ndisks, char **disks, int *disktypes, char *kernel, + char *iso, char *instance) { struct vmop_create_params *vmc; struct vm_create_params *vcp; @@ -128,11 +128,13 @@ vm_start(uint32_t start_id, const char *name, int memsize, int nnics, vcp->vcp_nnics = nnics; vcp->vcp_id = start_id; - for (i = 0 ; i < ndisks; i++) + for (i = 0 ; i < ndisks; i++) { if (strlcpy(vcp->vcp_disks[i], disks[i], sizeof(vcp->vcp_disks[i])) >= sizeof(vcp->vcp_disks[i])) errx(1, "disk path too long"); + vmc->vmc_disktypes[i] = disktypes[i]; + } for (i = 0 ; i < nnics; i++) { vmc->vmc_ifflags[i] = VMIFF_UP; diff --git a/usr.sbin/vmctl/vmctl.h b/usr.sbin/vmctl/vmctl.h index 91ade10b7d8..92be581c0c5 100644 --- a/usr.sbin/vmctl/vmctl.h +++ b/usr.sbin/vmctl/vmctl.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmctl.h,v 1.21 2018/07/12 12:04:49 reyk Exp $ */ +/* $OpenBSD: vmctl.h,v 1.22 2018/09/09 04:09:32 ccardenas Exp $ */ /* * Copyright (c) 2015 Reyk Floeter @@ -52,6 +52,7 @@ struct parse_result { int nnets; size_t ndisks; char **disks; + int *disktypes; int verbose; char *instance; unsigned int flags; @@ -74,7 +75,8 @@ int vmmaction(struct parse_result *); int parse_ifs(struct parse_result *, char *, int); int parse_network(struct parse_result *, char *); int parse_size(struct parse_result *, char *, long long); -int parse_disk(struct 
parse_result *, char *); +int parse_disktype(char *, char **); +int parse_disk(struct parse_result *, char *, int); int parse_vmid(struct parse_result *, char *, int); int parse_instance(struct parse_result *, char *); void parse_free(struct parse_result *); @@ -85,7 +87,7 @@ __dead void /* vmctl.c */ int create_imagefile(const char *, long); int vm_start(uint32_t, const char *, int, int, char **, int, - char **, char *, char *, char *); + char **, int *, char *, char *, char *); int vm_start_complete(struct imsg *, int *, int); void terminate_vm(uint32_t, const char *, unsigned int); int terminate_vm_complete(struct imsg *, int *, unsigned int); diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile index 60616d39167..33e10f55ea5 100644 --- a/usr.sbin/vmd/Makefile +++ b/usr.sbin/vmd/Makefile @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile,v 1.19 2018/08/25 04:16:09 ccardenas Exp $ +# $OpenBSD: Makefile,v 1.20 2018/09/09 04:09:32 ccardenas Exp $ .if ${MACHINE} == "amd64" || ${MACHINE} == "i386" @@ -6,7 +6,7 @@ PROG= vmd SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c packet.c -SRCS+= parse.y atomicio.c vioscsi.c vioraw.c +SRCS+= parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c CFLAGS+= -Wall -I${.CURDIR} CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes diff --git a/usr.sbin/vmd/parse.y b/usr.sbin/vmd/parse.y index 13cb3e3ce3f..0adf4b7dcaf 100644 --- a/usr.sbin/vmd/parse.y +++ b/usr.sbin/vmd/parse.y @@ -1,4 +1,4 @@ -/* $OpenBSD: parse.y,v 1.43 2018/09/07 07:35:31 miko Exp $ */ +/* $OpenBSD: parse.y,v 1.44 2018/09/09 04:09:32 ccardenas Exp $ */ /* * Copyright (c) 2007-2016 Reyk Floeter @@ -88,7 +88,7 @@ int symset(const char *, const char *, int); char *symget(const char *); ssize_t parse_size(char *, int64_t); -int parse_disk(char *); +int parse_disk(char *, int); static struct vmop_create_params vmc; static struct vm_create_params *vcp; @@ 
-117,13 +117,14 @@ typedef struct { %token INCLUDE ERROR -%token ADD ALLOW BOOT CDROM DISABLE DISK DOWN ENABLE GROUP INSTANCE INTERFACE -%token LLADDR LOCAL LOCKED MEMORY NIFS OWNER PATH PREFIX RDOMAIN SIZE SOCKET -%token SWITCH UP VM VMID +%token ADD ALLOW BOOT CDROM DISABLE DISK DOWN ENABLE FORMAT GROUP INSTANCE +%token INTERFACE LLADDR LOCAL LOCKED MEMORY NIFS OWNER PATH PREFIX RDOMAIN +%token SIZE SOCKET SWITCH UP VM VMID %token NUMBER %token STRING %type lladdr %type disable +%type image_format %type local %type locked %type updown @@ -368,8 +369,8 @@ vm_opts_l : vm_opts_l vm_opts nl vm_opts : disable { vcp_disable = $1; } - | DISK string { - if (parse_disk($2) != 0) { + | DISK string image_format { + if (parse_disk($2, $3) != 0) { yyerror("failed to parse disks: %s", $2); free($2); YYERROR; @@ -559,6 +560,22 @@ owner_id : /* none */ { } ; +image_format : /* none */ { + $$ = VMDF_RAW; + } + | FORMAT string { + if (strcmp($2, "raw") == 0) + $$ = VMDF_RAW; + else if (strcmp($2, "qcow2") == 0) + $$ = VMDF_QCOW2; + else { + yyerror("unrecognized disk format %s", $2); + free($2); + YYERROR; + } + } + ; + iface_opts_o : '{' optnl iface_opts_l '}' | iface_opts_c | /* empty */ @@ -720,6 +737,7 @@ lookup(char *s) { "disk", DISK }, { "down", DOWN }, { "enable", ENABLE }, + { "format", FORMAT }, { "group", GROUP }, { "id", VMID }, { "include", INCLUDE }, @@ -1207,7 +1225,7 @@ parse_size(char *word, int64_t val) } int -parse_disk(char *word) +parse_disk(char *word, int type) { char path[PATH_MAX]; @@ -1226,6 +1244,7 @@ parse_disk(char *word) log_warnx("disk path too long"); return (-1); } + vmc.vmc_disktypes[vcp->vcp_ndisks] = type; vcp->vcp_ndisks++; diff --git a/usr.sbin/vmd/vioqcow2.c b/usr.sbin/vmd/vioqcow2.c new file mode 100644 index 00000000000..a893366bc3e --- /dev/null +++ b/usr.sbin/vmd/vioqcow2.c @@ -0,0 +1,580 @@ +/* $OpenBSD: vioqcow2.c,v 1.1 2018/09/09 04:09:32 ccardenas Exp $ */ + +/* + * Copyright (c) 2018 Ori Bernstein + * + * Permission to use, copy, 
modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "vmd.h" +#include "vmm.h" +#include "virtio.h" + +#define QCOW2_COMPRESSED 0x4000000000000000ull +#define QCOW2_INPLACE 0x8000000000000000ull + +#define QCOW2_DIRTY (1 << 0) +#define QCOW2_CORRUPT (1 << 1) + +enum { + ICFEATURE_DIRTY = 1 << 0, + ICFEATURE_CORRUPT = 1 << 1, +}; + +enum { + ACFEATURE_BITEXT = 1 << 0, +}; + +struct qcheader { + char magic[4]; + uint32_t version; + uint64_t backingoff; + uint32_t backingsz; + uint32_t clustershift; + uint64_t disksz; + uint32_t cryptmethod; + uint32_t l1sz; + uint64_t l1off; + uint64_t refoff; + uint32_t refsz; + uint32_t snapcount; + uint64_t snapsz; + /* v3 additions */ + uint64_t incompatfeatures; + uint64_t compatfeatures; + uint64_t autoclearfeatures; + uint32_t reforder; /* Bits = 1 << reforder */ + uint32_t headersz; +} __packed; + +struct qcdisk { + pthread_rwlock_t lock; + struct qcdisk *base; + struct qcheader header; + + int fd; + uint64_t *l1; + char *scratch; + off_t end; + uint32_t clustersz; + off_t disksz; /* In bytes */ + uint32_t cryptmethod; + + uint32_t l1sz; + off_t l1off; + + off_t refoff; + uint32_t refsz; + + uint32_t nsnap; + off_t snapoff; + + /* v3 features */ + uint64_t 
incompatfeatures; + uint64_t autoclearfeatures; + uint32_t refssz; + uint32_t headersz; +}; + +extern char *__progname; + +static off_t xlate(struct qcdisk *, off_t, int *); +static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t); +static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t); +static int inc_refs(struct qcdisk *, off_t, int); +static int qc2_openpath(struct qcdisk *, char *, int); +static int qc2_open(struct qcdisk *, int); +static ssize_t qc2_pread(void *, char *, size_t, off_t); +static ssize_t qc2_pwrite(void *, char *, size_t, off_t); +static void qc2_close(void *); + +/* + * Initializes a raw disk image backing file from an fd. + * Stores the number of 512 byte sectors in *szp, + * returning -1 for error, 0 for success. + * + * May open snapshot base images. + */ +int +virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd) +{ + struct qcdisk *diskp; + + diskp = malloc(sizeof(struct qcdisk)); + if (diskp == NULL) + return -1; + if (qc2_open(diskp, fd) == -1) { + log_warnx("could not open qcow2 disk"); + free(diskp); + return -1; + } + file->p = diskp; + file->pread = qc2_pread; + file->pwrite = qc2_pwrite; + file->close = qc2_close; + *szp = diskp->disksz; + return 0; +} + +static int +qc2_openpath(struct qcdisk *disk, char *path, int flags) +{ + int fd; + + fd = open(path, flags); + if (fd < 0) + return -1; + return qc2_open(disk, fd); +} + +static int +qc2_open(struct qcdisk *disk, int fd) +{ + char basepath[PATH_MAX]; + struct stat st; + struct qcheader header; + uint64_t backingoff; + uint32_t backingsz; + size_t i; + int version; + + if (pread(fd, &header, sizeof header, 0) != sizeof header) { + log_warn("short read on header"); + return -1; + } + if (strncmp(header.magic, "QFI\xfb", 4) != 0) { + log_warn("invalid magic numbers"); + return -1; + } + pthread_rwlock_init(&disk->lock, NULL); + disk->fd = fd; + disk->base = NULL; + + disk->clustersz = (1ull << be32toh(header.clustershift)); + disk->disksz 
= be64toh(header.disksz); + disk->cryptmethod = be32toh(header.cryptmethod); + disk->l1sz = be32toh(header.l1sz); + disk->l1off = be64toh(header.l1off); + disk->refsz = be32toh(header.refsz); + disk->refoff = be64toh(header.refoff); + disk->nsnap = be32toh(header.snapcount); + disk->snapoff = be64toh(header.snapsz); + /* + * The additional features here are defined as 0 in the v2 format, + * so as long as we clear the buffer before parsing, we don't need + * to check versions here. + */ + disk->incompatfeatures = be64toh(header.incompatfeatures); + disk->autoclearfeatures = be64toh(header.autoclearfeatures); + disk->refssz = be32toh(header.refsz); + disk->headersz = be32toh(header.headersz); + + /* + * We only know about the dirty or corrupt bits here. + */ + if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) { + log_warn("%s: unsupported features %llx", __progname, + disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)); + return -1; + } + + disk->l1 = calloc(disk->l1sz, sizeof *disk->l1); + if (pread(disk->fd, (char*)disk->l1, 8*disk->l1sz, disk->l1off) + != 8*disk->l1sz) { + free(disk->l1); + return -1; + } + for (i = 0; i < disk->l1sz; i++) + disk->l1[i] = be64toh(disk->l1[i]); + version = be32toh(header.version); + if (version != 2 && version != 3) { + log_warn("%s: unknown qcow2 version %d", __progname, version); + return -1; + } + + backingoff = be64toh(header.backingoff); + backingsz = be32toh(header.backingsz); + if (backingsz != 0) { + /* + * FIXME: we need to figure out a way of opening these things, + * otherwise we just crash with a pledge violation. 
+ */ + log_warn("unsupported external snapshot images"); + return -1; + + if (backingsz >= sizeof basepath - 1) { + log_warn("%s: snapshot path too long", __progname); + return -1; + } + if (pread(fd, basepath, backingsz, backingoff) != backingsz) { + log_warn("%s: could not read snapshot base name", + __progname); + return -1; + } + basepath[backingsz] = 0; + + disk->base = calloc(1, sizeof(struct qcdisk)); + if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) { + free(disk->base); + return -1; + } + if (disk->base->clustersz != disk->clustersz) { + log_warn("%s: all disks must share clustersize", + __progname); + free(disk->base); + return -1; + } + } + fstat(fd, &st); + disk->end = st.st_size; + return 0; +} + +static ssize_t +qc2_pread(void *p, char *buf, size_t len, off_t off) +{ + struct qcdisk *disk, *d; + off_t phys_off, end, cluster_off; + ssize_t sz, rem; + + disk = p; + end = off + len; + if (off < 0 || end > disk->disksz) + return -1; + + /* handle head chunk separately */ + rem = len; + while (off != end) { + for (d = disk; d; d = d->base) + if ((phys_off = xlate(d, off, NULL)) > 0) + break; + /* Break out into chunks. This handles + * three cases: + * + * |----+====|========|====+ | + * + * Either we are at the start of the read, + * and the cluster has some leading bytes. + * This means that we are reading the tail + * of the cluster, and our size is: + * + * clustersz - (off % clustersz). + * + * Otherwise, we're reading the middle section. + * We're already aligned here, so we can just + * read the whole cluster size. Or we're at the + * tail, at which point we just want to read the + * remaining bytes. + */ + cluster_off = off % disk->clustersz; + sz = disk->clustersz - cluster_off; + if (sz > rem) + sz = rem; + /* + * If we're within the disk, but don't have backing bytes, + * just read back zeros. 
+ */ + if (!d) + bzero(buf, sz); + else if (pread(d->fd, buf, sz, phys_off) != sz) + return -1; + off += sz; + buf += sz; + rem -= sz; + } + return len; +} + +ssize_t +qc2_pwrite(void *p, char *buf, size_t len, off_t off) +{ + struct qcdisk *disk, *d; + off_t phys_off, cluster_off, end; + ssize_t sz, rem; + int inplace; + + d = p; + disk = p; + inplace = 1; + end = off + len; + if (off < 0 || end > disk->disksz) + return -1; + rem = len; + while (off != end) { + /* See the read code for a summary of the computation */ + cluster_off = off % disk->clustersz; + sz = disk->clustersz - cluster_off; + if (sz > rem) + sz = rem; + + phys_off = xlate(disk, off, &inplace); + if (phys_off == -1) + return -1; + /* + * If we couldn't find the cluster in the writable disk, + * see if it exists in the base image. If it does, we + * need to copy it before the write. The copy happens + * in the '!inplace' if clause below te search. + */ + if (phys_off == 0) + for (d = disk->base; d; d = d->base) + if ((phys_off = xlate(d, off, NULL)) > 0) + break; + if (!inplace || phys_off == 0) + phys_off = mkcluster(disk, d, off, phys_off); + if (phys_off == -1) + return -1; + if (pwrite(disk->fd, buf, sz, phys_off) != sz) + return -1; + off += sz; + buf += sz; + rem -= sz; + } + return len; +} + +static void +qc2_close(void *p) +{ + struct qcdisk *disk; + + disk = p; + pwrite(disk->fd, disk->l1, disk->l1sz, disk->l1off); + close(disk->fd); + free(disk); +} + +/* + * Translates a virtual offset into an on-disk offset. + * Returns: + * -1 on error + * 0 on 'not found' + * >0 on found + */ +static off_t +xlate(struct qcdisk *disk, off_t off, int *inplace) +{ + off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff; + uint64_t buf; + + + /* + * Clear out inplace flag -- xlate misses should not + * be flagged as updatable in place. We will still + * return 0 from them, but this leaves less surprises + * in the API. 
+ */ + if (inplace) + *inplace = 0; + pthread_rwlock_rdlock(&disk->lock); + if (off < 0) + goto err; + + l2sz = disk->clustersz / 8; + l1off = (off / disk->clustersz) / l2sz; + if (l1off >= disk->l1sz) + goto err; + + l2tab = disk->l1[l1off]; + l2tab &= ~QCOW2_INPLACE; + if (l2tab == 0) { + pthread_rwlock_unlock(&disk->lock); + return 0; + } + l2off = (off / disk->clustersz) % l2sz; + pread(disk->fd, &buf, sizeof(buf), l2tab + l2off*8); + cluster = be64toh(buf); + /* + * cluster may be 0, but all future operations don't affect + * the return value. + */ + if (inplace) + *inplace = !!(cluster & QCOW2_INPLACE); + if (cluster & QCOW2_COMPRESSED) { + log_warn("%s: compressed clusters unsupported", __progname); + goto err; + } + pthread_rwlock_unlock(&disk->lock); + clusteroff = 0; + cluster &= ~QCOW2_INPLACE; + if (cluster) + clusteroff = off % disk->clustersz; + return cluster + clusteroff; +err: + pthread_rwlock_unlock(&disk->lock); + return -1; +} + +/* + * Allocates a new cluster on disk, creating a new L2 table + * if needed. The cluster starts off with a refs of one, + * and the writable bit set. + * + * Returns -1 on error, and the physical address within the + * cluster of the write offset if it exists. + */ +static off_t +mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) +{ + off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig; + uint64_t buf; + int fd; + + pthread_rwlock_wrlock(&disk->lock); + + cluster = -1; + fd = disk->fd; + /* L1 entries always exist */ + l2sz = disk->clustersz / 8; + l1off = off / (disk->clustersz * l2sz); + if (l1off >= disk->l1sz) + goto fail; + + /* + * Align disk to cluster size, for ftruncate: Not strictly + * required, but it easier to eyeball buggy write offsets, + * and helps performance a bit. 
+ */ + /* + * Build the mask in off_t: clustersz is 32 bits wide, and a + * 32-bit mask would also clear the upper half of disk->end, + * shrinking any image that has grown past 4 GB. + */ + disk->end = (disk->end + disk->clustersz - 1) & + ~((off_t)disk->clustersz - 1); + + l2tab = disk->l1[l1off]; + l2off = (off / disk->clustersz) % l2sz; + /* We may need to create or clone an L2 entry to map the block */ + if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) { + orig = l2tab & ~QCOW2_INPLACE; + l2tab = disk->end; + disk->end += disk->clustersz; + if (ftruncate(disk->fd, disk->end) == -1) { + perror("ftruncate"); + goto fail; + } + + /* + * If we translated, found a L2 entry, but it needed to + * be copied, copy it. + */ + if (orig != 0 && copy_cluster(disk, disk, l2tab, orig) == -1) { + perror("move cluster"); + goto fail; + } + /* Update l1 -- we flush it later */ + disk->l1[l1off] = l2tab | QCOW2_INPLACE; + if (inc_refs(disk, l2tab, 1) == -1) { + perror("refs"); + goto fail; + } + } + l2tab &= ~QCOW2_INPLACE; + + /* Grow the disk */ + if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0) + goto fail; + if (src_phys > 0) + if (copy_cluster(disk, base, disk->end, src_phys) == -1) + goto fail; + cluster = disk->end; + disk->end += disk->clustersz; + buf = htobe64(cluster | QCOW2_INPLACE); + if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf)) + goto fail; + + /* TODO: lazily sync: currently VMD doesn't close things */ + buf = htobe64(disk->l1[l1off]); + if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8) + goto fail; + if (inc_refs(disk, cluster, 1) == -1) + goto fail; + + pthread_rwlock_unlock(&disk->lock); + clusteroff = off % disk->clustersz; + return cluster + clusteroff; + +fail: + pthread_rwlock_unlock(&disk->lock); + return -1; +} + +/* Copies a cluster containing src to dst. Src and dst need not be aligned. 
*/ +static int +copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src) +{ + char *scratch; + + scratch = alloca(disk->clustersz); + if (!scratch) + err(1, "out of memory"); + src &= ~(disk->clustersz - 1); + dst &= ~(disk->clustersz - 1); + if (pread(base->fd, scratch, disk->clustersz, src) == -1) + return -1; + if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1) + return -1; + return 0; +} + +static int +inc_refs(struct qcdisk *disk, off_t off, int newcluster) +{ + off_t l1off, l1idx, l2idx, l2cluster; + size_t nper; + uint16_t refs; + uint64_t buf; + + off &= ~QCOW2_INPLACE; + nper = disk->clustersz / 2; + l1idx = (off / disk->clustersz) / nper; + l2idx = (off / disk->clustersz) % nper; + l1off = disk->refoff + 8*l1idx; + if (pread(disk->fd, &buf, sizeof buf, l1off) != 8) + return -1; + + l2cluster = be64toh(buf); + if (l2cluster == 0) { + l2cluster = disk->end; + disk->end += disk->clustersz; + if (ftruncate(disk->fd, disk->end) < 0) { + log_warn("refs block grow fail "); + return -1; + } + buf = htobe64(l2cluster); + if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) { + return -1; + } + } + + refs = 1; + if (!newcluster) { + if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2) + return -1; + refs = be16toh(refs) + 1; + } + refs = htobe16(refs); + if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) { + log_warn("could not write ref block"); + } + return 0; +} + diff --git a/usr.sbin/vmd/virtio.c b/usr.sbin/vmd/virtio.c index df500a385ce..37faa8a0c9a 100644 --- a/usr.sbin/vmd/virtio.c +++ b/usr.sbin/vmd/virtio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: virtio.c,v 1.64 2018/08/25 04:16:09 ccardenas Exp $ */ +/* $OpenBSD: virtio.c,v 1.65 2018/09/09 04:09:32 ccardenas Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -1746,13 +1746,18 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, } static int -virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd) +virtio_init_disk(struct virtio_backing 
*file, off_t *sz, int fd, int type) { /* - * This is where we slot in disk type selection. - * Right now, there's only raw. + * probe disk types in order of preference, first one to work wins. + * TODO: provide a way of specifying the type and options. */ - return virtio_init_raw(file, sz, fd); + switch (type) { + case VMDF_RAW: return virtio_init_raw(file, sz, fd); + case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd); + } + log_warnx("%s: invalid disk format", __progname); + return -1; } void @@ -1833,7 +1838,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks, vioblk[i].vm_id = vcp->vcp_id; vioblk[i].irq = pci_get_dev_irq(id); if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz, - child_disks[i]) == -1) + child_disks[i], vmc->vmc_disktypes[i]) == -1) continue; vioblk[i].sz /= 512; } @@ -1959,7 +1964,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks, vioscsi->vq[i].last_avail = 0; } if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, - child_cdrom) == -1) + child_cdrom, VMDF_RAW) == -1) return; vioscsi->locked = 0; vioscsi->lba = 0; @@ -2098,8 +2103,9 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps) } int -vioblk_restore(int fd, struct vm_create_params *vcp, int *child_disks) +vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks) { + struct vm_create_params *vcp = &vmc->vmc_params; uint8_t i; nr_vioblk = vcp->vcp_ndisks; @@ -2123,7 +2129,7 @@ vioblk_restore(int fd, struct vm_create_params *vcp, int *child_disks) return (-1); } if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz, - child_disks[i]) == -1) + child_disks[i], vmc->vmc_disktypes[i]) == -1) continue; } return (0); @@ -2155,7 +2161,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom) return (-1); } - virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom); + virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom, VMDF_RAW); return (0); } @@ -2171,7 +2177,7 @@ virtio_restore(int fd, struct vmd_vm *vm, 
int child_cdrom, int *child_disks, if ((ret = viornd_restore(fd, vcp)) == -1) return ret; - if ((ret = vioblk_restore(fd, vcp, child_disks)) == -1) + if ((ret = vioblk_restore(fd, vmc, child_disks)) == -1) return ret; if ((ret = vioscsi_restore(fd, vcp, child_cdrom)) == -1) diff --git a/usr.sbin/vmd/virtio.h b/usr.sbin/vmd/virtio.h index 84a7e2af6a5..86ee6d21f9f 100644 --- a/usr.sbin/vmd/virtio.h +++ b/usr.sbin/vmd/virtio.h @@ -1,4 +1,4 @@ -/* $OpenBSD: virtio.h,v 1.27 2018/08/25 04:16:09 ccardenas Exp $ */ +/* $OpenBSD: virtio.h,v 1.28 2018/09/09 04:09:32 ccardenas Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -270,10 +270,11 @@ void viornd_update_qa(void); int viornd_notifyq(void); int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd); +int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd); int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int vioblk_dump(int); -int vioblk_restore(int, struct vm_create_params *, int *); +int vioblk_restore(int, struct vmop_create_params *, int *); void vioblk_update_qs(struct vioblk_dev *); void vioblk_update_qa(struct vioblk_dev *); int vioblk_notifyq(struct vioblk_dev *); diff --git a/usr.sbin/vmd/vm.conf.5 b/usr.sbin/vmd/vm.conf.5 index 413395e4f3a..c4768d29114 100644 --- a/usr.sbin/vmd/vm.conf.5 +++ b/usr.sbin/vmd/vm.conf.5 @@ -1,4 +1,4 @@ -.\" $OpenBSD: vm.conf.5,v 1.35 2018/07/13 17:48:30 jmc Exp $ +.\" $OpenBSD: vm.conf.5,v 1.36 2018/09/09 04:09:32 ccardenas Exp $ .\" .\" Copyright (c) 2015 Mike Larkin .\" Copyright (c) 2015 Reyk Floeter @@ -15,7 +15,7 @@ .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. .\" -.Dd $Mdocdate: July 13 2018 $ +.Dd $Mdocdate: September 9 2018 $ .Dt VM.CONF 5 .Os .Sh NAME @@ -144,8 +144,15 @@ nor is specified. .It Cm disable Do not start this VM. 
-.It Cm disk Ar path +.It Cm disk Ar path Op Cm format Ar fmt Disk image file (may be specified multiple times to add multiple disk images). +The format may be specified as either +.Ar qcow2 +or +.Ar raw , +defaulting to +.Ar raw +if left unspecified. .It Oo Cm local Oc Cm interface Oo name Oc Op Brq ... Network interface to add to the VM. The optional diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h index 2b83bb42c71..a791c7388d5 100644 --- a/usr.sbin/vmd/vmd.h +++ b/usr.sbin/vmd/vmd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.h,v 1.78 2018/07/15 14:36:54 reyk Exp $ */ +/* $OpenBSD: vmd.h,v 1.79 2018/09/09 04:09:32 ccardenas Exp $ */ /* * Copyright (c) 2015 Mike Larkin @@ -164,6 +164,11 @@ struct vmop_create_params { #define VMIFF_LOCAL 0x04 #define VMIFF_RDOMAIN 0x08 #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN) + + unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM]; +#define VMDF_RAW 0x01 +#define VMDF_QCOW2 0x02 + char vmc_ifnames[VMM_MAX_NICS_PER_VM][IF_NAMESIZE]; char vmc_ifswitch[VMM_MAX_NICS_PER_VM][VM_NAME_MAX]; char vmc_ifgroup[VMM_MAX_NICS_PER_VM][IF_NAMESIZE];