From 02de433d8c4801712cfa33b1092e7f2c411d534e Mon Sep 17 00:00:00 2001
From: semarie
Date: Fri, 29 Jul 2022 17:47:11 +0000
Subject: [PATCH] Replace the swap extent(9) usage by a blist data structure.

It makes uvm_swap_free() faster: extents have a cost of O(n*n) which
doesn't really scale with gigabytes of swap.

Based on initial work from mpi@

The blist implementation comes from DragonFlyBSD.

The diff also adds a ddb(4) 'show swap' command to show the blist and
help with debugging, and fixes some off-by-one errors in the sizes
printed during hibernate.

ok mpi@
---
 regress/sys/uvm/blist/Makefile   |   37 +
 regress/sys/uvm/blist/test-1.in  |    3 +
 regress/sys/uvm/blist/test-1.out |   28 +
 regress/sys/uvm/blist/test-2.in  |    6 +
 regress/sys/uvm/blist/test-2.out |   34 +
 regress/sys/uvm/blist/test-3.in  |    2 +
 regress/sys/uvm/blist/test-3.out |    7 +
 regress/sys/uvm/blist/test-4.in  |    3 +
 regress/sys/uvm/blist/test-4.out |    9 +
 regress/sys/uvm/blist/test-5.in  |    3 +
 regress/sys/uvm/blist/test-5.out |   27 +
 share/man/man4/ddb.4             |    7 +-
 sys/conf/files                   |    3 +-
 sys/ddb/db_command.c             |   11 +-
 sys/ddb/db_interface.h           |    5 +-
 sys/kern/subr_blist.c            | 1284 ++++++++++++++++++++++++++++++
 sys/kern/subr_hibernate.c        |    6 +-
 sys/sys/blist.h                  |  136 ++++
 sys/uvm/uvm_swap.c               |  115 +--
 19 files changed, 1663 insertions(+), 63 deletions(-)
 create mode 100644 regress/sys/uvm/blist/Makefile
 create mode 100644 regress/sys/uvm/blist/test-1.in
 create mode 100644 regress/sys/uvm/blist/test-1.out
 create mode 100644 regress/sys/uvm/blist/test-2.in
 create mode 100644 regress/sys/uvm/blist/test-2.out
 create mode 100644 regress/sys/uvm/blist/test-3.in
 create mode 100644 regress/sys/uvm/blist/test-3.out
 create mode 100644 regress/sys/uvm/blist/test-4.in
 create mode 100644 regress/sys/uvm/blist/test-4.out
 create mode 100644 regress/sys/uvm/blist/test-5.in
 create mode 100644 regress/sys/uvm/blist/test-5.out
 create mode 100644 sys/kern/subr_blist.c
 create mode 100644 sys/sys/blist.h

diff --git a/regress/sys/uvm/blist/Makefile b/regress/sys/uvm/blist/Makefile
new file mode 100644
index 00000000000..c46e70b6c27
--- /dev/null
+++ b/regress/sys/uvm/blist/Makefile
@@ -0,0 +1,37 @@
+# $OpenBSD: Makefile,v 1.1 2022/07/29 17:47:11 semarie Exp $
+
+SRCS +=		${.CURDIR}/../../../../sys/kern/subr_blist.c
+WARNINGS =	Yes
+
+TESTS +=	\
+	test-1 1024 \
+	test-2 1024 \
+	test-3 64 \
+	test-4 64 \
+	test-5 1024
+
+.for t s in ${TESTS}
+run-$t: blist
+	./blist $s <${.CURDIR}/$t.in >$t.run
+	diff -u ${.CURDIR}/$t.out $t.run
+
+show-$t: blist
+	./blist $s <${.CURDIR}/$t.in 2>&1
+
+regen-$t: blist
+	./blist $s <${.CURDIR}/$t.in >${.CURDIR}/$t.out
+
+REGRESS_TARGETS +=	run-$t
+REGEN_TARGETS +=	regen-$t
+CLEANFILES +=		$t.run
+.endfor
+
+blist: ${SRCS}
+	${CC} -o $@ ${CFLAGS} ${SRCS}
+CLEANFILES += blist blist.d
+
+regen: ${REGEN_TARGETS}
+
+.PHONY: regen ${REGEN_TARGETS}
+
+.include <bsd.regress.mk>
diff --git a/regress/sys/uvm/blist/test-1.in b/regress/sys/uvm/blist/test-1.in
new file mode 100644
index 00000000000..0c00549ccaf
--- /dev/null
+++ b/regress/sys/uvm/blist/test-1.in
@@ -0,0 +1,3 @@
+a 1
+p
+g
diff --git a/regress/sys/uvm/blist/test-1.out b/regress/sys/uvm/blist/test-1.out
new file mode 100644
index 00000000000..2184455b9ec
--- /dev/null
+++ b/regress/sys/uvm/blist/test-1.out
@@ -0,0 +1,28 @@
+BLIST representing 1024 blocks (4 MB of swap), requiring 0.00M of ram
+BLIST raw radix tree: 18 records, top-radix 2048
+1024/1024/2048> count 1
+blist_meta_alloc blkat 0 blk 0 count 1 radix 2048
+ R=0000
+1023/1024/2048> BLIST {
+    (0000,2048): subtree (1023/2048) big=1023 {
+        (0000,64): bitmap fffffffffffffffe
big=64 + (0040,64): bitmap ffffffffffffffff big=64 + (0080,64): bitmap ffffffffffffffff big=64 + (00c0,64): bitmap ffffffffffffffff big=64 + (0100,64): bitmap ffffffffffffffff big=64 + (0140,64): bitmap ffffffffffffffff big=64 + (0180,64): bitmap ffffffffffffffff big=64 + (01c0,64): bitmap ffffffffffffffff big=64 + (0200,64): bitmap ffffffffffffffff big=64 + (0240,64): bitmap ffffffffffffffff big=64 + (0280,64): bitmap ffffffffffffffff big=64 + (02c0,64): bitmap ffffffffffffffff big=64 + (0300,64): bitmap ffffffffffffffff big=64 + (0340,64): bitmap ffffffffffffffff big=64 + (0380,64): bitmap ffffffffffffffff big=64 + (03c0,64): bitmap ffffffffffffffff big=64 + (0400,64): Terminator + } +} +1023/1024/2048> gapfind: begin=0040 end=0400 size=960 +1023/1024/2048> \ No newline at end of file diff --git a/regress/sys/uvm/blist/test-2.in b/regress/sys/uvm/blist/test-2.in new file mode 100644 index 00000000000..7d83c598c20 --- /dev/null +++ b/regress/sys/uvm/blist/test-2.in @@ -0,0 +1,6 @@ +a 64 +a 64 +a 64 +f 0x0040 64 +p +g diff --git a/regress/sys/uvm/blist/test-2.out b/regress/sys/uvm/blist/test-2.out new file mode 100644 index 00000000000..6da19c7c66b --- /dev/null +++ b/regress/sys/uvm/blist/test-2.out @@ -0,0 +1,34 @@ +BLIST representing 1024 blocks (4 MB of swap), requiring 0.00M of ram +BLIST raw radix tree: 18 records, top-radix 2048 +1024/1024/2048> count 64 +blist_meta_alloc blkat 0 blk 0 count 64 radix 2048 + R=0000 +960/1024/2048> count 64 +blist_meta_alloc blkat 0 blk 0 count 64 radix 2048 + R=0040 +896/1024/2048> count 64 +blist_meta_alloc blkat 0 blk 0 count 64 radix 2048 + R=0080 +832/1024/2048> 896/1024/2048> BLIST { + (0000,2048): subtree (896/2048) big=832 { + (0000,64): bitmap 0000000000000000 big=0 + (0040,64): bitmap ffffffffffffffff big=64 + (0080,64): bitmap 0000000000000000 big=64 + (00c0,64): bitmap ffffffffffffffff big=64 + (0100,64): bitmap ffffffffffffffff big=64 + (0140,64): bitmap ffffffffffffffff big=64 + (0180,64): bitmap ffffffffffffffff big=64 + (01c0,64): bitmap ffffffffffffffff big=64 + (0200,64): bitmap ffffffffffffffff big=64 + (0240,64): bitmap ffffffffffffffff big=64 + (0280,64): bitmap ffffffffffffffff big=64 + (02c0,64): bitmap ffffffffffffffff big=64 + (0300,64): bitmap ffffffffffffffff big=64 + (0340,64): bitmap ffffffffffffffff big=64 + (0380,64): bitmap ffffffffffffffff big=64 + (03c0,64): bitmap ffffffffffffffff big=64 + (0400,64): Terminator + } +} +896/1024/2048> gapfind: begin=00c0 end=0400 size=832 +896/1024/2048> \ No newline at end of file diff --git a/regress/sys/uvm/blist/test-3.in b/regress/sys/uvm/blist/test-3.in new file mode 100644 index 00000000000..960e7ef302a --- /dev/null +++ b/regress/sys/uvm/blist/test-3.in @@ -0,0 +1,2 @@ +p +g diff --git a/regress/sys/uvm/blist/test-3.out b/regress/sys/uvm/blist/test-3.out new file mode 100644 index 00000000000..b061c63d4d4 --- /dev/null +++ b/regress/sys/uvm/blist/test-3.out @@ -0,0 +1,7 @@ +BLIST representing 64 blocks (0 MB of swap), requiring 0.00M of ram +BLIST raw radix tree: 1 records, top-radix 64 +64/64/64> BLIST { + (0000,64): bitmap ffffffffffffffff big=64 +} +64/64/64> gapfind: begin=0000 end=0040 size=64 +64/64/64> \ No newline at end of file diff --git a/regress/sys/uvm/blist/test-4.in b/regress/sys/uvm/blist/test-4.in new file mode 100644 index 00000000000..fca2a03dc3b --- /dev/null +++ b/regress/sys/uvm/blist/test-4.in @@ -0,0 +1,3 @@ +a 64 +p +g diff --git a/regress/sys/uvm/blist/test-4.out b/regress/sys/uvm/blist/test-4.out new file mode 100644 index 00000000000..0a030935277 
--- /dev/null +++ b/regress/sys/uvm/blist/test-4.out @@ -0,0 +1,9 @@ +BLIST representing 64 blocks (0 MB of swap), requiring 0.00M of ram +BLIST raw radix tree: 1 records, top-radix 64 +64/64/64> count 64 + R=0000 +0/64/64> BLIST { + (0000,64): bitmap 0000000000000000 big=64 +} +0/64/64> gapfind: begin=0000 end=0000 size=0 +0/64/64> \ No newline at end of file diff --git a/regress/sys/uvm/blist/test-5.in b/regress/sys/uvm/blist/test-5.in new file mode 100644 index 00000000000..3c05f57c494 --- /dev/null +++ b/regress/sys/uvm/blist/test-5.in @@ -0,0 +1,3 @@ +a 64 0x300 +p +g diff --git a/regress/sys/uvm/blist/test-5.out b/regress/sys/uvm/blist/test-5.out new file mode 100644 index 00000000000..0258ea1b75c --- /dev/null +++ b/regress/sys/uvm/blist/test-5.out @@ -0,0 +1,27 @@ +BLIST representing 1024 blocks (4 MB of swap), requiring 0.00M of ram +BLIST raw radix tree: 18 records, top-radix 2048 +1024/1024/2048> blist_meta_alloc blkat 768 blk 0 count 64 radix 2048 + R=0300 +960/1024/2048> BLIST { + (0000,2048): subtree (960/2048) big=960 { + (0000,64): bitmap ffffffffffffffff big=64 + (0040,64): bitmap ffffffffffffffff big=64 + (0080,64): bitmap ffffffffffffffff big=64 + (00c0,64): bitmap ffffffffffffffff big=64 + (0100,64): bitmap ffffffffffffffff big=64 + (0140,64): bitmap ffffffffffffffff big=64 + (0180,64): bitmap ffffffffffffffff big=64 + (01c0,64): bitmap ffffffffffffffff big=64 + (0200,64): bitmap ffffffffffffffff big=64 + (0240,64): bitmap ffffffffffffffff big=64 + (0280,64): bitmap ffffffffffffffff big=64 + (02c0,64): bitmap ffffffffffffffff big=64 + (0300,64): bitmap 0000000000000000 big=64 + (0340,64): bitmap ffffffffffffffff big=64 + (0380,64): bitmap ffffffffffffffff big=64 + (03c0,64): bitmap ffffffffffffffff big=64 + (0400,64): Terminator + } +} +960/1024/2048> gapfind: begin=0000 end=0300 size=768 +960/1024/2048> \ No newline at end of file diff --git a/share/man/man4/ddb.4 b/share/man/man4/ddb.4 index 52decfcd4a6..46835e15452 100644 --- a/share/man/man4/ddb.4 +++ b/share/man/man4/ddb.4 @@ -1,4 +1,4 @@ -.\" $OpenBSD: ddb.4,v 1.102 2022/07/28 22:19:09 bluhm Exp $ +.\" $OpenBSD: ddb.4,v 1.103 2022/07/29 17:47:11 semarie Exp $ .\" $NetBSD: ddb.4,v 1.5 1994/11/30 16:22:09 jtc Exp $ .\" .\" Mach Operating System @@ -25,7 +25,7 @@ .\" any improvements or extensions that they make and grant Carnegie Mellon .\" the rights to redistribute these changes. .\" -.Dd $Mdocdate: July 28 2022 $ +.Dd $Mdocdate: July 29 2022 $ .Dt DDB 4 .Os .Sh NAME @@ -818,6 +818,9 @@ as a struct Nested structures and bit fields are not printed. Character arrays are printed as bytes. .\" -------------------- +.It Ic show swap +Prints a detailed list of all swaps. 
+.\" -------------------- .It Xo .Ic show tdb .Op Cm /f diff --git a/sys/conf/files b/sys/conf/files index faa5090b9f8..3bd32d693b8 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $OpenBSD: files,v 1.715 2022/07/12 17:12:31 jca Exp $ +# $OpenBSD: files,v 1.716 2022/07/29 17:47:11 semarie Exp $ # $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -710,6 +710,7 @@ file kern/kern_srp.c file kern/kern_xxx.c file kern/sched_bsd.c file kern/subr_autoconf.c +file kern/subr_blist.c file kern/subr_disk.c file kern/subr_evcount.c file kern/subr_extent.c diff --git a/sys/ddb/db_command.c b/sys/ddb/db_command.c index 8f560afbd7d..bb369832baa 100644 --- a/sys/ddb/db_command.c +++ b/sys/ddb/db_command.c @@ -1,4 +1,4 @@ -/* $OpenBSD: db_command.c,v 1.95 2022/07/28 22:19:09 bluhm Exp $ */ +/* $OpenBSD: db_command.c,v 1.96 2022/07/29 17:47:11 semarie Exp $ */ /* $NetBSD: db_command.c,v 1.20 1996/03/30 22:30:05 christos Exp $ */ /* @@ -102,6 +102,7 @@ void db_tdb_print_cmd(db_expr_t, int, db_expr_t, char *); void db_vnode_print_cmd(db_expr_t, int, db_expr_t, char *); void db_nfsreq_print_cmd(db_expr_t, int, db_expr_t, char *); void db_nfsnode_print_cmd(db_expr_t, int, db_expr_t, char *); +void db_swap_print_cmd(db_expr_t, int, db_expr_t, char *); void db_help_cmd(db_expr_t, int, db_expr_t, char *); void db_fncall(db_expr_t, int, db_expr_t, char *); void db_boot_sync_cmd(db_expr_t, int, db_expr_t, char *); @@ -507,6 +508,13 @@ db_nfsnode_print_cmd(db_expr_t addr, int have_addr, db_expr_t count, } #endif +/*ARGSUSED*/ +void +db_swap_print_cmd(db_expr_t addr, int have_addr, db_expr_t count, + char *modif) +{ + swap_print_all(db_printf); +} /*ARGSUSED*/ void @@ -633,6 +641,7 @@ const struct db_command db_show_cmds[] = { { "route", db_show_route, 0, NULL }, { "socket", db_socket_print_cmd, 0, NULL }, { "struct", db_ctf_show_struct, CS_OWN, NULL }, + { "swap", db_swap_print_cmd, 0, NULL }, #ifdef IPSEC { "tdb", db_tdb_print_cmd, 0, NULL }, #endif diff --git a/sys/ddb/db_interface.h b/sys/ddb/db_interface.h index e66dc8c9eda..50a337dc9f7 100644 --- a/sys/ddb/db_interface.h +++ b/sys/ddb/db_interface.h @@ -1,4 +1,4 @@ -/* $OpenBSD: db_interface.h,v 1.23 2022/07/28 22:19:09 bluhm Exp $ */ +/* $OpenBSD: db_interface.h,v 1.24 2022/07/29 17:47:11 semarie Exp $ */ /* $NetBSD: db_interface.h,v 1.1 1996/02/05 01:57:03 christos Exp $ */ /* @@ -72,6 +72,9 @@ void nfs_request_print(void *, int, int (*)(const char *, ...)); void db_show_all_nfsnodes(db_expr_t, int, db_expr_t, char *); void nfs_node_print(void *, int, int (*)(const char *, ...)); +/* uvm/uvm_swap.c */ +void swap_print_all(int (*)(const char *, ...)); + /* ufs/ffs/ffs_softdep.c */ struct worklist; void worklist_print(struct worklist *, int, int (*)(const char *, ...)); diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c new file mode 100644 index 00000000000..4cf6259417d --- /dev/null +++ b/sys/kern/subr_blist.c @@ -0,0 +1,1284 @@ +/* $OpenBSD: subr_blist.c,v 1.1 2022/07/29 17:47:12 semarie Exp $ */ +/* + * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting + * + * Copyright (c) 1998,2004 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *
+ * This module implements a general bitmap allocator/deallocator.  The
+ * allocator eats around 2 bits per 'block'.  The module does not
+ * try to interpret the meaning of a 'block' other than to return
+ * SWAPBLK_NONE on an allocation failure.
+ *
+ * A radix tree is used to maintain the bitmap.  Two radix constants are
+ * involved: one for the bitmaps contained in the leaf nodes (typically
+ * 32), and one for the meta nodes (typically 16).  Both meta and leaf
+ * nodes have a hint field.  This field gives us a hint as to the largest
+ * free contiguous range of blocks under the node.  It may contain a
+ * value that is too high, but will never contain a value that is too
+ * low.  When the radix tree is searched, allocation failures in subtrees
+ * update the hint.
+ *
+ * The radix tree also implements two collapsed states for meta nodes:
+ * the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
+ * in either of these two states, all information contained underneath
+ * the node is considered stale.  These states are used to optimize
+ * allocation and freeing operations.
+ *
+ * The hinting greatly increases code efficiency for allocations while
+ * the general radix structure optimizes both allocations and frees.  The
+ * radix tree should be able to operate well no matter how much
+ * fragmentation there is and no matter how large a bitmap is used.
+ *
+ * Unlike the rlist code, the blist code wires all necessary memory at
+ * creation time.  Neither allocations nor frees require interaction with
+ * the memory subsystem.  In contrast, the rlist code may allocate memory
+ * on a blist_free() call.  The non-blocking features of the blist code
+ * are used to great advantage in the swap code (uvm/uvm_swap.c).  The
+ * rlist code uses a little less overall memory than the blist code (but
+ * due to swap interleaving not all that much less), but the blist code
+ * scales much, much better.
+ *
+ * LAYOUT: The radix tree is laid out recursively using a
+ * linear array.  Each meta node is immediately followed (laid out
+ * sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
+ * is a recursive structure but one that can be easily scanned through
+ * a very simple 'skip' calculation.  In order to support large radixes,
+ * portions of the tree may reside outside our memory allocation.  We
+ * handle this with an early-termination optimization (when bighint is
+ * set to -1) on the scan.  The memory allocation is only large enough
+ * to cover the number of blocks requested at creation time even if it
+ * must be encompassed in a larger root-node radix.
+ *
+ * NOTE: The allocator cannot currently allocate more than
+ * BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
+ * large' if you try.  This is an area that could use improvement.  The
+ * radix is large enough that this restriction does not affect the swap
+ * system, though.  Currently only the allocation code is affected by
+ * this algorithmic unfeature.  The freeing code can handle arbitrary
+ * ranges.
+ *
+ * NOTE: The radix may exceed BLIST_BMAP_RADIX bits in order to support
+ * up to 2^(BLIST_BMAP_RADIX-1) blocks.  The first division will
+ * drop the radix down and fit it within a signed BLIST_BMAP_RADIX
+ * bit integer.
+ *
+ * This code can be compiled stand-alone for debugging.
+ */
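To make the 'skip' layout concrete before diving into the code, here is a small stand-alone sketch (editorial, not part of the patch) of the same radix/skip calculation that blist_create() performs below. With a 64-bit bsbmp_t, BLIST_BMAP_RADIX is 64 and BLIST_META_RADIX is 32, so the 1024 blocks used by test-1 end up under a single meta node of radix 2048 with skip 32:

#include <stdio.h>

#define BLIST_BMAP_RADIX	64	/* bits in a 64-bit bsbmp_t leaf */
#define BLIST_META_RADIX	32	/* subtrees under each meta node */

int
main(void)
{
	unsigned long blocks = 1024;	/* same size as test-1 */
	unsigned long radix = BLIST_BMAP_RADIX;
	unsigned long skip = 0;

	/* grow the root radix until it covers the requested blocks */
	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
		skip = (skip + 1) * BLIST_META_RADIX;
	}

	/* prints "radix 2048 skip 32", matching test-1.out's top-radix */
	printf("radix %lu skip %lu\n", radix, skip);
	return 0;
}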
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/blist.h>
+#include <sys/malloc.h>
+
+#else
+
+#ifndef BLIST_NO_DEBUG
+#define BLIST_DEBUG
+#endif
+
+#include <sys/types.h>
+#include <assert.h>
+#include <err.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define malloc(s,t,f)		calloc(1, s)
+#define mallocarray(n,s,t,f)	reallocarray(NULL, n, s)
+#define free(p,t,s)		free(p)
+#define KASSERT(exp)		assert(exp)
+
+#include "../sys/blist.h"
+
+#define panic(...)	do { errx(1, __VA_ARGS__); } while (0)
+
+#endif
+
+/*
+ * static support functions
+ */
+
+static bsblk_t blst_leaf_alloc(blmeta_t *scan, bsblk_t blkat,
+				bsblk_t blk, bsblk_t count);
+static bsblk_t blst_meta_alloc(blmeta_t *scan, bsblk_t blkat,
+				bsblk_t blk, bsblk_t count,
+				bsblk_t radix, bsblk_t skip);
+static void blst_leaf_free(blmeta_t *scan, bsblk_t relblk, bsblk_t count);
+static void blst_meta_free(blmeta_t *scan, bsblk_t freeBlk, bsblk_t count,
+				bsblk_t radix, bsblk_t skip,
+				bsblk_t blk);
+static bsblk_t blst_leaf_fill(blmeta_t *scan, bsblk_t blk, bsblk_t count);
+static bsblk_t blst_meta_fill(blmeta_t *scan, bsblk_t fillBlk, bsblk_t count,
+				bsblk_t radix, bsblk_t skip,
+				bsblk_t blk);
+static void blst_copy(blmeta_t *scan, bsblk_t blk, bsblk_t radix,
+				bsblk_t skip, blist_t dest, bsblk_t count);
+static bsblk_t blst_radix_init(blmeta_t *scan, bsblk_t radix,
+				bsblk_t skip, bsblk_t count);
+static int blst_radix_gapfind(blmeta_t *scan, bsblk_t blk, bsblk_t radix, bsblk_t skip,
+				int state, bsblk_t *maxbp, bsblk_t *maxep, bsblk_t *bp, bsblk_t *ep);
+
+#if defined(BLIST_DEBUG) || defined(DDB)
+static void	blst_radix_print(blmeta_t *scan, bsblk_t blk,
+				bsblk_t radix, bsblk_t skip, int tab);
+#endif
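As a usage reference before the implementation, the following sketch (editorial; it assumes the stand-alone build that the regress Makefile uses, hence the relative include, and the helper name is made up) walks a blist through the life cycle described in the header comment: everything starts reserved, so blist_free() must open the range before blist_alloc() can succeed.

#include "../sys/blist.h"

static void
blist_lifecycle_sketch(void)
{
	blist_t bl;
	bsblk_t blk, b, e;

	bl = blist_create(1024);	/* all 1024 blocks start reserved */
	blist_free(bl, 0, 1024);	/* make them available */

	blk = blist_alloc(bl, 64);	/* contiguous run of 64 blocks */
	if (blk != SWAPBLK_NONE)
		blist_free(bl, blk, 64);	/* return the run */

	(void)blist_fill(bl, 1023, 1);	/* hard-reserve the last block */
	blist_gapfind(bl, &b, &e);	/* largest free range, as [b,e[ */

	blist_destroy(bl);
}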
+
+/*
+ * blist_create() - create a blist capable of handling up to the specified
+ *		    number of blocks
+ *
+ *	blocks must be greater than 0
+ *
+ *	The smallest blist consists of a single leaf node capable of
+ *	managing BLIST_BMAP_RADIX blocks.
+ *
+ *	The pages are addressable in range [0, nblocks[
+ */
+
+blist_t
+blist_create(bsblk_t blocks)
+{
+	blist_t bl;
+	bsblk_t radix;
+	bsblk_t skip = 0;
+
+	KASSERT(blocks > 0);
+
+	/*
+	 * Calculate radix and skip field used for scanning.
+	 *
+	 * Radix can exceed BLIST_BMAP_RADIX bits even if bsblk_t is limited
+	 * to BLIST_BMAP_RADIX bits.
+	 *
+	 * XXX check overflow
+	 */
+	radix = BLIST_BMAP_RADIX;
+
+	while (radix < blocks) {
+		radix *= BLIST_META_RADIX;
+		skip = (skip + 1) * BLIST_META_RADIX;
+		KASSERT(skip > 0);
+	}
+
+	bl = malloc(sizeof(struct blist), M_VMSWAP, M_WAITOK | M_ZERO);
+
+	bl->bl_blocks = blocks;
+	bl->bl_radix = radix;
+	bl->bl_skip = skip;
+	bl->bl_rootblks = 1 +
+	    blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
+	bl->bl_root = mallocarray(bl->bl_rootblks, sizeof(blmeta_t),
+	    M_VMSWAP, M_WAITOK);
+
+#if defined(BLIST_DEBUG)
+	printf(
+		"BLIST representing %lu blocks (%lu MB of swap)"
+		", requiring %6.2fM of ram\n",
+		bl->bl_blocks,
+		bl->bl_blocks * 4 / 1024,
+		(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / (1024.0 * 1024.0)
+	);
+	printf("BLIST raw radix tree: %lu records, top-radix %lu\n",
+	    bl->bl_rootblks, bl->bl_radix);
+#endif
+	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
+
+	return(bl);
+}
+
+void
+blist_destroy(blist_t bl)
+{
+	KASSERT(bl != NULL);
+
+	free(bl->bl_root, M_VMSWAP, sizeof(blmeta_t) * bl->bl_rootblks);
+	free(bl, M_VMSWAP, sizeof(struct blist));
+}
+
+/*
+ * blist_alloc() -	reserve space in the block bitmap.  Return the base
+ *			of a contiguous region or SWAPBLK_NONE if space could
+ *			not be allocated.
+ */
+
+bsblk_t
+blist_alloc(blist_t bl, bsblk_t count)
+{
+	bsblk_t blk = SWAPBLK_NONE;
+
+	if (bl) {
+		if (bl->bl_radix == BLIST_BMAP_RADIX)
+			blk = blst_leaf_alloc(bl->bl_root, 0, 0, count);
+		else
+			blk = blst_meta_alloc(bl->bl_root, 0, 0, count,
+			    bl->bl_radix, bl->bl_skip);
+		if (blk != SWAPBLK_NONE)
+			bl->bl_free -= count;
+	}
+	return(blk);
+}
+
+bsblk_t
+blist_allocat(blist_t bl, bsblk_t count, bsblk_t blkat)
+{
+	bsblk_t blk = SWAPBLK_NONE;
+
+	if (bl) {
+		KASSERT(blkat < bl->bl_blocks);
+		KASSERT(blkat + count <= bl->bl_blocks);
+
+		if (bl->bl_radix == BLIST_BMAP_RADIX)
+			blk = blst_leaf_alloc(bl->bl_root, blkat, 0, count);
+		else
+			blk = blst_meta_alloc(bl->bl_root, blkat, 0, count,
+			    bl->bl_radix, bl->bl_skip);
+		if (blk != SWAPBLK_NONE)
+			bl->bl_free -= count;
+	}
+	return(blk);
+}
+
+/*
+ * blist_free() -	free up space in the block bitmap.  Panic if an
+ *			inconsistency is found.
+ */
+
+void
+blist_free(blist_t bl, bsblk_t blkno, bsblk_t count)
+{
+	if (bl) {
+		KASSERT(blkno < bl->bl_blocks);
+		KASSERT(blkno + count <= bl->bl_blocks);
+
+		if (bl->bl_radix == BLIST_BMAP_RADIX)
+			blst_leaf_free(bl->bl_root, blkno, count);
+		else
+			blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
+		bl->bl_free += count;
+	}
+}
+
+/*
+ * blist_fill() -	mark a region in the block bitmap as off-limits
+ *			to the allocator (i.e. allocate it), ignoring any
+ *			existing allocations.  Return the number of blocks
+ *			actually filled that were free before the call.
+ */
+
+bsblk_t
+blist_fill(blist_t bl, bsblk_t blkno, bsblk_t count)
+{
+	bsblk_t filled;
+
+	if (bl) {
+		KASSERT(blkno < bl->bl_blocks);
+		KASSERT(blkno + count <= bl->bl_blocks);
+
+		if (bl->bl_radix == BLIST_BMAP_RADIX) {
+			filled = blst_leaf_fill(bl->bl_root, blkno, count);
+		} else {
+			filled = blst_meta_fill(bl->bl_root, blkno, count,
+			    bl->bl_radix, bl->bl_skip, 0);
+		}
+		bl->bl_free -= filled;
+		return (filled);
+	} else {
+		return 0;
+	}
+}
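The return value of blist_fill() is easy to misread; this editorial sketch (stand-alone build assumed, helper name made up) shows it counting only the blocks that were still free when the fill covered them:

#include "../sys/blist.h"

static void
blist_fill_sketch(void)
{
	blist_t bl = blist_create(64);
	bsblk_t blk, filled;

	blist_free(bl, 0, 64);		/* open all 64 blocks */
	blk = blist_alloc(bl, 8);	/* first fit: allocates [0,8) */

	/*
	 * Fill [4,12): blocks 4-7 are already allocated, blocks 8-11
	 * were free, so only those 4 count against bl_free.
	 */
	filled = blist_fill(bl, 4, 8);	/* filled == 4 */

	blist_destroy(bl);
	(void)blk; (void)filled;
}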
+
+/*
+ * blist_resize() -	resize an existing radix tree to handle the
+ *			specified number of blocks.  This will reallocate
+ *			the tree and transfer the previous bitmap to the new
+ *			one.  When extending the tree you can specify whether
+ *			the new blocks are to be left allocated or freed.
+ */
+
+void
+blist_resize(blist_t *pbl, bsblk_t count, int freenew)
+{
+	blist_t newbl = blist_create(count);
+	blist_t save = *pbl;
+
+	*pbl = newbl;
+	if (count > save->bl_blocks)
+		count = save->bl_blocks;
+	blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
+
+	/*
+	 * If resizing upwards, should we free the new space or not?
+	 */
+	if (freenew && count < newbl->bl_blocks) {
+		blist_free(newbl, count, newbl->bl_blocks - count);
+	}
+	blist_destroy(save);
+}
+
+#define GAPFIND_FIRSTFREE 0
+#define GAPFIND_FIRSTUSED 1
+
+/*
+ * blist_gapfind() -	return the largest gap (free pages) in blist.
+ *			the blist isn't modified. the returned range
+ *			is [maxbp, maxep[ . The size of the gap is
+ *			maxep - maxbp. If not found, the size is 0.
+ */
+
+void
+blist_gapfind(blist_t bl, bsblk_t *maxbp, bsblk_t *maxep)
+{
+	int state;
+	bsblk_t b, e;
+
+	/* initialize gaps (max and current) */
+	*maxbp = *maxep = 0;
+	b = e = 0;
+
+	/* search the largest gap from block 0 */
+	state = blst_radix_gapfind(bl->bl_root, 0, bl->bl_radix, bl->bl_skip,
+	    GAPFIND_FIRSTFREE, maxbp, maxep, &b, &e);
+
+	if (state == GAPFIND_FIRSTUSED) {
+		e = bl->bl_blocks;
+		if (*maxep - *maxbp < e - b) {
+			*maxbp = b;
+			*maxep = e;
+		}
+	}
+
+	KASSERT(*maxbp <= *maxep);
+	KASSERT(*maxbp <  bl->bl_blocks);
+	KASSERT(*maxep <= bl->bl_blocks);
+}
+
+/*
+ * blst_radix_gapfind -	search the largest gap in one pass
+ *
+ * - search first free block, from X -> set B
+ * - search first used block, from B -> set E
+ * - if the size (E - B) is larger than max, update it
+ * - loop (with X=E) until end of blist
+ * - max is the largest free gap
+ */
+static int
+blst_radix_gapfind(blmeta_t *scan, bsblk_t blk, bsblk_t radix, bsblk_t skip,
+    int state, bsblk_t *maxbp, bsblk_t *maxep, bsblk_t *bp, bsblk_t *ep)
+{
+	bsblk_t i;
+	bsblk_t next_skip;
+
+	if (radix == BLIST_BMAP_RADIX) {
+		/* leaf node: we consider only a completely free bitmap as free */
+		if (state == GAPFIND_FIRSTFREE) {
+			if (scan->u.bmu_bitmap == (bsbmp_t)-1) {
+				/* node is fully free */
+				*bp = blk;
+				return GAPFIND_FIRSTUSED;
+			}
+
+			/* it isn't fully free, not found, keep state */
+			return state;
+
+		} else if (state == GAPFIND_FIRSTUSED) {
+			if (scan->u.bmu_bitmap == (bsbmp_t)-1) {
+				/* it is free, not found, keep state */
+				return state;
+			}
+
+			/* it is (at least partially) used */
+			*ep = blk;
+			if (*maxep - *maxbp < *ep - *bp) {
+				*maxbp = *bp;
+				*maxep = *ep;
+			}
+			return GAPFIND_FIRSTFREE;
+		}
+	}
+
+	if (scan->u.bmu_avail == 0) {
+		/* ALL-ALLOCATED */
+		if (state == GAPFIND_FIRSTFREE) {
+			/* searching free block, not found, keep state */
+			return state;
+
+		} else if (state == GAPFIND_FIRSTUSED) {
+			/* searching used block, found */
+			*ep = blk;
+			if (*maxep - *maxbp < *ep - *bp) {
+				*maxbp = *bp;
+				*maxep = *ep;
+			}
+			return GAPFIND_FIRSTFREE;
+		}
+	}
+
+	if (scan->u.bmu_avail == radix) {
+		/* ALL-FREE */
+		if (state == GAPFIND_FIRSTFREE) {
+			/* searching free block, found */
+			*bp = blk;
+			return GAPFIND_FIRSTUSED;
+
+		} else if (state == GAPFIND_FIRSTUSED) {
+			/* searching used block, not found, keep state */
+			return state;
+		}
+	}
+
+	radix /= BLIST_META_RADIX;
+	next_skip = (skip / BLIST_META_RADIX);
+
+	for (i = 1; i <= skip; i += next_skip) {
+		if (scan[i].bm_bighint == (bsblk_t)-1)
+			/* Terminator */
+			break;
+
+		state = blst_radix_gapfind(&scan[i], blk, radix, next_skip - 1,
+		    state, maxbp, maxep, bp, ep);
+
+		blk += radix;
+	}
+
+	return state;
+}
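The state machine above is exercised directly by test-5: with 1024 blocks and a 64-block allocation pinned at 0x300, the largest all-free run is [0x000,0x300). An equivalent stand-alone sketch (editorial, not part of the patch; build assumptions as before):

#include "../sys/blist.h"

static void
blist_gapfind_sketch(void)
{
	blist_t bl = blist_create(1024);
	bsblk_t b, e;

	blist_free(bl, 0, 1024);
	(void)blist_allocat(bl, 64, 0x300);	/* pin [0x300,0x340) */

	/*
	 * test-5.out reports "gapfind: begin=0000 end=0300 size=768".
	 * The trailing free range [0x340,0x400) is smaller, so the
	 * one-pass scan keeps the 768-block gap.
	 */
	blist_gapfind(bl, &b, &e);	/* b == 0x000, e == 0x300 */

	blist_destroy(bl);
}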
+
+#if defined(BLIST_DEBUG) || defined(DDB)
+
+/*
+ * blist_print()    - dump radix tree
+ */
+
+void
+blist_print(blist_t bl)
+{
+	printf("BLIST {\n");
+	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
+	printf("}\n");
+}
+
+#endif
+
+/************************************************************************
+ *			ALLOCATION SUPPORT FUNCTIONS			*
+ ************************************************************************
+ *
+ *	These support functions do all the actual work.  They may seem
+ *	rather longish, but that's because I've commented them up.  The
+ *	actual code is straightforward.
+ *
+ */
+
+/*
+ * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
+ *
+ *	This is the core of the allocator and is optimized for the 1 block
+ *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
+ *	somewhat slower.  The 1 block allocation case is log2 and extremely
+ *	quick.
+ */
+
+static bsblk_t
+blst_leaf_alloc(blmeta_t *scan, bsblk_t blkat __unused, bsblk_t blk,
+		bsblk_t count)
+{
+	bsbmp_t orig = scan->u.bmu_bitmap;
+
+	if (orig == 0) {
+		/*
+		 * Optimize bitmap all-allocated case.  Also, count = 1
+		 * case assumes at least 1 bit is free in the bitmap, so
+		 * we have to take care of this case here.
+		 */
+		scan->bm_bighint = 0;
+		return(SWAPBLK_NONE);
+	}
+	if (count == 1) {
+		/*
+		 * Optimized code to allocate one bit out of the bitmap
+		 */
+		bsbmp_t mask;
+		int j = BLIST_BMAP_RADIX/2;
+		int r = 0;
+
+		mask = (bsbmp_t)-1 >> (BLIST_BMAP_RADIX/2);
+
+		while (j) {
+			if ((orig & mask) == 0) {
+				r += j;
+				orig >>= j;
+			}
+			j >>= 1;
+			mask >>= j;
+		}
+		scan->u.bmu_bitmap &= ~((bsbmp_t)1 << r);
+		return(blk + r);
+	}
+	if (count <= BLIST_BMAP_RADIX) {
+		/*
+		 * non-optimized code to allocate N bits out of the bitmap.
+		 * The more bits, the faster the code runs.  It will run
+		 * the slowest allocating 2 bits, but since there aren't any
+		 * memory ops in the core loop (or shouldn't be, anyway),
+		 * you probably won't notice the difference.
+		 */
+		int j;
+		int n = (int)(BLIST_BMAP_RADIX - count);
+		bsbmp_t mask;
+
+		mask = (bsbmp_t)-1 >> n;
+
+		for (j = 0; j <= n; ++j) {
+			if ((orig & mask) == mask) {
+				scan->u.bmu_bitmap &= ~mask;
+				return(blk + j);
+			}
+			mask = (mask << 1);
+		}
+	}
+
+	/*
+	 * We couldn't allocate count in this subtree, update bighint.
+	 */
+	scan->bm_bighint = count - 1;
+
+	return(SWAPBLK_NONE);
+}
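The count == 1 path above deserves a worked example. This editorial sketch lifts the same mask-halving search into a stand-alone function: each round tests the low half of the remaining window and, if it is empty, records the jump and shifts the window down; after log2(64) rounds, r is the index of the lowest set bit.

#include <stdio.h>

typedef unsigned long bsbmp_t;
#define BLIST_BMAP_RADIX	64

static int
lowest_set_bit(bsbmp_t orig)
{
	bsbmp_t mask = (bsbmp_t)-1 >> (BLIST_BMAP_RADIX / 2);
	int j = BLIST_BMAP_RADIX / 2;
	int r = 0;

	while (j) {
		if ((orig & mask) == 0) {	/* low half empty */
			r += j;			/* free bit is higher up */
			orig >>= j;
		}
		j >>= 1;
		mask >>= j;		/* narrow the search window */
	}
	return r;
}

int
main(void)
{
	printf("%d\n", lowest_set_bit(0x10));	/* prints 4 */
	return 0;
}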
+
+/*
+ * blst_meta_alloc() -	allocate at a meta in the radix tree.
+ *
+ *	Attempt to allocate at a meta node.  If we can't, we update
+ *	bighint and return a failure.  Updating bighint optimizes future
+ *	calls that hit this node.  We have to check for our collapse cases
+ *	and we have a few optimizations strewn in as well.
+ */
+static bsblk_t
+blst_meta_alloc(blmeta_t *scan, bsblk_t blkat,
+		bsblk_t blk, bsblk_t count,
+		bsblk_t radix, bsblk_t skip)
+{
+	int hintok = (blk >= blkat);
+	bsblk_t next_skip = ((bsblk_t)skip / BLIST_META_RADIX);
+	bsblk_t i;
+
+#ifndef _KERNEL
+	printf("blist_meta_alloc blkat %lu blk %lu count %lu radix %lu\n",
+	    blkat, blk, count, radix);
+#endif
+
+	/*
+	 * ALL-ALLOCATED special case
+	 */
+	if (scan->u.bmu_avail == 0) {
+		scan->bm_bighint = 0;
+		return(SWAPBLK_NONE);
+	}
+
+	/*
+	 * ALL-FREE special case, initialize uninitialized
+	 * sublevel.
+	 *
+	 * NOTE: radix may exceed 32 bits until first division.
+	 */
+	if (scan->u.bmu_avail == radix) {
+		scan->bm_bighint = radix;
+
+		radix /= BLIST_META_RADIX;
+		for (i = 1; i <= skip; i += next_skip) {
+			if (scan[i].bm_bighint == (bsblk_t)-1)
+				break;
+			if (next_skip == 1) {
+				scan[i].u.bmu_bitmap = (bsbmp_t)-1;
+				scan[i].bm_bighint = BLIST_BMAP_RADIX;
+			} else {
+				scan[i].bm_bighint = (bsblk_t)radix;
+				scan[i].u.bmu_avail = (bsblk_t)radix;
+			}
+		}
+	} else {
+		radix /= BLIST_META_RADIX;
+	}
+
+	for (i = 1; i <= skip; i += next_skip) {
+		if (count <= scan[i].bm_bighint &&
+		    blk + (bsblk_t)radix > blkat) {
+			/*
+			 * count fits in object
+			 */
+			bsblk_t r;
+			if (next_skip == 1) {
+				r = blst_leaf_alloc(&scan[i], blkat,
+				    blk, count);
+			} else {
+				r = blst_meta_alloc(&scan[i], blkat,
+				    blk, count,
+				    radix, next_skip - 1);
+			}
+			if (r != SWAPBLK_NONE) {
+				scan->u.bmu_avail -= count;
+				if (scan->bm_bighint > scan->u.bmu_avail)
+					scan->bm_bighint = scan->u.bmu_avail;
+				return(r);
+			}
+			/* bighint was updated by recursion */
+		} else if (scan[i].bm_bighint == (bsblk_t)-1) {
+			/*
+			 * Terminator
+			 */
+			break;
+		} else if (count > (bsblk_t)radix) {
+			/*
+			 * count does not fit in object even if it were
+			 * completely free.
+			 */
+			panic("%s: allocation too large %lu/%lu",
+			    __func__, count, radix);
+		}
+		blk += (bsblk_t)radix;
+	}
+
+	/*
+	 * We couldn't allocate count in this subtree, update bighint.
+	 */
+	if (hintok && scan->bm_bighint >= count)
+		scan->bm_bighint = count - 1;
+	return(SWAPBLK_NONE);
+}
+
+/*
+ * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
+ */
+static void
+blst_leaf_free(blmeta_t *scan, bsblk_t blk, bsblk_t count)
+{
+	/*
+	 * free some data in this bitmap
+	 *
+	 * e.g.
+	 *	0000111111111110000
+	 *          \_________/\__/
+	 *		v        n
+	 */
+	int n = blk & (BLIST_BMAP_RADIX - 1);
+	bsbmp_t mask;
+
+	mask = ((bsbmp_t)-1 << n) &
+	    ((bsbmp_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+	if (scan->u.bmu_bitmap & mask)
+		panic("%s: freeing free block", __func__);
+	scan->u.bmu_bitmap |= mask;
+
+	/*
+	 * We could probably do a better job here.  We are required to make
+	 * bighint at least as large as the biggest contiguous block of
+	 * data.  If we just shoehorn it, a little extra overhead will
+	 * be incurred on the next allocation (but only that one typically).
+	 */
+	scan->bm_bighint = BLIST_BMAP_RADIX;
+}
+
+/*
+ * BLST_META_FREE() - free allocated blocks from radix tree meta info
+ *
+ *	This support routine frees a range of blocks from the bitmap.
+ *	The range must be entirely enclosed by this radix node.  If a
+ *	meta node, we break the range down recursively to free blocks
+ *	in subnodes (which means that this code can free an arbitrary
+ *	range whereas the allocation code cannot allocate an arbitrary
+ *	range).
+ */
+
+static void
+blst_meta_free(blmeta_t *scan, bsblk_t freeBlk, bsblk_t count,
+	       bsblk_t radix, bsblk_t skip, bsblk_t blk)
+{
+	bsblk_t i;
+	bsblk_t next_skip = ((bsblk_t)skip / BLIST_META_RADIX);
+
+#if 0
+	printf("FREE (%04lx,%lu) FROM (%04lx,%lu)\n",
+	    freeBlk, count,
+	    blk, radix
+	);
+#endif
+
+	/*
+	 * ALL-ALLOCATED special case, initialize for recursion.
+	 *
+	 * We will short-cut the ALL-ALLOCATED -> ALL-FREE case.
+ */ + if (scan->u.bmu_avail == 0) { + scan->u.bmu_avail = count; + scan->bm_bighint = count; + + if (count != radix) { + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (bsblk_t)-1) + break; + scan[i].bm_bighint = 0; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = 0; + } else { + scan[i].u.bmu_avail = 0; + } + } + /* fall through */ + } + } else { + scan->u.bmu_avail += count; + /* scan->bm_bighint = radix; */ + } + + /* + * ALL-FREE special case. + * + * Set bighint for higher levels to snoop. + */ + if (scan->u.bmu_avail == radix) { + scan->bm_bighint = radix; + return; + } + + /* + * Break the free down into its components + */ + if (scan->u.bmu_avail > radix) { + panic("%s: freeing already " + "free blocks (%lu) %lu/%lu", + __func__, count, (long)scan->u.bmu_avail, radix); + } + + radix /= BLIST_META_RADIX; + + i = (freeBlk - blk) / (bsblk_t)radix; + blk += i * (bsblk_t)radix; + i = i * next_skip + 1; + + while (i <= skip && blk < freeBlk + count) { + bsblk_t v; + + v = blk + (bsblk_t)radix - freeBlk; + if (v > count) + v = count; + + if (scan->bm_bighint == (bsblk_t)-1) + panic("%s: freeing unexpected range", __func__); + + if (next_skip == 1) { + blst_leaf_free(&scan[i], freeBlk, v); + } else { + blst_meta_free(&scan[i], freeBlk, v, + radix, next_skip - 1, blk); + } + + /* + * After having dealt with the becomes-all-free case any + * partial free will not be able to bring us to the + * becomes-all-free state. + * + * We can raise bighint to at least the sub-segment's + * bighint. + */ + if (scan->bm_bighint < scan[i].bm_bighint) { + scan->bm_bighint = scan[i].bm_bighint; + } + count -= v; + freeBlk += v; + blk += (bsblk_t)radix; + i += next_skip; + } +} + +/* + * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap + * + * Allocates all blocks in the specified range regardless of + * any existing allocations in that range. Returns the number + * of blocks allocated by the call. + */ +static bsblk_t +blst_leaf_fill(blmeta_t *scan, bsblk_t blk, bsblk_t count) +{ + int n = blk & (BLIST_BMAP_RADIX - 1); + bsblk_t nblks; + bsbmp_t mask, bitmap; + + mask = ((bsbmp_t)-1 << n) & + ((bsbmp_t)-1 >> (BLIST_BMAP_RADIX - count - n)); + + /* Count the number of blocks we're about to allocate */ + bitmap = scan->u.bmu_bitmap & mask; + for (nblks = 0; bitmap != 0; nblks++) + bitmap &= bitmap - 1; + + scan->u.bmu_bitmap &= ~mask; + return (nblks); +} + +/* + * BLST_META_FILL() - allocate specific blocks at a meta node + * + * Allocates the specified range of blocks, regardless of + * any existing allocations in the range. The range must + * be within the extent of this node. Returns the number + * of blocks allocated by the call. 
+ */ +static bsblk_t +blst_meta_fill(blmeta_t *scan, bsblk_t fillBlk, bsblk_t count, + bsblk_t radix, bsblk_t skip, bsblk_t blk) +{ + bsblk_t i; + bsblk_t next_skip = ((bsblk_t)skip / BLIST_META_RADIX); + bsblk_t nblks = 0; + + if (count == radix || scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case + */ + nblks = scan->u.bmu_avail; + scan->u.bmu_avail = 0; + scan->bm_bighint = count; + return (nblks); + } + + if (scan->u.bmu_avail == radix) { + radix /= BLIST_META_RADIX; + + /* + * ALL-FREE special case, initialize sublevel + */ + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (bsblk_t)-1) + break; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = (bsbmp_t)-1; + scan[i].bm_bighint = BLIST_BMAP_RADIX; + } else { + scan[i].bm_bighint = (bsblk_t)radix; + scan[i].u.bmu_avail = (bsblk_t)radix; + } + } + } else { + radix /= BLIST_META_RADIX; + } + + if (count > (bsblk_t)radix) + panic("%s: allocation too large", __func__); + + i = (fillBlk - blk) / (bsblk_t)radix; + blk += i * (bsblk_t)radix; + i = i * next_skip + 1; + + while (i <= skip && blk < fillBlk + count) { + bsblk_t v; + + v = blk + (bsblk_t)radix - fillBlk; + if (v > count) + v = count; + + if (scan->bm_bighint == (bsblk_t)-1) + panic("%s: filling unexpected range", __func__); + + if (next_skip == 1) { + nblks += blst_leaf_fill(&scan[i], fillBlk, v); + } else { + nblks += blst_meta_fill(&scan[i], fillBlk, v, + radix, next_skip - 1, blk); + } + count -= v; + fillBlk += v; + blk += (bsblk_t)radix; + i += next_skip; + } + scan->u.bmu_avail -= nblks; + return (nblks); +} + +/* + * BLIST_RADIX_COPY() - copy one radix tree to another + * + * Locates free space in the source tree and frees it in the destination + * tree. The space may not already be free in the destination. + */ + +static void +blst_copy(blmeta_t *scan, bsblk_t blk, bsblk_t radix, + bsblk_t skip, blist_t dest, bsblk_t count) +{ + bsblk_t next_skip; + bsblk_t i; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + bsbmp_t v = scan->u.bmu_bitmap; + + if (v == (bsbmp_t)-1) { + blist_free(dest, blk, count); + } else if (v != 0) { + for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { + if (v & ((bsblk_t)1 << i)) + blist_free(dest, blk + i, 1); + } + } + return; + } + + /* + * Meta node + */ + + if (scan->u.bmu_avail == 0) { + /* + * Source all allocated, leave dest allocated + */ + return; + } + if (scan->u.bmu_avail == radix) { + /* + * Source all free, free entire dest + */ + if (count < radix) + blist_free(dest, blk, count); + else + blist_free(dest, blk, (bsblk_t)radix); + return; + } + + + radix /= BLIST_META_RADIX; + next_skip = ((bsbmp_t)skip / BLIST_META_RADIX); + + for (i = 1; count && i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (bsblk_t)-1) + break; + + if (count >= (bsblk_t)radix) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + (bsblk_t)radix + ); + count -= (bsblk_t)radix; + } else { + if (count) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + count + ); + } + count = 0; + } + blk += (bsblk_t)radix; + } +} + +/* + * BLST_RADIX_INIT() - initialize radix tree + * + * Initialize our meta structures and bitmaps and calculate the exact + * amount of space required to manage 'count' blocks - this space may + * be considerably less than the calculated radix due to the large + * RADIX values we use. 
+ */ + +static bsblk_t +blst_radix_init(blmeta_t *scan, bsblk_t radix, bsblk_t skip, bsblk_t count) +{ + bsblk_t i; + bsblk_t next_skip; + bsblk_t memindex = 0; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_bitmap = 0; + } + return(memindex); + } + + /* + * Meta node. If allocating the entire object we can special + * case it. However, we need to figure out how much memory + * is required to manage 'count' blocks, so we continue on anyway. + */ + + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_avail = 0; + } + + radix /= BLIST_META_RADIX; + next_skip = ((bsbmp_t)skip / BLIST_META_RADIX); + + for (i = 1; i <= skip; i += next_skip) { + if (count >= (bsblk_t)radix) { + /* + * Allocate the entire object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + (bsblk_t)radix + ); + count -= (bsblk_t)radix; + } else if (count > 0) { + /* + * Allocate a partial object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + count + ); + count = 0; + } else { + /* + * Add terminator and break out + */ + if (scan) + scan[i].bm_bighint = (bsblk_t)-1; + break; + } + } + if (memindex < i) + memindex = i; + return(memindex); +} + +#if defined(BLIST_DEBUG) || defined(DDB) + +static void +blst_radix_print(blmeta_t *scan, bsblk_t blk, bsblk_t radix, bsblk_t skip, int tab) +{ + bsblk_t i; + bsblk_t next_skip; + + if (radix == BLIST_BMAP_RADIX) { + printf( + "%*.*s(%04lx,%lu): bitmap %0*lx big=%lu\n", + tab, tab, "", + blk, radix, + (int)(1 + (BLIST_BMAP_RADIX - 1) / 4), + scan->u.bmu_bitmap, + scan->bm_bighint + ); + return; + } + + if (scan->u.bmu_avail == 0) { + printf( + "%*.*s(%04lx,%lu) ALL ALLOCATED\n", + tab, tab, "", + blk, + radix + ); + return; + } + if (scan->u.bmu_avail == radix) { + printf( + "%*.*s(%04lx,%lu) ALL FREE\n", + tab, tab, "", + blk, + radix + ); + return; + } + + printf( + "%*.*s(%04lx,%lu): subtree (%lu/%lu) big=%lu {\n", + tab, tab, "", + blk, radix, + scan->u.bmu_avail, + radix, + scan->bm_bighint + ); + + radix /= BLIST_META_RADIX; + next_skip = ((bsbmp_t)skip / BLIST_META_RADIX); + tab += 4; + + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (bsblk_t)-1) { + printf( + "%*.*s(%04lx,%lu): Terminator\n", + tab, tab, "", + blk, radix + ); + break; + } + blst_radix_print( + &scan[i], + blk, + radix, + next_skip - 1, + tab + ); + blk += (bsblk_t)radix; + } + tab -= 4; + + printf( + "%*.*s}\n", + tab, tab, "" + ); +} + +#endif + +#if !defined(_KERNEL) && defined(BLIST_DEBUG) + +int +main(int ac, char **av) +{ + bsblk_t size = 1024; + bsblk_t i; + blist_t bl; + + for (i = 1; i < (bsblk_t)ac; ++i) { + const char *ptr = av[i]; + if (*ptr != '-') { + size = strtol(ptr, NULL, 0); + continue; + } + ptr += 2; + fprintf(stderr, "Bad option: %s\n", ptr - 2); + exit(1); + } + bl = blist_create(size); + blist_free(bl, 0, size); + + for (;;) { + char buf[1024]; + bsblk_t da = 0; + bsblk_t count = 0; + bsblk_t blkat; + + + printf("%lu/%lu/%lu> ", + bl->bl_free, size, bl->bl_radix); + fflush(stdout); + if (fgets(buf, sizeof(buf), stdin) == NULL) + break; + switch(buf[0]) { + case '#': + continue; + case 'r': + if (sscanf(buf + 1, "%li", &count) == 1) { + blist_resize(&bl, count, 1); + size = count; + } else { + printf("?\n"); + } + case 'p': + blist_print(bl); + break; + case 'a': + if (sscanf(buf + 1, "%li %li", &count, &blkat) == 1) { + printf("count %lu\n", count); + bsblk_t blk = blist_alloc(bl, count); + printf(" 
R=%04lx\n", blk); + } else if (sscanf(buf + 1, "%li %li", &count, &blkat) == 2) { + bsblk_t blk = blist_allocat(bl, count, blkat); + printf(" R=%04lx\n", blk); + } else { + printf("?\n"); + } + break; + case 'f': + if (sscanf(buf + 1, "%li %li", &da, &count) == 2) { + blist_free(bl, da, count); + } else { + printf("?\n"); + } + break; + case 'g': { + bsblk_t b, e; + blist_gapfind(bl, &b, &e); + printf("gapfind: begin=%04lx end=%04lx size=%lu\n", + b, e, e-b); + break; + } + case 'l': + if (sscanf(buf + 1, "%li %li", &da, &count) == 2) { + printf(" n=%lu\n", + blist_fill(bl, da, count)); + } else { + printf("?\n"); + } + break; + case '?': + case 'h': + puts( + "p -print\n" + "a %li -allocate\n" + "f %li %li -free\n" + "l %li %li -fill\n" + "g -gapfind\n" + "r %li -resize\n" + "h/? -help\n" + " hex may be specified with 0x prefix\n" + ); + break; + default: + printf("?\n"); + break; + } + } + return(0); +} + +#endif diff --git a/sys/kern/subr_hibernate.c b/sys/kern/subr_hibernate.c index d1c967dcda2..780bd7f8da1 100644 --- a/sys/kern/subr_hibernate.c +++ b/sys/kern/subr_hibernate.c @@ -1,4 +1,4 @@ -/* $OpenBSD: subr_hibernate.c,v 1.134 2022/02/19 23:57:09 deraadt Exp $ */ +/* $OpenBSD: subr_hibernate.c,v 1.135 2022/07/29 17:47:12 semarie Exp $ */ /* * Copyright (c) 2011 Ariane van der Steldt @@ -1918,7 +1918,7 @@ hibernate_suspend(void) if (end - start < 1000) { printf("hibernate: insufficient swap (%lu is too small)\n", - end - start); + end - start + 1); return (1); } @@ -1926,7 +1926,7 @@ hibernate_suspend(void) hib.image_offset = ctod(start); DPRINTF("hibernate @ block %lld max-length %lu blocks\n", - hib.image_offset, ctod(end) - ctod(start)); + hib.image_offset, ctod(end) - ctod(start) + 1); pmap_activate(curproc); DPRINTF("hibernate: writing chunks\n"); diff --git a/sys/sys/blist.h b/sys/sys/blist.h new file mode 100644 index 00000000000..102ca95dd45 --- /dev/null +++ b/sys/sys/blist.h @@ -0,0 +1,136 @@ +/* $OpenBSD: blist.h,v 1.1 2022/07/29 17:47:12 semarie Exp $ */ +/* + * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Implements bitmap resource lists.
+ *
+ *	Usage:
+ *		blist = blist_create(blocks)
+ *		(void)  blist_destroy(blist)
+ *		blkno = blist_alloc(blist, count)
+ *		(void)  blist_free(blist, blkno, count)
+ *		nblks = blist_fill(blist, blkno, count)
+ *		(void)  blist_resize(&blist, count, freeextra)
+ *
+ *
+ *	Notes:
+ *		on creation, the entire list is marked reserved.  You should
+ *		first blist_free() the sections you want to make available
+ *		for allocation before doing general blist_alloc()/free()
+ *		ops.
+ *
+ *		SWAPBLK_NONE is returned on failure.  This module is typically
+ *		capable of managing up to (2^31) blocks per blist, though
+ *		the memory utilization would be insane if you actually did
+ *		that.  Managing something like 512MB worth of 4K blocks
+ *		eats around 32 KBytes of memory.
+ *
+ * $FreeBSD: src/sys/sys/blist.h,v 1.2.2.1 2003/01/12 09:23:12 dillon Exp $
+ */
+
+#ifndef _SYS_BLIST_H_
+#define _SYS_BLIST_H_
+
+#ifndef _SYS_TYPES_H_
+#include <sys/types.h>
+#endif
+
+#define SWBLK_BITS 64
+typedef u_long bsbmp_t;
+typedef u_long bsblk_t;
+
+/*
+ * note: currently use SWAPBLK_NONE as an absolute value rather than
+ * a flag bit.
+ */
+#define SWAPBLK_NONE	((bsblk_t)-1)
+
+/*
+ * blmeta and bsbmp_t MUST be a power of 2 in size.
+ */
+
+typedef struct blmeta {
+	union {
+		bsblk_t	bmu_avail;	/* space available under us	*/
+		bsbmp_t	bmu_bitmap;	/* bitmap if we are a leaf	*/
+	} u;
+	bsblk_t		bm_bighint;	/* biggest contiguous block hint*/
+} blmeta_t;
+
+typedef struct blist {
+	bsblk_t		bl_blocks;	/* area of coverage		*/
+	/* XXX int64_t bl_radix */
+	bsblk_t		bl_radix;	/* coverage radix		*/
+	bsblk_t		bl_skip;	/* starting skip		*/
+	bsblk_t		bl_free;	/* number of free blocks	*/
+	blmeta_t	*bl_root;	/* root of radix tree		*/
+	bsblk_t		bl_rootblks;	/* bsblk_t blks allocated for tree */
+} *blist_t;
+
+#define BLIST_META_RADIX	(sizeof(bsbmp_t)*8/2)	/* 2 bits per */
+#define BLIST_BMAP_RADIX	(sizeof(bsbmp_t)*8)	/* 1 bit per */
+
+/*
+ * The radix may exceed the size of a 64 bit signed (or unsigned) int
+ * when the maximal number of blocks is allocated.  With a 32-bit bsblk_t
+ * this corresponds to ~1G x PAGE_SIZE = 4096GB.  The swap code usually
+ * divides this by 4, leaving us with a capability of up to four 1TB swap
+ * devices.
+ *
+ * With a 64-bit bsblk_t the limitation is some insane number.
+ *
+ * NOTE: For now I don't trust that we overflow-detect properly so we
+ * divide out to ensure that no overflow occurs.
+ */
+
+#if SWBLK_BITS == 64
+#define BLIST_MAXBLKS		(0x4000000000000000LL /		\
+				 (BLIST_BMAP_RADIX / BLIST_META_RADIX))
+#else
+#define BLIST_MAXBLKS		(0x40000000 /			\
+				 (BLIST_BMAP_RADIX / BLIST_META_RADIX))
+#endif
+
+#define BLIST_MAX_ALLOC		BLIST_BMAP_RADIX
+
+blist_t	blist_create(bsblk_t);
+void	blist_destroy(blist_t);
+bsblk_t	blist_alloc(blist_t, bsblk_t);
+bsblk_t	blist_allocat(blist_t, bsblk_t, bsblk_t);
+void	blist_free(blist_t, bsblk_t, bsblk_t);
+bsblk_t	blist_fill(blist_t, bsblk_t, bsblk_t);
+void	blist_print(blist_t);
+void	blist_resize(blist_t *, bsblk_t, int);
+void	blist_gapfind(blist_t, bsblk_t *, bsblk_t *);
+
+#endif	/* _SYS_BLIST_H_ */
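The "512MB worth of 4K blocks eats around 32 KBytes" note can be checked against the structure sizes above: a blmeta_t is 16 bytes on LP64 and each leaf covers BLIST_BMAP_RADIX = 64 blocks, i.e. 2 bits of leaf metadata per block, with the meta levels adding roughly another 1/32 on top. An editorial back-of-the-envelope sketch:

#include <stdio.h>

int
main(void)
{
	unsigned long blocks = 512UL * 1024 * 1024 / 4096;	/* 131072 */
	unsigned long leaves = blocks / 64;	/* one bitmap node per 64 */
	unsigned long bytes = leaves * 16;	/* sizeof(blmeta_t) on LP64 */

	/* prints "131072 blocks -> ~32 KB", before meta-node overhead */
	printf("%lu blocks -> ~%lu KB\n", blocks, bytes / 1024);
	return 0;
}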
diff --git a/sys/uvm/uvm_swap.c b/sys/uvm/uvm_swap.c
index bbba5b03b99..ad2437600ee 100644
--- a/sys/uvm/uvm_swap.c
+++ b/sys/uvm/uvm_swap.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: uvm_swap.c,v 1.161 2022/07/18 18:02:27 jca Exp $	*/
+/*	$OpenBSD: uvm_swap.c,v 1.162 2022/07/29 17:47:12 semarie Exp $	*/
 /*	$NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $	*/
 
 /*
@@ -43,6 +43,7 @@
 #include
 #include
 #include
+#include <sys/blist.h>
 #include
 #include
 #include
@@ -136,8 +137,7 @@ struct swapdev {
 	int			swd_npgbad;	/* #pages bad */
 	int			swd_drumoffset;	/* page0 offset in drum */
 	int			swd_drumsize;	/* #pages in drum */
-	struct extent		*swd_ex;	/* extent for this swapdev */
-	char			swd_exname[12];	/* name of extent above */
+	blist_t			swd_blist;	/* blist for this swapdev */
 	struct vnode		*swd_vp;	/* backing vnode */
 	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */
@@ -847,7 +847,6 @@ out:
 int
 swap_on(struct proc *p, struct swapdev *sdp)
 {
-	static int count = 0;	/* static */
 	struct vnode *vp;
 	int error, npages, nblocks, size;
 	long addr;
@@ -965,30 +964,20 @@ swap_on(struct proc *p, struct swapdev *sdp)
 	}
 
 	/*
-	 * now we need to allocate an extent to manage this swap device
+	 * now we need to allocate a blist to manage this swap device
 	 */
-	snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
-	    count++);
-
-	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
-	sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
-	    0, 0, EX_WAITOK);
-	/* allocate the `saved' region from the extent so it won't be used */
-	if (addr) {
-		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
-			panic("disklabel reserve");
-		/* XXX: is extent synchronized with swd_npginuse? */
-	}
+	sdp->swd_blist = blist_create(npages);
+	/* mark all except the `saved' region free. */
+	blist_free(sdp->swd_blist, addr, size);
+
 #ifdef HIBERNATE
 	/*
 	 * Lock down the last region of primary disk swap, in case
 	 * hibernate needs to place a signature there.
 	 */
 	if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3 ) {
-		if (extent_alloc_region(sdp->swd_ex,
-		    npages - 1 - 1, 1, EX_WAITOK))
+		if (blist_fill(sdp->swd_blist, npages - 1, 1) != 1)
 			panic("hibernate reserve");
-		/* XXX: is extent synchronized with swd_npginuse? */
 	}
 #endif
@@ -1073,7 +1062,7 @@ swap_off(struct proc *p, struct swapdev *sdp)
 	 */
 	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
 	    EX_WAITOK);
-	extent_destroy(sdp->swd_ex);
+	blist_destroy(sdp->swd_blist);
 	/* free sdp->swd_path ? */
 	free(sdp, M_VMSWAP, sizeof(*sdp));
 	return (0);
@@ -1425,7 +1414,6 @@ uvm_swap_alloc(int *nslots, boolean_t lessok)
 {
 	struct swapdev *sdp;
 	struct swappri *spp;
-	u_long	result;
 
 	/*
 	 * no swap devices configured yet?   definite failure.
@@ -1440,16 +1428,18 @@ uvm_swap_alloc(int *nslots, boolean_t lessok) ReTry: /* XXXMRG */ LIST_FOREACH(spp, &swap_priority, spi_swappri) { TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { + bsblk_t result; + /* if it's not enabled, then we can't swap from it */ if ((sdp->swd_flags & SWF_ENABLE) == 0) continue; if (sdp->swd_npginuse + *nslots > sdp->swd_npages) continue; - if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0, - EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT, - &result) != 0) { + result = blist_alloc(sdp->swd_blist, *nslots); + if (result == SWAPBLK_NONE) { continue; } + KASSERT(result < sdp->swd_drumsize); /* * successful allocation! now rotate the tailq. @@ -1466,7 +1456,8 @@ ReTry: /* XXXMRG */ /* XXXMRG: BEGIN HACK */ if (*nslots > 1 && lessok) { *nslots = 1; - goto ReTry; /* XXXMRG: ugh! extent should support this for us */ + /* XXXMRG: ugh! blist should support this for us */ + goto ReTry; } /* XXXMRG: END HACK */ @@ -1543,12 +1534,7 @@ uvm_swap_free(int startslot, int nslots) KASSERT(uvmexp.nswapdev >= 1); KASSERT(sdp != NULL); KASSERT(sdp->swd_npginuse >= nslots); - if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots, - EX_MALLOCOK|EX_NOWAIT) != 0) { - printf("warning: resource shortage: %d pages of swap lost\n", - nslots); - } - + blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); sdp->swd_npginuse -= nslots; uvmexp.swpginuse -= nslots; #ifdef UVM_SWAP_ENCRYPT @@ -1966,8 +1952,6 @@ uvm_hibswap(dev_t dev, u_long *sp, u_long *ep) { struct swapdev *sdp, *swd = NULL; struct swappri *spp; - struct extent_region *exr, *exrn; - u_long start = 0, end = 0, size = 0; /* no swap devices configured yet? */ if (uvmexp.nswapdev < 1 || dev != swdevt[0].sw_dev) @@ -1983,27 +1967,48 @@ uvm_hibswap(dev_t dev, u_long *sp, u_long *ep) if (swd == NULL || (swd->swd_flags & SWF_ENABLE) == 0) return (1); - LIST_FOREACH(exr, &swd->swd_ex->ex_regions, er_link) { - u_long gapstart, gapend, gapsize; - - gapstart = exr->er_end + 1; - exrn = LIST_NEXT(exr, er_link); - if (!exrn) - break; - gapend = exrn->er_start - 1; - gapsize = gapend - gapstart; - if (gapsize > size) { - start = gapstart; - end = gapend; - size = gapsize; - } - } + blist_gapfind(swd->swd_blist, sp, ep); - if (size) { - *sp = start; - *ep = end; - return (0); - } - return (1); + if (*ep - *sp == 0) + /* no gap found */ + return (1); + + /* + * blist_gapfind returns the gap as [sp,ep[ , + * whereas [sp,ep] is expected from uvm_hibswap(). + */ + *ep -= 1; + + return (0); } #endif /* HIBERNATE */ + +#ifdef DDB +void +swap_print_all(int (*pr)(const char *, ...)) +{ + struct swappri *spp; + struct swapdev *sdp; + + LIST_FOREACH(spp, &swap_priority, spi_swappri) { + TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { +#ifdef HIBERNATE + u_long bgap = 0, egap = 0; +#endif + + pr("swap %p path \"%s\" flags 0x%x\n", sdp, + sdp->swd_path, sdp->swd_flags); + + blist_print(sdp->swd_blist); + +#ifdef HIBERNATE + if (!uvm_hibswap(sdp->swd_dev, &bgap, &egap)) + pr("hibernate gap: [0x%lx, 0x%lx] size=%lu\n", + bgap, egap, (egap - bgap + 1)); + else + pr("hibernate gap: not found\n"); +#endif + } + } +} +#endif /* DDB */ -- 2.20.1