xref: /freebsd/sys/kern/subr_blist.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  A radix
40  *	constant defines the size of the bitmaps contained in a leaf node
41  *	and the number of descendents of each of the meta (interior) nodes.
42  *	Each subtree is associated with a range of blocks.  The root of any
43  *	subtree stores a hint field that defines an upper bound on the size
44  *	of the largest allocation that can begin in the associated block
45  *	range.  A hint is an upper bound on a potential allocation, but not
46  *	necessarily a tight upper bound.
47  *
48  *	The bitmap field in each node directs the search for available blocks.
49  *	For a leaf node, a bit is set if the corresponding block is free.  For a
50  *	meta node, a bit is set if the corresponding subtree contains a free
51  *	block somewhere within it.  The search at a meta node considers only
52  *	children of that node that represent a range that includes a free block.
53  *
54  * 	The hinting greatly increases code efficiency for allocations while
55  *	the general radix structure optimizes both allocations and frees.  The
56  *	radix tree should be able to operate well no matter how much
57  *	fragmentation there is and no matter how large a bitmap is used.
58  *
59  *	The blist code wires all necessary memory at creation time.  Neither
60  *	allocations nor frees require interaction with the memory subsystem.
61  *	The non-blocking nature of allocations and frees is required by swap
62  *	code (vm/swap_pager.c).
63  *
64  *	LAYOUT: The radix tree is laid out recursively using a linear array.
65  *	Each meta node is immediately followed (laid out sequentially in
66  *	memory) by BLIST_RADIX lower-level nodes.  This is a recursive
67  *	structure but one that can be easily scanned through a very simple
68  *	'skip' calculation.  The memory allocation is only large enough to
69  *	cover the number of blocks requested at creation time.  Nodes that
70  *	represent blocks beyond that limit, nodes that would never be read
71  *	or written, are not allocated, so that the last of the
72  *	BLIST_RADIX lower-level nodes of a some nodes may not be allocated.
73  *
74  *	NOTE: the allocator cannot currently allocate more than
75  *	BLIST_RADIX blocks per call.  It will panic with 'allocation too
76  *	large' if you try.  This is an area that could use improvement.  The
77  *	radix is large enough that this restriction does not effect the swap
78  *	system, though.  Currently only the allocation code is affected by
79  *	this algorithmic unfeature.  The freeing code can handle arbitrary
80  *	ranges.
81  *
82  *	This code can be compiled stand-alone for debugging.
83  */
84 
85 #include <sys/cdefs.h>
86 __FBSDID("$FreeBSD$");
87 
88 #ifdef _KERNEL
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/lock.h>
93 #include <sys/kernel.h>
94 #include <sys/blist.h>
95 #include <sys/malloc.h>
96 #include <sys/sbuf.h>
97 #include <sys/proc.h>
98 #include <sys/mutex.h>
99 
100 #else
101 
102 #ifndef BLIST_NO_DEBUG
103 #define BLIST_DEBUG
104 #endif
105 
106 #include <sys/errno.h>
107 #include <sys/types.h>
108 #include <sys/malloc.h>
109 #include <sys/sbuf.h>
110 #include <assert.h>
111 #include <stdio.h>
112 #include <string.h>
113 #include <stddef.h>
114 #include <stdlib.h>
115 #include <stdarg.h>
116 #include <stdbool.h>
117 
118 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
119 #define malloc(a,b,c)	calloc(a, 1)
120 #define free(a,b)	free(a)
121 #define ummin(a,b)	((a) < (b) ? (a) : (b))
122 #define imin(a,b)	((a) < (b) ? (a) : (b))
123 #define KASSERT(a,b)	assert(a)
124 
125 #include <sys/blist.h>
126 
127 #endif
128 
129 /*
130  * static support functions
131  */
132 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
133     int *count, int maxcount);
134 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
135     int maxcount, u_daddr_t radix);
136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
138 		    u_daddr_t radix);
139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
140 		    blist_t dest, daddr_t count);
141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
143 		    u_daddr_t radix);
144 #ifndef _KERNEL
145 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
146 		    int tab);
147 #endif
148 
149 #ifdef _KERNEL
150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
151 #endif
152 
153 #define	BLIST_MASK	(BLIST_RADIX - 1)
154 
155 /*
156  * For a subtree that can represent the state of up to 'radix' blocks, the
157  * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
158  * is short for BLIST_RADIX, then for a tree of height h with L=m**h
159  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
160  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
161  * in the 'meta' functions that process subtrees.  Since integer division
162  * discards remainders, we can express this computation as
163  * skip = (m * m**h) / (m - 1)
164  * skip = (m * (radix / m)) / (m - 1)
165  * skip = radix / (m - 1)
166  * so that simple integer division by a constant can safely be used for the
167  * calculation.
168  */
169 static inline daddr_t
170 radix_to_skip(daddr_t radix)
171 {
172 
173 	return (radix / BLIST_MASK);
174 }
175 
176 /*
177  * Provide a mask with count bits set, starting as position n.
178  */
179 static inline u_daddr_t
180 bitrange(int n, int count)
181 {
182 
183 	return (((u_daddr_t)-1 << n) &
184 	    ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
185 }
186 
187 /*
188  * Find the first bit set in a u_daddr_t.
189  */
190 static inline int
191 generic_bitpos(u_daddr_t mask)
192 {
193 	int hi, lo, mid;
194 
195 	lo = 0;
196 	hi = BLIST_RADIX;
197 	while (lo + 1 < hi) {
198 		mid = (lo + hi) >> 1;
199 		if (mask & bitrange(0, mid))
200 			hi = mid;
201 		else
202 			lo = mid;
203 	}
204 	return (lo);
205 }
206 
207 static inline int
208 bitpos(u_daddr_t mask)
209 {
210 
211 	switch (sizeof(mask)) {
212 #ifdef HAVE_INLINE_FFSLL
213 	case sizeof(long long):
214 		return (ffsll(mask) - 1);
215 #endif
216 #ifdef HAVE_INLINE_FFS
217 	case sizeof(int):
218 		return (ffs(mask) - 1);
219 #endif
220 	default:
221 		return (generic_bitpos(mask));
222 	}
223 }
224 
225 /*
226  * blist_create() - create a blist capable of handling up to the specified
227  *		    number of blocks
228  *
229  *	blocks - must be greater than 0
230  * 	flags  - malloc flags
231  *
232  *	The smallest blist consists of a single leaf node capable of
233  *	managing BLIST_RADIX blocks.
234  */
235 blist_t
236 blist_create(daddr_t blocks, int flags)
237 {
238 	blist_t bl;
239 	u_daddr_t nodes, radix;
240 
241 	KASSERT(blocks > 0, ("invalid block count"));
242 
243 	/*
244 	 * Calculate the radix and node count used for scanning.
245 	 */
246 	nodes = 1;
247 	for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0;
248 	    radix *= BLIST_RADIX)
249 		nodes += 1 + (blocks - 1) / BLIST_RADIX / radix;
250 
251 	/*
252 	 * Include a sentinel node to ensure that cross-leaf scans stay within
253 	 * the bounds of the allocation.
254 	 */
255 	if (blocks % BLIST_RADIX == 0)
256 		nodes++;
257 
258 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
259 	    M_ZERO);
260 	if (bl == NULL)
261 		return (NULL);
262 
263 	bl->bl_blocks = blocks;
264 	bl->bl_radix = radix;
265 
266 #if defined(BLIST_DEBUG)
267 	printf(
268 		"BLIST representing %lld blocks (%lld MB of swap)"
269 		", requiring %lldK of ram\n",
270 		(long long)bl->bl_blocks,
271 		(long long)bl->bl_blocks * 4 / 1024,
272 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
273 	);
274 	printf("BLIST raw radix tree contains %lld records\n",
275 	    (long long)nodes);
276 #endif
277 
278 	return (bl);
279 }
280 
281 void
282 blist_destroy(blist_t bl)
283 {
284 
285 	free(bl, M_SWAP);
286 }
287 
288 /*
289  * blist_alloc() -   reserve space in the block bitmap.  Return the base
290  *		     of a contiguous region or SWAPBLK_NONE if space could
291  *		     not be allocated.
292  */
293 daddr_t
294 blist_alloc(blist_t bl, int *count, int maxcount)
295 {
296 	daddr_t blk, cursor;
297 
298 	KASSERT(*count <= maxcount,
299 	    ("invalid parameters %d > %d", *count, maxcount));
300 	KASSERT(*count <= BLIST_MAX_ALLOC,
301 	    ("minimum allocation too large: %d", *count));
302 
303 	/*
304 	 * This loop iterates at most twice.  An allocation failure in the
305 	 * first iteration leads to a second iteration only if the cursor was
306 	 * non-zero.  When the cursor is zero, an allocation failure will
307 	 * stop further iterations.
308 	 */
309 	for (cursor = bl->bl_cursor;; cursor = 0) {
310 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
311 		    bl->bl_radix);
312 		if (blk != SWAPBLK_NONE) {
313 			bl->bl_avail -= *count;
314 			bl->bl_cursor = blk + *count;
315 			if (bl->bl_cursor == bl->bl_blocks)
316 				bl->bl_cursor = 0;
317 			return (blk);
318 		}
319 		if (cursor == 0)
320 			return (SWAPBLK_NONE);
321 	}
322 }
323 
324 /*
325  * blist_avail() -	return the number of free blocks.
326  */
327 daddr_t
328 blist_avail(blist_t bl)
329 {
330 
331 	return (bl->bl_avail);
332 }
333 
334 /*
335  * blist_free() -	free up space in the block bitmap.  Return the base
336  *		     	of a contiguous region.
337  */
338 void
339 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
340 {
341 
342 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
343 	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
344 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
345 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
346 	bl->bl_avail += count;
347 }
348 
349 /*
350  * blist_fill() -	mark a region in the block bitmap as off-limits
351  *			to the allocator (i.e. allocate it), ignoring any
352  *			existing allocations.  Return the number of blocks
353  *			actually filled that were free before the call.
354  */
355 daddr_t
356 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
357 {
358 	daddr_t filled;
359 
360 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
361 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
362 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
363 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
364 	bl->bl_avail -= filled;
365 	return (filled);
366 }
367 
368 /*
369  * blist_resize() -	resize an existing radix tree to handle the
370  *			specified number of blocks.  This will reallocate
371  *			the tree and transfer the previous bitmap to the new
372  *			one.  When extending the tree you can specify whether
373  *			the new blocks are to left allocated or freed.
374  */
375 void
376 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
377 {
378     blist_t newbl = blist_create(count, flags);
379     blist_t save = *pbl;
380 
381     *pbl = newbl;
382     if (count > save->bl_blocks)
383 	    count = save->bl_blocks;
384     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
385 
386     /*
387      * If resizing upwards, should we free the new space or not?
388      */
389     if (freenew && count < newbl->bl_blocks) {
390 	    blist_free(newbl, count, newbl->bl_blocks - count);
391     }
392     blist_destroy(save);
393 }
394 
395 #ifdef BLIST_DEBUG
396 
397 /*
398  * blist_print()    - dump radix tree
399  */
400 void
401 blist_print(blist_t bl)
402 {
403 	printf("BLIST avail = %jd, cursor = %08jx {\n",
404 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
405 
406 	if (bl->bl_root->bm_bitmap != 0)
407 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
408 	printf("}\n");
409 }
410 
411 #endif
412 
413 static const u_daddr_t fib[] = {
414 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
415 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
416 	514229, 832040, 1346269, 2178309, 3524578,
417 };
418 
419 /*
420  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
421  */
422 struct gap_stats {
423 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
424 	daddr_t	num;		/* number of gaps observed */
425 	daddr_t	max;		/* largest gap size */
426 	daddr_t	avg;		/* average gap size */
427 	daddr_t	err;		/* sum - num * avg */
428 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
429 	int	max_bucket;	/* last histo elt with nonzero val */
430 };
431 
432 /*
433  * gap_stats_counting()    - is the state 'counting 1 bits'?
434  *                           or 'skipping 0 bits'?
435  */
436 static inline bool
437 gap_stats_counting(const struct gap_stats *stats)
438 {
439 
440 	return (stats->start != SWAPBLK_NONE);
441 }
442 
443 /*
444  * init_gap_stats()    - initialize stats on gap sizes
445  */
446 static inline void
447 init_gap_stats(struct gap_stats *stats)
448 {
449 
450 	bzero(stats, sizeof(*stats));
451 	stats->start = SWAPBLK_NONE;
452 }
453 
454 /*
455  * update_gap_stats()    - update stats on gap sizes
456  */
457 static void
458 update_gap_stats(struct gap_stats *stats, daddr_t posn)
459 {
460 	daddr_t size;
461 	int hi, lo, mid;
462 
463 	if (!gap_stats_counting(stats)) {
464 		stats->start = posn;
465 		return;
466 	}
467 	size = posn - stats->start;
468 	stats->start = SWAPBLK_NONE;
469 	if (size > stats->max)
470 		stats->max = size;
471 
472 	/*
473 	 * Find the fibonacci range that contains size,
474 	 * expecting to find it in an early range.
475 	 */
476 	lo = 0;
477 	hi = 1;
478 	while (hi < nitems(fib) && fib[hi] <= size) {
479 		lo = hi;
480 		hi *= 2;
481 	}
482 	if (hi >= nitems(fib))
483 		hi = nitems(fib);
484 	while (lo + 1 != hi) {
485 		mid = (lo + hi) >> 1;
486 		if (fib[mid] <= size)
487 			lo = mid;
488 		else
489 			hi = mid;
490 	}
491 	stats->histo[lo]++;
492 	if (lo > stats->max_bucket)
493 		stats->max_bucket = lo;
494 	stats->err += size - stats->avg;
495 	stats->num++;
496 	stats->avg += stats->err / stats->num;
497 	stats->err %= stats->num;
498 }
499 
500 /*
501  * dump_gap_stats()    - print stats on gap sizes
502  */
503 static inline void
504 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
505 {
506 	int i;
507 
508 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
509 	    (intmax_t)stats->num);
510 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
511 	sbuf_printf(s, "average maximal free range size: %jd\n",
512 	    (intmax_t)stats->avg);
513 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
514 	sbuf_printf(s, "               count  |  size range\n");
515 	sbuf_printf(s, "               -----  |  ----------\n");
516 	for (i = 0; i < stats->max_bucket; i++) {
517 		if (stats->histo[i] != 0) {
518 			sbuf_printf(s, "%20jd  |  ",
519 			    (intmax_t)stats->histo[i]);
520 			if (fib[i] != fib[i + 1] - 1)
521 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
522 				    (intmax_t)fib[i + 1] - 1);
523 			else
524 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
525 		}
526 	}
527 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
528 	if (stats->histo[i] > 1)
529 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
530 		    (intmax_t)stats->max);
531 	else
532 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
533 }
534 
535 /*
536  * blist_stats()    - dump radix tree stats
537  */
538 void
539 blist_stats(blist_t bl, struct sbuf *s)
540 {
541 	struct gap_stats gstats;
542 	struct gap_stats *stats = &gstats;
543 	daddr_t i, nodes, radix;
544 	u_daddr_t diff, mask;
545 	int digit;
546 
547 	init_gap_stats(stats);
548 	nodes = 0;
549 	radix = bl->bl_radix;
550 	for (i = 0; i < bl->bl_blocks; ) {
551 		/*
552 		 * Check for skippable subtrees starting at i.
553 		 */
554 		while (radix != 1) {
555 			if (bl->bl_root[nodes].bm_bitmap == 0) {
556 				if (gap_stats_counting(stats))
557 					update_gap_stats(stats, i);
558 				break;
559 			}
560 
561 			/*
562 			 * Skip subtree root.
563 			 */
564 			nodes++;
565 			radix /= BLIST_RADIX;
566 		}
567 		if (radix == 1) {
568 			/*
569 			 * Scan leaf.
570 			 */
571 			mask = bl->bl_root[nodes].bm_bitmap;
572 			diff = mask ^ (mask << 1);
573 			if (gap_stats_counting(stats))
574 				diff ^= 1;
575 			while (diff != 0) {
576 				digit = bitpos(diff);
577 				update_gap_stats(stats, i + digit);
578 				diff ^= bitrange(digit, 1);
579 			}
580 		}
581 		nodes += radix_to_skip(radix * BLIST_RADIX);
582 		i += radix * BLIST_RADIX;
583 
584 		/*
585 		 * Find max size subtree starting at i.
586 		 */
587 		for (radix = 1;
588 		    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
589 		    radix *= BLIST_RADIX)
590 			;
591 	}
592 	update_gap_stats(stats, i);
593 	dump_gap_stats(stats, s);
594 }
595 
596 /************************************************************************
597  *			  ALLOCATION SUPPORT FUNCTIONS			*
598  ************************************************************************
599  *
600  *	These support functions do all the actual work.  They may seem
601  *	rather longish, but that's because I've commented them up.  The
602  *	actual code is straight forward.
603  *
604  */
605 
606 /*
607  * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
608  *
609  *	'scan' is a leaf node, and its first block is at address 'start'.  The
610  *	next leaf node could be adjacent, or several nodes away if the least
611  *	common ancestor of 'scan' and its neighbor is several levels up.  Use
612  *	addresses to determine how many meta-nodes lie between the leaves.  If
613  *	sequence of leaves starting with the next one has enough initial bits
614  *	set, clear them and clear the bits in the meta nodes on the path up to
615  *	the least common ancestor to mark any subtrees made completely empty.
616  */
617 static int
618 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
619 {
620 	u_daddr_t radix;
621 	daddr_t blk;
622 	int avail, digit;
623 
624 	start += BLIST_RADIX;
625 	for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
626 		/* Skip meta-nodes, as long as they promise more free blocks. */
627 		radix = BLIST_RADIX;
628 		while (((++scan)->bm_bitmap & 1) == 1 &&
629 		    ((blk / radix) & BLIST_MASK) == 0)
630 			radix *= BLIST_RADIX;
631 		if (~scan->bm_bitmap != 0) {
632 			/*
633 			 * Either there is no next leaf with any free blocks,
634 			 * or we've reached the next leaf and found that some
635 			 * of its blocks are not free.  In the first case,
636 			 * bitpos() returns zero here.
637 			 */
638 			avail = blk - start + bitpos(~scan->bm_bitmap);
639 			if (avail < count || avail == 0) {
640 				/*
641 				 * There isn't a next leaf with enough free
642 				 * blocks at its beginning to bother
643 				 * allocating.
644 				 */
645 				return (avail);
646 			}
647 			maxcount = imin(avail, maxcount);
648 			if (maxcount % BLIST_RADIX == 0) {
649 				/*
650 				 * There was no next leaf.  Back scan up to
651 				 * last leaf.
652 				 */
653 				do {
654 					radix /= BLIST_RADIX;
655 					--scan;
656 				} while (radix != 1);
657 				blk -= BLIST_RADIX;
658 			}
659 		}
660 	}
661 
662 	/*
663 	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
664 	 * BLIST_RADIX bits to represent the allocation of those last blocks.
665 	 */
666 	if (maxcount % BLIST_RADIX != 0)
667 		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
668 	else
669 		scan->bm_bitmap = 0;
670 
671 	for (;;) {
672 		/* Back up over meta-nodes, clearing bits if necessary. */
673 		blk -= BLIST_RADIX;
674 		for (radix = BLIST_RADIX;
675 		    (digit = ((blk / radix) & BLIST_MASK)) == 0;
676 		    radix *= BLIST_RADIX) {
677 			if ((scan--)->bm_bitmap == 0)
678 				scan->bm_bitmap ^= 1;
679 		}
680 		if ((scan--)->bm_bitmap == 0)
681 			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
682 			    (u_daddr_t)1 << digit;
683 
684 		if (blk == start)
685 			break;
686 		/* Clear all the bits of this leaf. */
687 		scan->bm_bitmap = 0;
688 	}
689 	return (maxcount);
690 }
691 
692 /*
693  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
694  *
695  *	This function is the core of the allocator.  Its execution time is
696  *	proportional to log(count), plus height of the tree if the allocation
697  *	crosses a leaf boundary.
698  */
699 static daddr_t
700 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
701 {
702 	u_daddr_t mask;
703 	int bighint, count1, hi, lo, num_shifts;
704 
705 	count1 = *count - 1;
706 	num_shifts = fls(count1);
707 	mask = ~scan->bm_bitmap;
708 	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
709 		/*
710 		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
711 		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
712 		 * 0, while preserving this invariant.  The updates to mask
713 		 * leave fewer bits 0, but each bit that remains 0 represents a
714 		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
715 		 * more updates to mask cannot set more bits, because mask is
716 		 * partitioned with all 1 bits following all 0 bits, the loop
717 		 * terminates immediately.
718 		 */
719 		num_shifts--;
720 		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
721 	}
722 	bighint = count1 >> num_shifts;
723 	if (~mask == 0) {
724 		/*
725 		 * Update bighint.  There is no allocation bigger than
726 		 * count1 >> num_shifts starting in this leaf.
727 		 */
728 		scan->bm_bighint = bighint;
729 		return (SWAPBLK_NONE);
730 	}
731 
732 	/* Discard any candidates that appear before blk. */
733 	if ((blk & BLIST_MASK) != 0) {
734 		if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
735 			/* Grow bighint in case all discarded bits are set. */
736 			bighint += blk & BLIST_MASK;
737 			mask |= bitrange(0, blk & BLIST_MASK);
738 			if (~mask == 0) {
739 				scan->bm_bighint = bighint;
740 				return (SWAPBLK_NONE);
741 			}
742 		}
743 		blk -= blk & BLIST_MASK;
744 	}
745 
746 	/*
747 	 * The least significant set bit in mask marks the start of the first
748 	 * available range of sufficient size.  Find its position.
749 	 */
750 	lo = bitpos(~mask);
751 
752 	/*
753 	 * Find how much space is available starting at that position.
754 	 */
755 	if ((mask & (mask + 1)) != 0) {
756 		/* Count the 1 bits starting at position lo. */
757 		hi = bitpos(mask & (mask + 1)) + count1;
758 		if (maxcount < hi - lo)
759 			hi = lo + maxcount;
760 		*count = hi - lo;
761 		mask = ~bitrange(lo, *count);
762 	} else if (maxcount <= BLIST_RADIX - lo) {
763 		/* All the blocks we can use are available here. */
764 		hi = lo + maxcount;
765 		*count = maxcount;
766 		mask = ~bitrange(lo, *count);
767 		if (hi == BLIST_RADIX)
768 			scan->bm_bighint = bighint;
769 	} else {
770 		/* Check next leaf for some of the blocks we want or need. */
771 		count1 = *count - (BLIST_RADIX - lo);
772 		maxcount -= BLIST_RADIX - lo;
773 		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
774 		if (hi < count1)
775 			/*
776 			 * The next leaf cannot supply enough blocks to reach
777 			 * the minimum required allocation.  The hint cannot be
778 			 * updated, because the same allocation request could
779 			 * be satisfied later, by this leaf, if the state of
780 			 * the next leaf changes, and without any changes to
781 			 * this leaf.
782 			 */
783 			return (SWAPBLK_NONE);
784 		*count = BLIST_RADIX - lo + hi;
785 		scan->bm_bighint = bighint;
786 	}
787 
788 	/* Clear the allocated bits from this leaf. */
789 	scan->bm_bitmap &= mask;
790 	return (blk + lo);
791 }
792 
793 /*
794  * blist_meta_alloc() -	allocate at a meta in the radix tree.
795  *
796  *	Attempt to allocate at a meta node.  If we can't, we update
797  *	bighint and return a failure.  Updating bighint optimize future
798  *	calls that hit this node.  We have to check for our collapse cases
799  *	and we have a few optimizations strewn in as well.
800  */
801 static daddr_t
802 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
803     int maxcount, u_daddr_t radix)
804 {
805 	daddr_t blk, i, r, skip;
806 	u_daddr_t mask;
807 	bool scan_from_start;
808 	int digit;
809 
810 	if (radix == 1)
811 		return (blst_leaf_alloc(scan, cursor, count, maxcount));
812 	blk = cursor & -(radix * BLIST_RADIX);
813 	scan_from_start = (cursor == blk);
814 	skip = radix_to_skip(radix);
815 	mask = scan->bm_bitmap;
816 
817 	/* Discard any candidates that appear before cursor. */
818 	digit = (cursor / radix) & BLIST_MASK;
819 	mask &= (u_daddr_t)-1 << digit;
820 	if (mask == 0)
821 		return (SWAPBLK_NONE);
822 
823 	/*
824 	 * If the first try is for a block that includes the cursor, pre-undo
825 	 * the digit * radix offset in the first call; otherwise, ignore the
826 	 * cursor entirely.
827 	 */
828 	if (((mask >> digit) & 1) == 1)
829 		cursor -= digit * radix;
830 	else
831 		cursor = blk;
832 
833 	/*
834 	 * Examine the nonempty subtree associated with each bit set in mask.
835 	 */
836 	do {
837 		digit = bitpos(mask);
838 		i = 1 + digit * skip;
839 		if (*count <= scan[i].bm_bighint) {
840 			/*
841 			 * The allocation might fit beginning in the i'th subtree.
842 			 */
843 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
844 			    count, maxcount, radix / BLIST_RADIX);
845 			if (r != SWAPBLK_NONE) {
846 				if (scan[i].bm_bitmap == 0)
847 					scan->bm_bitmap ^= bitrange(digit, 1);
848 				return (r);
849 			}
850 		}
851 		cursor = blk;
852 	} while ((mask ^= bitrange(digit, 1)) != 0);
853 
854 	/*
855 	 * We couldn't allocate count in this subtree.  If the whole tree was
856 	 * scanned, and the last tree node is allocated, update bighint.
857 	 */
858 	if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
859 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
860 		scan->bm_bighint = *count - 1;
861 
862 	return (SWAPBLK_NONE);
863 }
864 
865 /*
866  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
867  *
868  */
869 static void
870 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
871 {
872 	u_daddr_t mask;
873 
874 	/*
875 	 * free some data in this bitmap
876 	 * mask=0000111111111110000
877 	 *          \_________/\__/
878 	 *		count   n
879 	 */
880 	mask = bitrange(blk & BLIST_MASK, count);
881 	KASSERT((scan->bm_bitmap & mask) == 0,
882 	    ("freeing free block: %jx, size %d, mask %jx",
883 	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
884 	scan->bm_bitmap |= mask;
885 }
886 
887 /*
888  * BLST_META_FREE() - free allocated blocks from radix tree meta info
889  *
890  *	This support routine frees a range of blocks from the bitmap.
891  *	The range must be entirely enclosed by this radix node.  If a
892  *	meta node, we break the range down recursively to free blocks
893  *	in subnodes (which means that this code can free an arbitrary
894  *	range whereas the allocation code cannot allocate an arbitrary
895  *	range).
896  */
897 static void
898 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
899 {
900 	daddr_t blk, endBlk, i, skip;
901 	int digit, endDigit;
902 
903 	/*
904 	 * We could probably do a better job here.  We are required to make
905 	 * bighint at least as large as the biggest allocable block of data.
906 	 * If we just shoehorn it, a little extra overhead will be incurred
907 	 * on the next allocation (but only that one typically).
908 	 */
909 	scan->bm_bighint = BLIST_MAX_ALLOC;
910 
911 	if (radix == 1)
912 		return (blst_leaf_free(scan, freeBlk, count));
913 
914 	endBlk = freeBlk + count;
915 	blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
916 	/*
917 	 * blk is first block past the end of the range of this meta node,
918 	 * or 0 in case of overflow.
919 	 */
920 	if (blk != 0)
921 		endBlk = ummin(endBlk, blk);
922 	skip = radix_to_skip(radix);
923 	blk = freeBlk & -radix;
924 	digit = (blk / radix) & BLIST_MASK;
925 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
926 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
927 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
928 		blk += radix;
929 		count = ummin(blk, endBlk) - freeBlk;
930 		blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
931 		freeBlk = blk;
932 	}
933 }
934 
935 /*
936  * BLST_COPY() - copy one radix tree to another
937  *
938  *	Locates free space in the source tree and frees it in the destination
939  *	tree.  The space may not already be free in the destination.
940  */
941 static void
942 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
943     daddr_t count)
944 {
945 	daddr_t endBlk, i, skip;
946 
947 	/*
948 	 * Leaf node
949 	 */
950 
951 	if (radix == 1) {
952 		u_daddr_t v = scan->bm_bitmap;
953 
954 		if (v == (u_daddr_t)-1) {
955 			blist_free(dest, blk, count);
956 		} else if (v != 0) {
957 			int i;
958 
959 			for (i = 0; i < count; ++i) {
960 				if (v & ((u_daddr_t)1 << i))
961 					blist_free(dest, blk + i, 1);
962 			}
963 		}
964 		return;
965 	}
966 
967 	/*
968 	 * Meta node
969 	 */
970 
971 	if (scan->bm_bitmap == 0) {
972 		/*
973 		 * Source all allocated, leave dest allocated
974 		 */
975 		return;
976 	}
977 
978 	endBlk = blk + count;
979 	skip = radix_to_skip(radix);
980 	for (i = 1; blk < endBlk; i += skip) {
981 		blk += radix;
982 		count = radix;
983 		if (blk >= endBlk)
984 			count -= blk - endBlk;
985 		blst_copy(&scan[i], blk - radix,
986 		    radix / BLIST_RADIX, dest, count);
987 	}
988 }
989 
990 /*
991  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
992  *
993  *	This routine allocates all blocks in the specified range
994  *	regardless of any existing allocations in that range.  Returns
995  *	the number of blocks allocated by the call.
996  */
997 static daddr_t
998 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
999 {
1000 	daddr_t nblks;
1001 	u_daddr_t mask;
1002 
1003 	mask = bitrange(blk & BLIST_MASK, count);
1004 
1005 	/* Count the number of blocks that we are allocating. */
1006 	nblks = bitcount64(scan->bm_bitmap & mask);
1007 
1008 	scan->bm_bitmap &= ~mask;
1009 	return (nblks);
1010 }
1011 
1012 /*
1013  * BLIST_META_FILL() -	allocate specific blocks at a meta node
1014  *
1015  *	This routine allocates the specified range of blocks,
1016  *	regardless of any existing allocations in the range.  The
1017  *	range must be within the extent of this node.  Returns the
1018  *	number of blocks allocated by the call.
1019  */
1020 static daddr_t
1021 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1022 {
1023 	daddr_t blk, endBlk, i, nblks, skip;
1024 	int digit;
1025 
1026 	if (radix == 1)
1027 		return (blst_leaf_fill(scan, allocBlk, count));
1028 
1029 	endBlk = allocBlk + count;
1030 	blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
1031 	/*
1032 	 * blk is first block past the end of the range of this meta node,
1033 	 * or 0 in case of overflow.
1034 	 */
1035 	if (blk != 0)
1036 		endBlk = ummin(endBlk, blk);
1037 	skip = radix_to_skip(radix);
1038 	blk = allocBlk & -radix;
1039 	nblks = 0;
1040 	while (blk < endBlk) {
1041 		digit = (blk / radix) & BLIST_MASK;
1042 		i = 1 + digit * skip;
1043 		blk += radix;
1044 		count = ummin(blk, endBlk) - allocBlk;
1045 		nblks += blst_meta_fill(&scan[i], allocBlk, count,
1046 		    radix / BLIST_RADIX);
1047 		if (scan[i].bm_bitmap == 0)
1048 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1049 		allocBlk = blk;
1050 	}
1051 	return (nblks);
1052 }
1053 
1054 #ifdef BLIST_DEBUG
1055 
1056 static void
1057 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1058 {
1059 	daddr_t skip;
1060 	u_daddr_t mask;
1061 	int digit;
1062 
1063 	if (radix == 1) {
1064 		printf(
1065 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1066 		    tab, tab, "",
1067 		    (long long)blk, (long long)BLIST_RADIX,
1068 		    (int)(1 + (BLIST_RADIX - 1) / 4),
1069 		    (long long)scan->bm_bitmap,
1070 		    (long long)scan->bm_bighint
1071 		);
1072 		return;
1073 	}
1074 
1075 	printf(
1076 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1077 	    tab, tab, "",
1078 	    (long long)blk, (long long)radix * BLIST_RADIX,
1079 	    (long long)radix * BLIST_RADIX,
1080 	    (int)(1 + (BLIST_RADIX - 1) / 4),
1081 	    (long long)scan->bm_bitmap,
1082 	    (long long)scan->bm_bighint
1083 	);
1084 
1085 	skip = radix_to_skip(radix);
1086 	tab += 4;
1087 
1088 	mask = scan->bm_bitmap;
1089 	/* Examine the nonempty subtree associated with each bit set in mask */
1090 	do {
1091 		digit = bitpos(mask);
1092 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1093 		    radix / BLIST_RADIX, tab);
1094 	} while ((mask ^= bitrange(digit, 1)) != 0);
1095 	tab -= 4;
1096 
1097 	printf(
1098 	    "%*.*s}\n",
1099 	    tab, tab, ""
1100 	);
1101 }
1102 
1103 #endif
1104 
1105 #ifdef BLIST_DEBUG
1106 
1107 int
1108 main(int ac, char **av)
1109 {
1110 	daddr_t size = BLIST_RADIX * BLIST_RADIX;
1111 	int i;
1112 	blist_t bl;
1113 	struct sbuf *s;
1114 
1115 	for (i = 1; i < ac; ++i) {
1116 		const char *ptr = av[i];
1117 		if (*ptr != '-') {
1118 			size = strtoll(ptr, NULL, 0);
1119 			continue;
1120 		}
1121 		ptr += 2;
1122 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1123 		exit(1);
1124 	}
1125 	bl = blist_create(size, M_WAITOK);
1126 	if (bl == NULL) {
1127 		fprintf(stderr, "blist_create failed\n");
1128 		exit(1);
1129 	}
1130 	blist_free(bl, 0, size);
1131 
1132 	for (;;) {
1133 		char buf[1024];
1134 		long long da = 0;
1135 		int count = 0, maxcount = 0;
1136 
1137 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1138 		    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
1139 		fflush(stdout);
1140 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1141 			break;
1142 		switch(buf[0]) {
1143 		case 'r':
1144 			if (sscanf(buf + 1, "%d", &count) == 1) {
1145 				blist_resize(&bl, count, 1, M_WAITOK);
1146 			} else {
1147 				printf("?\n");
1148 			}
1149 		case 'p':
1150 			blist_print(bl);
1151 			break;
1152 		case 's':
1153 			s = sbuf_new_auto();
1154 			blist_stats(bl, s);
1155 			sbuf_finish(s);
1156 			printf("%s", sbuf_data(s));
1157 			sbuf_delete(s);
1158 			break;
1159 		case 'a':
1160 			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1161 				daddr_t blk = blist_alloc(bl, &count, maxcount);
1162 				printf("    R=%08llx, c=%08d\n",
1163 				    (long long)blk, count);
1164 			} else {
1165 				printf("?\n");
1166 			}
1167 			break;
1168 		case 'f':
1169 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1170 				blist_free(bl, da, count);
1171 			} else {
1172 				printf("?\n");
1173 			}
1174 			break;
1175 		case 'l':
1176 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1177 				printf("    n=%jd\n",
1178 				    (intmax_t)blist_fill(bl, da, count));
1179 			} else {
1180 				printf("?\n");
1181 			}
1182 			break;
1183 		case '?':
1184 		case 'h':
1185 			puts(
1186 			    "p          -print\n"
1187 			    "s          -stats\n"
1188 			    "a %d %d    -allocate\n"
1189 			    "f %x %d    -free\n"
1190 			    "l %x %d    -fill\n"
1191 			    "r %d       -resize\n"
1192 			    "h/?        -help\n"
1193 			    "q          -quit"
1194 			);
1195 			break;
1196 		case 'q':
1197 			break;
1198 		default:
1199 			printf("?\n");
1200 			break;
1201 		}
1202 		if (buf[0] == 'q')
1203 			break;
1204 	}
1205 	return (0);
1206 }
1207 
1208 #endif
1209