xref: /freebsd/sys/kern/subr_blist.c (revision d3d381b2b194b4d24853e92eecef55f262688d1a)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  Two radix
40  *	constants are involved: one for the size of the bitmaps contained in the
41  *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
42  *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
43  *	associated with a range of blocks.  The root of any subtree stores a
44  *	hint field that defines an upper bound on the size of the largest
45  *	allocation that can begin in the associated block range.  A hint is an
46  *	upper bound on a potential allocation, but not necessarily a tight upper
47  *	bound.
48  *
49  *	The radix tree also implements two collapsed states for meta nodes:
50  *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
51  *	in either of these two states, all information contained underneath
52  *	the node is considered stale.  These states are used to optimize
53  *	allocation and freeing operations.
54  *
55  * 	The hinting greatly increases code efficiency for allocations while
56  *	the general radix structure optimizes both allocations and frees.  The
57  *	radix tree should be able to operate well no matter how much
58  *	fragmentation there is and no matter how large a bitmap is used.
59  *
60  *	The blist code wires all necessary memory at creation time.  Neither
61  *	allocations nor frees require interaction with the memory subsystem.
62  *	The non-blocking features of the blist code are used in the swap code
63  *	(vm/swap_pager.c).
64  *
65  *	LAYOUT: The radix tree is laid out recursively using a
66  *	linear array.  Each meta node is immediately followed (laid out
67  *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
68  *	is a recursive structure but one that can be easily scanned through
69  *	a very simple 'skip' calculation.  In order to support large radixes,
70  *	portions of the tree may reside outside our memory allocation.  We
71  *	handle this with an early-termination optimization (when bighint is
72  *	set to -1) on the scan.  The memory allocation is only large enough
73  *	to cover the number of blocks requested at creation time even if it
74  *	must be encompassed in larger root-node radix.
75  *
76  *	NOTE: the allocator cannot currently allocate more than
77  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
78  *	large' if you try.  This is an area that could use improvement.  The
79  *	radix is large enough that this restriction does not effect the swap
80  *	system, though.  Currently only the allocation code is affected by
81  *	this algorithmic unfeature.  The freeing code can handle arbitrary
82  *	ranges.
83  *
84  *	This code can be compiled stand-alone for debugging.
85  */
86 
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD$");
89 
90 #ifdef _KERNEL
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101 
102 #else
103 
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107 
108 #include <sys/types.h>
109 #include <sys/malloc.h>
110 #include <sys/sbuf.h>
111 #include <stdio.h>
112 #include <string.h>
113 #include <stddef.h>
114 #include <stdlib.h>
115 #include <stdarg.h>
116 #include <stdbool.h>
117 
118 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
119 #define malloc(a,b,c)	calloc(a, 1)
120 #define free(a,b)	free(a)
121 static __inline int imax(int a, int b) { return (a > b ? a : b); }
122 
123 #include <sys/blist.h>
124 
125 void panic(const char *ctl, ...);
126 
127 #endif
128 
129 /*
130  * static support functions
131  */
132 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
133 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
134 		    u_daddr_t radix);
135 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
136 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
137 		    u_daddr_t radix);
138 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
139 		    blist_t dest, daddr_t count);
140 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
141 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
142 		    u_daddr_t radix);
143 #ifndef _KERNEL
144 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
145 		    int tab);
146 #endif
147 
148 #ifdef _KERNEL
149 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
150 #endif
151 
152 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
153     "radix divisibility error");
154 #define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
155 #define	BLIST_META_MASK	(BLIST_META_RADIX - 1)
156 
157 /*
158  * For a subtree that can represent the state of up to 'radix' blocks, the
159  * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
160  * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
161  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
162  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
163  * in the 'meta' functions that process subtrees.  Since integer division
164  * discards remainders, we can express this computation as
165  * skip = (m * m**h) / (m - 1)
166  * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
167  * and since m divides BLIST_BMAP_RADIX, we can simplify further to
168  * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
169  * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
170  * so that simple integer division by a constant can safely be used for the
171  * calculation.
172  */
173 static inline daddr_t
174 radix_to_skip(daddr_t radix)
175 {
176 
177 	return (radix /
178 	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
179 }
180 
181 /*
182  * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
183  * Assumes that the argument has only one bit set.
184  */
185 static inline int
186 bitpos(u_daddr_t mask)
187 {
188 	int hi, lo, mid;
189 
190 	switch (sizeof(mask)) {
191 #ifdef HAVE_INLINE_FFSLL
192 	case sizeof(long long):
193 		return (ffsll(mask) - 1);
194 #endif
195 	default:
196 		lo = 0;
197 		hi = BLIST_BMAP_RADIX;
198 		while (lo + 1 < hi) {
199 			mid = (lo + hi) >> 1;
200 			if ((mask >> mid) != 0)
201 				lo = mid;
202 			else
203 				hi = mid;
204 		}
205 		return (lo);
206 	}
207 }
208 
209 /*
210  * blist_create() - create a blist capable of handling up to the specified
211  *		    number of blocks
212  *
213  *	blocks - must be greater than 0
214  * 	flags  - malloc flags
215  *
216  *	The smallest blist consists of a single leaf node capable of
217  *	managing BLIST_BMAP_RADIX blocks.
218  */
219 blist_t
220 blist_create(daddr_t blocks, int flags)
221 {
222 	blist_t bl;
223 	daddr_t i, last_block;
224 	u_daddr_t nodes, radix, skip;
225 	int digit;
226 
227 	/*
228 	 * Calculate the radix and node count used for scanning.  Find the last
229 	 * block that is followed by a terminator.
230 	 */
231 	last_block = blocks - 1;
232 	radix = BLIST_BMAP_RADIX;
233 	while (radix < blocks) {
234 		if (((last_block / radix + 1) & BLIST_META_MASK) != 0)
235 			/*
236 			 * A terminator will be added.  Update last_block to the
237 			 * position just before that terminator.
238 			 */
239 			last_block |= radix - 1;
240 		radix *= BLIST_META_RADIX;
241 	}
242 
243 	/*
244 	 * Count the meta-nodes in the expanded tree, including the final
245 	 * terminator, from the bottom level up to the root.
246 	 */
247 	nodes = (last_block >= blocks) ? 2 : 1;
248 	last_block /= BLIST_BMAP_RADIX;
249 	while (last_block > 0) {
250 		nodes += last_block + 1;
251 		last_block /= BLIST_META_RADIX;
252 	}
253 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
254 	    M_ZERO);
255 	if (bl == NULL)
256 		return (NULL);
257 
258 	bl->bl_blocks = blocks;
259 	bl->bl_radix = radix;
260 	bl->bl_cursor = 0;
261 
262 	/*
263 	 * Initialize the empty tree by filling in root values, then initialize
264 	 * just the terminators in the rest of the tree.
265 	 */
266 	bl->bl_root[0].bm_bighint = 0;
267 	if (radix == BLIST_BMAP_RADIX)
268 		bl->bl_root[0].u.bmu_bitmap = 0;
269 	else
270 		bl->bl_root[0].u.bmu_avail = 0;
271 	last_block = blocks - 1;
272 	i = 0;
273 	while (radix > BLIST_BMAP_RADIX) {
274 		radix /= BLIST_META_RADIX;
275 		skip = radix_to_skip(radix);
276 		digit = last_block / radix;
277 		i += 1 + digit * skip;
278 		if (digit != BLIST_META_MASK) {
279 			/*
280 			 * Add a terminator.
281 			 */
282 			bl->bl_root[i + skip].bm_bighint = (daddr_t)-1;
283 			bl->bl_root[i + skip].u.bmu_bitmap = 0;
284 		}
285 		last_block %= radix;
286 	}
287 
288 #if defined(BLIST_DEBUG)
289 	printf(
290 		"BLIST representing %lld blocks (%lld MB of swap)"
291 		", requiring %lldK of ram\n",
292 		(long long)bl->bl_blocks,
293 		(long long)bl->bl_blocks * 4 / 1024,
294 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
295 	);
296 	printf("BLIST raw radix tree contains %lld records\n",
297 	    (long long)nodes);
298 #endif
299 
300 	return (bl);
301 }
302 
303 void
304 blist_destroy(blist_t bl)
305 {
306 
307 	free(bl, M_SWAP);
308 }
309 
310 /*
311  * blist_alloc() -   reserve space in the block bitmap.  Return the base
312  *		     of a contiguous region or SWAPBLK_NONE if space could
313  *		     not be allocated.
314  */
315 daddr_t
316 blist_alloc(blist_t bl, daddr_t count)
317 {
318 	daddr_t blk;
319 
320 	/*
321 	 * This loop iterates at most twice.  An allocation failure in the
322 	 * first iteration leads to a second iteration only if the cursor was
323 	 * non-zero.  When the cursor is zero, an allocation failure will
324 	 * reduce the hint, stopping further iterations.
325 	 */
326 	while (count <= bl->bl_root->bm_bighint) {
327 		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
328 		    bl->bl_radix);
329 		if (blk != SWAPBLK_NONE) {
330 			bl->bl_cursor = blk + count;
331 			if (bl->bl_cursor == bl->bl_blocks)
332 				bl->bl_cursor = 0;
333 			return (blk);
334 		} else if (bl->bl_cursor != 0)
335 			bl->bl_cursor = 0;
336 	}
337 	return (SWAPBLK_NONE);
338 }
339 
340 /*
341  * blist_avail() -	return the number of free blocks.
342  */
343 daddr_t
344 blist_avail(blist_t bl)
345 {
346 
347 	if (bl->bl_radix == BLIST_BMAP_RADIX)
348 		return (bitcount64(bl->bl_root->u.bmu_bitmap));
349 	else
350 		return (bl->bl_root->u.bmu_avail);
351 }
352 
353 /*
354  * blist_free() -	free up space in the block bitmap.  Return the base
355  *		     	of a contiguous region.  Panic if an inconsistancy is
356  *			found.
357  */
358 void
359 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
360 {
361 
362 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
363 }
364 
365 /*
366  * blist_fill() -	mark a region in the block bitmap as off-limits
367  *			to the allocator (i.e. allocate it), ignoring any
368  *			existing allocations.  Return the number of blocks
369  *			actually filled that were free before the call.
370  */
371 daddr_t
372 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
373 {
374 
375 	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
376 }
377 
378 /*
379  * blist_resize() -	resize an existing radix tree to handle the
380  *			specified number of blocks.  This will reallocate
381  *			the tree and transfer the previous bitmap to the new
382  *			one.  When extending the tree you can specify whether
383  *			the new blocks are to left allocated or freed.
384  */
385 void
386 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
387 {
388     blist_t newbl = blist_create(count, flags);
389     blist_t save = *pbl;
390 
391     *pbl = newbl;
392     if (count > save->bl_blocks)
393 	    count = save->bl_blocks;
394     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
395 
396     /*
397      * If resizing upwards, should we free the new space or not?
398      */
399     if (freenew && count < newbl->bl_blocks) {
400 	    blist_free(newbl, count, newbl->bl_blocks - count);
401     }
402     blist_destroy(save);
403 }
404 
405 #ifdef BLIST_DEBUG
406 
407 /*
408  * blist_print()    - dump radix tree
409  */
410 void
411 blist_print(blist_t bl)
412 {
413 	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
414 	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
415 	printf("}\n");
416 }
417 
418 #endif
419 
420 static const u_daddr_t fib[] = {
421 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
422 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
423 	514229, 832040, 1346269, 2178309, 3524578,
424 };
425 
426 /*
427  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
428  */
429 struct gap_stats {
430 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
431 	daddr_t	num;		/* number of gaps observed */
432 	daddr_t	max;		/* largest gap size */
433 	daddr_t	avg;		/* average gap size */
434 	daddr_t	err;		/* sum - num * avg */
435 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
436 	int	max_bucket;	/* last histo elt with nonzero val */
437 };
438 
439 /*
440  * gap_stats_counting()    - is the state 'counting 1 bits'?
441  *                           or 'skipping 0 bits'?
442  */
443 static inline bool
444 gap_stats_counting(const struct gap_stats *stats)
445 {
446 
447 	return (stats->start != SWAPBLK_NONE);
448 }
449 
450 /*
451  * init_gap_stats()    - initialize stats on gap sizes
452  */
453 static inline void
454 init_gap_stats(struct gap_stats *stats)
455 {
456 
457 	bzero(stats, sizeof(*stats));
458 	stats->start = SWAPBLK_NONE;
459 }
460 
461 /*
462  * update_gap_stats()    - update stats on gap sizes
463  */
464 static void
465 update_gap_stats(struct gap_stats *stats, daddr_t posn)
466 {
467 	daddr_t size;
468 	int hi, lo, mid;
469 
470 	if (!gap_stats_counting(stats)) {
471 		stats->start = posn;
472 		return;
473 	}
474 	size = posn - stats->start;
475 	stats->start = SWAPBLK_NONE;
476 	if (size > stats->max)
477 		stats->max = size;
478 
479 	/*
480 	 * Find the fibonacci range that contains size,
481 	 * expecting to find it in an early range.
482 	 */
483 	lo = 0;
484 	hi = 1;
485 	while (hi < nitems(fib) && fib[hi] <= size) {
486 		lo = hi;
487 		hi *= 2;
488 	}
489 	if (hi >= nitems(fib))
490 		hi = nitems(fib);
491 	while (lo + 1 != hi) {
492 		mid = (lo + hi) >> 1;
493 		if (fib[mid] <= size)
494 			lo = mid;
495 		else
496 			hi = mid;
497 	}
498 	stats->histo[lo]++;
499 	if (lo > stats->max_bucket)
500 		stats->max_bucket = lo;
501 	stats->err += size - stats->avg;
502 	stats->num++;
503 	stats->avg += stats->err / stats->num;
504 	stats->err %= stats->num;
505 }
506 
507 /*
508  * dump_gap_stats()    - print stats on gap sizes
509  */
510 static inline void
511 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
512 {
513 	int i;
514 
515 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
516 	    (intmax_t)stats->num);
517 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
518 	sbuf_printf(s, "average maximal free range size: %jd\n",
519 	    (intmax_t)stats->avg);
520 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
521 	sbuf_printf(s, "               count  |  size range\n");
522 	sbuf_printf(s, "               -----  |  ----------\n");
523 	for (i = 0; i < stats->max_bucket; i++) {
524 		if (stats->histo[i] != 0) {
525 			sbuf_printf(s, "%20jd  |  ",
526 			    (intmax_t)stats->histo[i]);
527 			if (fib[i] != fib[i + 1] - 1)
528 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
529 				    (intmax_t)fib[i + 1] - 1);
530 			else
531 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
532 		}
533 	}
534 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
535 	if (stats->histo[i] > 1)
536 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
537 		    (intmax_t)stats->max);
538 	else
539 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
540 }
541 
542 /*
543  * blist_stats()    - dump radix tree stats
544  */
545 void
546 blist_stats(blist_t bl, struct sbuf *s)
547 {
548 	struct gap_stats gstats;
549 	struct gap_stats *stats = &gstats;
550 	daddr_t i, nodes, radix;
551 	u_daddr_t bit, diff, mask;
552 
553 	init_gap_stats(stats);
554 	nodes = 0;
555 	i = bl->bl_radix;
556 	while (i < bl->bl_radix + bl->bl_blocks) {
557 		/*
558 		 * Find max size subtree starting at i.
559 		 */
560 		radix = BLIST_BMAP_RADIX;
561 		while (((i / radix) & BLIST_META_MASK) == 0)
562 			radix *= BLIST_META_RADIX;
563 
564 		/*
565 		 * Check for skippable subtrees starting at i.
566 		 */
567 		while (radix > BLIST_BMAP_RADIX) {
568 			if (bl->bl_root[nodes].u.bmu_avail == 0) {
569 				if (gap_stats_counting(stats))
570 					update_gap_stats(stats, i);
571 				break;
572 			}
573 			if (bl->bl_root[nodes].u.bmu_avail == radix) {
574 				if (!gap_stats_counting(stats))
575 					update_gap_stats(stats, i);
576 				break;
577 			}
578 
579 			/*
580 			 * Skip subtree root.
581 			 */
582 			nodes++;
583 			radix /= BLIST_META_RADIX;
584 		}
585 		if (radix == BLIST_BMAP_RADIX) {
586 			/*
587 			 * Scan leaf.
588 			 */
589 			mask = bl->bl_root[nodes].u.bmu_bitmap;
590 			diff = mask ^ (mask << 1);
591 			if (gap_stats_counting(stats))
592 				diff ^= 1;
593 			while (diff != 0) {
594 				bit = diff & -diff;
595 				update_gap_stats(stats, i + bitpos(bit));
596 				diff ^= bit;
597 			}
598 		}
599 		nodes += radix_to_skip(radix);
600 		i += radix;
601 	}
602 	update_gap_stats(stats, i);
603 	dump_gap_stats(stats, s);
604 }
605 
606 /************************************************************************
607  *			  ALLOCATION SUPPORT FUNCTIONS			*
608  ************************************************************************
609  *
610  *	These support functions do all the actual work.  They may seem
611  *	rather longish, but that's because I've commented them up.  The
612  *	actual code is straight forward.
613  *
614  */
615 
616 /*
617  * blist_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
618  *
619  *	This is the core of the allocator and is optimized for the
620  *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
621  *	time is proportional to log2(count) + bitpos time.
622  */
623 static daddr_t
624 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
625 {
626 	u_daddr_t mask;
627 	int count1, hi, lo, num_shifts, range1, range_ext;
628 
629 	range1 = 0;
630 	count1 = count - 1;
631 	num_shifts = fls(count1);
632 	mask = scan->u.bmu_bitmap;
633 	while ((-mask & ~mask) != 0 && num_shifts > 0) {
634 		/*
635 		 * If bit i is set in mask, then bits in [i, i+range1] are set
636 		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
637 		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
638 		 * while preserving these invariants.  The updates to mask leave
639 		 * fewer bits set, but each bit that remains set represents a
640 		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
641 		 * If more updates to mask cannot clear more bits, because mask
642 		 * is partitioned with all 0 bits preceding all 1 bits, the loop
643 		 * terminates immediately.
644 		 */
645 		num_shifts--;
646 		range_ext = range1 + ((count1 >> num_shifts) & 1);
647 		/*
648 		 * mask is a signed quantity for the shift because when it is
649 		 * shifted right, the sign bit should copied; when the last
650 		 * block of the leaf is free, pretend, for a while, that all the
651 		 * blocks that follow it are also free.
652 		 */
653 		mask &= (daddr_t)mask >> range_ext;
654 		range1 += range_ext;
655 	}
656 	if (mask == 0) {
657 		/*
658 		 * Update bighint.  There is no allocation bigger than range1
659 		 * starting in this leaf.
660 		 */
661 		scan->bm_bighint = range1;
662 		return (SWAPBLK_NONE);
663 	}
664 
665 	/* Discard any candidates that appear before blk. */
666 	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
667 	if (mask == 0)
668 		return (SWAPBLK_NONE);
669 
670 	/*
671 	 * The least significant set bit in mask marks the start of the first
672 	 * available range of sufficient size.  Clear all the bits but that one,
673 	 * and then find its position.
674 	 */
675 	mask &= -mask;
676 	lo = bitpos(mask);
677 
678 	hi = lo + count;
679 	if (hi > BLIST_BMAP_RADIX) {
680 		/*
681 		 * An allocation within this leaf is impossible, so a successful
682 		 * allocation depends on the next leaf providing some of the blocks.
683 		 */
684 		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
685 			/*
686 			 * The next leaf has a different meta-node parent, so it
687 			 * is not necessarily initialized.  Update bighint,
688 			 * comparing the range found at the end of mask to the
689 			 * largest earlier range that could have been made to
690 			 * vanish in the initial processing of mask.
691 			 */
692 			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
693 			return (SWAPBLK_NONE);
694 		}
695 		hi -= BLIST_BMAP_RADIX;
696 		if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
697 			/*
698 			 * The next leaf doesn't have enough free blocks at the
699 			 * beginning to complete the spanning allocation.  The
700 			 * hint cannot be updated, because the same allocation
701 			 * request could be satisfied later, by this leaf, if
702 			 * the state of the next leaf changes, and without any
703 			 * changes to this leaf.
704 			 */
705 			return (SWAPBLK_NONE);
706 		}
707 		/* Clear the first 'hi' bits in the next leaf, allocating them. */
708 		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
709 		hi = BLIST_BMAP_RADIX;
710 	}
711 
712 	/* Set the bits of mask at position 'lo' and higher. */
713 	mask = -mask;
714 	if (hi == BLIST_BMAP_RADIX) {
715 		/*
716 		 * Update bighint.  There is no allocation bigger than range1
717 		 * available in this leaf after this allocation completes.
718 		 */
719 		scan->bm_bighint = range1;
720 	} else {
721 		/* Clear the bits of mask at position 'hi' and higher. */
722 		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
723 		/* If this allocation uses all the bits, clear the hint. */
724 		if (mask == scan->u.bmu_bitmap)
725 			scan->bm_bighint = 0;
726 	}
727 	/* Clear the allocated bits from this leaf. */
728 	scan->u.bmu_bitmap &= ~mask;
729 	return ((blk & ~BLIST_BMAP_MASK) + lo);
730 }
731 
732 /*
733  * blist_meta_alloc() -	allocate at a meta in the radix tree.
734  *
735  *	Attempt to allocate at a meta node.  If we can't, we update
736  *	bighint and return a failure.  Updating bighint optimize future
737  *	calls that hit this node.  We have to check for our collapse cases
738  *	and we have a few optimizations strewn in as well.
739  */
740 static daddr_t
741 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
742 {
743 	daddr_t blk, i, next_skip, r, skip;
744 	int child;
745 	bool scan_from_start;
746 
747 	if (radix == BLIST_BMAP_RADIX)
748 		return (blst_leaf_alloc(scan, cursor, count));
749 	if (scan->u.bmu_avail < count) {
750 		/*
751 		 * The meta node's hint must be too large if the allocation
752 		 * exceeds the number of free blocks.  Reduce the hint, and
753 		 * return failure.
754 		 */
755 		scan->bm_bighint = scan->u.bmu_avail;
756 		return (SWAPBLK_NONE);
757 	}
758 	blk = cursor & -radix;
759 	skip = radix_to_skip(radix);
760 	next_skip = skip / BLIST_META_RADIX;
761 
762 	/*
763 	 * An ALL-FREE meta node requires special handling before allocating
764 	 * any of its blocks.
765 	 */
766 	if (scan->u.bmu_avail == radix) {
767 		radix /= BLIST_META_RADIX;
768 
769 		/*
770 		 * Reinitialize each of the meta node's children.  An ALL-FREE
771 		 * meta node cannot have a terminator in any subtree.
772 		 */
773 		for (i = 1; i < skip; i += next_skip) {
774 			if (next_skip == 1)
775 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
776 			else
777 				scan[i].u.bmu_avail = radix;
778 			scan[i].bm_bighint = radix;
779 		}
780 	} else {
781 		radix /= BLIST_META_RADIX;
782 	}
783 
784 	if (count > radix) {
785 		/*
786 		 * The allocation exceeds the number of blocks that are
787 		 * managed by a subtree of this meta node.
788 		 */
789 		panic("allocation too large");
790 	}
791 	scan_from_start = cursor == blk;
792 	child = (cursor - blk) / radix;
793 	blk += child * radix;
794 	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
795 		if (count <= scan[i].bm_bighint) {
796 			/*
797 			 * The allocation might fit beginning in the i'th subtree.
798 			 */
799 			r = blst_meta_alloc(&scan[i],
800 			    cursor > blk ? cursor : blk, count, radix);
801 			if (r != SWAPBLK_NONE) {
802 				scan->u.bmu_avail -= count;
803 				return (r);
804 			}
805 		} else if (scan[i].bm_bighint == (daddr_t)-1) {
806 			/*
807 			 * Terminator
808 			 */
809 			break;
810 		}
811 		blk += radix;
812 	}
813 
814 	/*
815 	 * We couldn't allocate count in this subtree, update bighint.
816 	 */
817 	if (scan_from_start && scan->bm_bighint >= count)
818 		scan->bm_bighint = count - 1;
819 
820 	return (SWAPBLK_NONE);
821 }
822 
823 /*
824  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
825  *
826  */
827 static void
828 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
829 {
830 	u_daddr_t mask;
831 	int n;
832 
833 	/*
834 	 * free some data in this bitmap
835 	 * mask=0000111111111110000
836 	 *          \_________/\__/
837 	 *		count   n
838 	 */
839 	n = blk & BLIST_BMAP_MASK;
840 	mask = ((u_daddr_t)-1 << n) &
841 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
842 	if (scan->u.bmu_bitmap & mask)
843 		panic("freeing free block");
844 	scan->u.bmu_bitmap |= mask;
845 
846 	/*
847 	 * We could probably do a better job here.  We are required to make
848 	 * bighint at least as large as the biggest contiguous block of
849 	 * data.  If we just shoehorn it, a little extra overhead will
850 	 * be incured on the next allocation (but only that one typically).
851 	 */
852 	scan->bm_bighint = BLIST_BMAP_RADIX;
853 }
854 
855 /*
856  * BLST_META_FREE() - free allocated blocks from radix tree meta info
857  *
858  *	This support routine frees a range of blocks from the bitmap.
859  *	The range must be entirely enclosed by this radix node.  If a
860  *	meta node, we break the range down recursively to free blocks
861  *	in subnodes (which means that this code can free an arbitrary
862  *	range whereas the allocation code cannot allocate an arbitrary
863  *	range).
864  */
865 static void
866 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
867 {
868 	daddr_t blk, i, next_skip, skip, v;
869 	int child;
870 
871 	if (scan->bm_bighint == (daddr_t)-1)
872 		panic("freeing invalid range");
873 	if (radix == BLIST_BMAP_RADIX)
874 		return (blst_leaf_free(scan, freeBlk, count));
875 	skip = radix_to_skip(radix);
876 	next_skip = skip / BLIST_META_RADIX;
877 
878 	if (scan->u.bmu_avail == 0) {
879 		/*
880 		 * ALL-ALLOCATED special case, with possible
881 		 * shortcut to ALL-FREE special case.
882 		 */
883 		scan->u.bmu_avail = count;
884 		scan->bm_bighint = count;
885 
886 		if (count != radix)  {
887 			for (i = 1; i < skip; i += next_skip) {
888 				if (scan[i].bm_bighint == (daddr_t)-1)
889 					break;
890 				scan[i].bm_bighint = 0;
891 				if (next_skip == 1) {
892 					scan[i].u.bmu_bitmap = 0;
893 				} else {
894 					scan[i].u.bmu_avail = 0;
895 				}
896 			}
897 			/* fall through */
898 		}
899 	} else {
900 		scan->u.bmu_avail += count;
901 		/* scan->bm_bighint = radix; */
902 	}
903 
904 	/*
905 	 * ALL-FREE special case.
906 	 */
907 
908 	if (scan->u.bmu_avail == radix)
909 		return;
910 	if (scan->u.bmu_avail > radix)
911 		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
912 		    (long long)count, (long long)scan->u.bmu_avail,
913 		    (long long)radix);
914 
915 	/*
916 	 * Break the free down into its components
917 	 */
918 
919 	blk = freeBlk & -radix;
920 	radix /= BLIST_META_RADIX;
921 
922 	child = (freeBlk - blk) / radix;
923 	blk += child * radix;
924 	i = 1 + child * next_skip;
925 	while (i < skip && blk < freeBlk + count) {
926 		v = blk + radix - freeBlk;
927 		if (v > count)
928 			v = count;
929 		blst_meta_free(&scan[i], freeBlk, v, radix);
930 		if (scan->bm_bighint < scan[i].bm_bighint)
931 			scan->bm_bighint = scan[i].bm_bighint;
932 		count -= v;
933 		freeBlk += v;
934 		blk += radix;
935 		i += next_skip;
936 	}
937 }
938 
939 /*
940  * BLIST_RADIX_COPY() - copy one radix tree to another
941  *
942  *	Locates free space in the source tree and frees it in the destination
943  *	tree.  The space may not already be free in the destination.
944  */
945 static void
946 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
947     daddr_t count)
948 {
949 	daddr_t i, next_skip, skip;
950 
951 	/*
952 	 * Leaf node
953 	 */
954 
955 	if (radix == BLIST_BMAP_RADIX) {
956 		u_daddr_t v = scan->u.bmu_bitmap;
957 
958 		if (v == (u_daddr_t)-1) {
959 			blist_free(dest, blk, count);
960 		} else if (v != 0) {
961 			int i;
962 
963 			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
964 				if (v & ((u_daddr_t)1 << i))
965 					blist_free(dest, blk + i, 1);
966 			}
967 		}
968 		return;
969 	}
970 
971 	/*
972 	 * Meta node
973 	 */
974 
975 	if (scan->u.bmu_avail == 0) {
976 		/*
977 		 * Source all allocated, leave dest allocated
978 		 */
979 		return;
980 	}
981 	if (scan->u.bmu_avail == radix) {
982 		/*
983 		 * Source all free, free entire dest
984 		 */
985 		if (count < radix)
986 			blist_free(dest, blk, count);
987 		else
988 			blist_free(dest, blk, radix);
989 		return;
990 	}
991 
992 
993 	skip = radix_to_skip(radix);
994 	next_skip = skip / BLIST_META_RADIX;
995 	radix /= BLIST_META_RADIX;
996 
997 	for (i = 1; count && i < skip; i += next_skip) {
998 		if (scan[i].bm_bighint == (daddr_t)-1)
999 			break;
1000 
1001 		if (count >= radix) {
1002 			blst_copy(&scan[i], blk, radix, dest, radix);
1003 			count -= radix;
1004 		} else {
1005 			if (count) {
1006 				blst_copy(&scan[i], blk, radix, dest, count);
1007 			}
1008 			count = 0;
1009 		}
1010 		blk += radix;
1011 	}
1012 }
1013 
1014 /*
1015  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
1016  *
1017  *	This routine allocates all blocks in the specified range
1018  *	regardless of any existing allocations in that range.  Returns
1019  *	the number of blocks allocated by the call.
1020  */
1021 static daddr_t
1022 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
1023 {
1024 	daddr_t nblks;
1025 	u_daddr_t mask;
1026 	int n;
1027 
1028 	n = blk & BLIST_BMAP_MASK;
1029 	mask = ((u_daddr_t)-1 << n) &
1030 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
1031 
1032 	/* Count the number of blocks that we are allocating. */
1033 	nblks = bitcount64(scan->u.bmu_bitmap & mask);
1034 
1035 	scan->u.bmu_bitmap &= ~mask;
1036 	return (nblks);
1037 }
1038 
1039 /*
1040  * BLIST_META_FILL() -	allocate specific blocks at a meta node
1041  *
1042  *	This routine allocates the specified range of blocks,
1043  *	regardless of any existing allocations in the range.  The
1044  *	range must be within the extent of this node.  Returns the
1045  *	number of blocks allocated by the call.
1046  */
1047 static daddr_t
1048 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1049 {
1050 	daddr_t blk, i, nblks, next_skip, skip, v;
1051 	int child;
1052 
1053 	if (scan->bm_bighint == (daddr_t)-1)
1054 		panic("filling invalid range");
1055 	if (count > radix) {
1056 		/*
1057 		 * The allocation exceeds the number of blocks that are
1058 		 * managed by this node.
1059 		 */
1060 		panic("fill too large");
1061 	}
1062 	if (radix == BLIST_BMAP_RADIX)
1063 		return (blst_leaf_fill(scan, allocBlk, count));
1064 	if (count == radix || scan->u.bmu_avail == 0)  {
1065 		/*
1066 		 * ALL-ALLOCATED special case
1067 		 */
1068 		nblks = scan->u.bmu_avail;
1069 		scan->u.bmu_avail = 0;
1070 		scan->bm_bighint = 0;
1071 		return (nblks);
1072 	}
1073 	skip = radix_to_skip(radix);
1074 	next_skip = skip / BLIST_META_RADIX;
1075 	blk = allocBlk & -radix;
1076 
1077 	/*
1078 	 * An ALL-FREE meta node requires special handling before allocating
1079 	 * any of its blocks.
1080 	 */
1081 	if (scan->u.bmu_avail == radix) {
1082 		radix /= BLIST_META_RADIX;
1083 
1084 		/*
1085 		 * Reinitialize each of the meta node's children.  An ALL-FREE
1086 		 * meta node cannot have a terminator in any subtree.
1087 		 */
1088 		for (i = 1; i < skip; i += next_skip) {
1089 			if (next_skip == 1)
1090 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
1091 			else
1092 				scan[i].u.bmu_avail = radix;
1093 			scan[i].bm_bighint = radix;
1094 		}
1095 	} else {
1096 		radix /= BLIST_META_RADIX;
1097 	}
1098 
1099 	nblks = 0;
1100 	child = (allocBlk - blk) / radix;
1101 	blk += child * radix;
1102 	i = 1 + child * next_skip;
1103 	while (i < skip && blk < allocBlk + count) {
1104 		v = blk + radix - allocBlk;
1105 		if (v > count)
1106 			v = count;
1107 		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
1108 		count -= v;
1109 		allocBlk += v;
1110 		blk += radix;
1111 		i += next_skip;
1112 	}
1113 	scan->u.bmu_avail -= nblks;
1114 	return (nblks);
1115 }
1116 
1117 #ifdef BLIST_DEBUG
1118 
1119 static void
1120 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1121 {
1122 	daddr_t i, next_skip, skip;
1123 
1124 	if (radix == BLIST_BMAP_RADIX) {
1125 		printf(
1126 		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
1127 		    tab, tab, "",
1128 		    (long long)blk, (long long)radix,
1129 		    (long long)scan->u.bmu_bitmap,
1130 		    (long long)scan->bm_bighint
1131 		);
1132 		return;
1133 	}
1134 
1135 	if (scan->u.bmu_avail == 0) {
1136 		printf(
1137 		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
1138 		    tab, tab, "",
1139 		    (long long)blk,
1140 		    (long long)radix
1141 		);
1142 		return;
1143 	}
1144 	if (scan->u.bmu_avail == radix) {
1145 		printf(
1146 		    "%*.*s(%08llx,%lld) ALL FREE\n",
1147 		    tab, tab, "",
1148 		    (long long)blk,
1149 		    (long long)radix
1150 		);
1151 		return;
1152 	}
1153 
1154 	printf(
1155 	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
1156 	    tab, tab, "",
1157 	    (long long)blk, (long long)radix,
1158 	    (long long)scan->u.bmu_avail,
1159 	    (long long)radix,
1160 	    (long long)scan->bm_bighint
1161 	);
1162 
1163 	skip = radix_to_skip(radix);
1164 	next_skip = skip / BLIST_META_RADIX;
1165 	radix /= BLIST_META_RADIX;
1166 	tab += 4;
1167 
1168 	for (i = 1; i < skip; i += next_skip) {
1169 		if (scan[i].bm_bighint == (daddr_t)-1) {
1170 			printf(
1171 			    "%*.*s(%08llx,%lld): Terminator\n",
1172 			    tab, tab, "",
1173 			    (long long)blk, (long long)radix
1174 			);
1175 			break;
1176 		}
1177 		blst_radix_print(&scan[i], blk, radix, tab);
1178 		blk += radix;
1179 	}
1180 	tab -= 4;
1181 
1182 	printf(
1183 	    "%*.*s}\n",
1184 	    tab, tab, ""
1185 	);
1186 }
1187 
1188 #endif
1189 
1190 #ifdef BLIST_DEBUG
1191 
1192 int
1193 main(int ac, char **av)
1194 {
1195 	int size = 1024;
1196 	int i;
1197 	blist_t bl;
1198 	struct sbuf *s;
1199 
1200 	for (i = 1; i < ac; ++i) {
1201 		const char *ptr = av[i];
1202 		if (*ptr != '-') {
1203 			size = strtol(ptr, NULL, 0);
1204 			continue;
1205 		}
1206 		ptr += 2;
1207 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1208 		exit(1);
1209 	}
1210 	bl = blist_create(size, M_WAITOK);
1211 	blist_free(bl, 0, size);
1212 
1213 	for (;;) {
1214 		char buf[1024];
1215 		long long da = 0;
1216 		long long count = 0;
1217 
1218 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1219 		    (long long)size, (long long)bl->bl_radix);
1220 		fflush(stdout);
1221 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1222 			break;
1223 		switch(buf[0]) {
1224 		case 'r':
1225 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1226 				blist_resize(&bl, count, 1, M_WAITOK);
1227 			} else {
1228 				printf("?\n");
1229 			}
1230 		case 'p':
1231 			blist_print(bl);
1232 			break;
1233 		case 's':
1234 			s = sbuf_new_auto();
1235 			blist_stats(bl, s);
1236 			sbuf_finish(s);
1237 			printf("%s", sbuf_data(s));
1238 			sbuf_delete(s);
1239 			break;
1240 		case 'a':
1241 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1242 				daddr_t blk = blist_alloc(bl, count);
1243 				printf("    R=%08llx\n", (long long)blk);
1244 			} else {
1245 				printf("?\n");
1246 			}
1247 			break;
1248 		case 'f':
1249 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1250 				blist_free(bl, da, count);
1251 			} else {
1252 				printf("?\n");
1253 			}
1254 			break;
1255 		case 'l':
1256 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1257 				printf("    n=%jd\n",
1258 				    (intmax_t)blist_fill(bl, da, count));
1259 			} else {
1260 				printf("?\n");
1261 			}
1262 			break;
1263 		case '?':
1264 		case 'h':
1265 			puts(
1266 			    "p          -print\n"
1267 			    "s          -stats\n"
1268 			    "a %d       -allocate\n"
1269 			    "f %x %d    -free\n"
1270 			    "l %x %d    -fill\n"
1271 			    "r %d       -resize\n"
1272 			    "h/?        -help"
1273 			);
1274 			break;
1275 		default:
1276 			printf("?\n");
1277 			break;
1278 		}
1279 	}
1280 	return(0);
1281 }
1282 
1283 void
1284 panic(const char *ctl, ...)
1285 {
1286 	va_list va;
1287 
1288 	va_start(va, ctl);
1289 	vfprintf(stderr, ctl, va);
1290 	fprintf(stderr, "\n");
1291 	va_end(va);
1292 	exit(1);
1293 }
1294 
1295 #endif
1296