/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The bitmap field in each node directs the search for available blocks.
 *	For a leaf node, a bit is set if the corresponding block is free.  For a
 *	meta node, a bit is set if the corresponding subtree contains a free
 *	block somewhere within it.  The search at a meta node considers only
 *	children of that node that represent a range that includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 *	structure but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_META_RADIX lower-level nodes of some meta nodes may not be
 *	allocated.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
#define ummin(a,b)	((a) < (b) ? (a) : (b))

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
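
/*
 * A worked instance of the skip formula, assuming the usual constants
 * (64-bit u_daddr_t, so BLIST_BMAP_RADIX == 64, and BLIST_META_RADIX == 16):
 * a radix-1024 subtree has one meta node and 16 leaves, and indeed
 * radix_to_skip(1024) == 1024 / ((64 / 16) * 15) == 17.
 */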

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}
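
/*
 * For example, assuming a 64-bit u_daddr_t, bitrange(4, 3) evaluates to
 * (~(u_daddr_t)0 << 4) & (~(u_daddr_t)0 >> 57) == 0x70, i.e., bits 4
 * through 6 set.
 */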

/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}
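
/*
 * For example, bitpos((u_daddr_t)1 << 42) returns 42, either via ffsll()
 * or by the binary search narrowing [lo, hi) down around the set bit.
 */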

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	if (blocks == 0)
		panic("invalid block count");

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
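
/*
 * A sketch of the sizing arithmetic above, under the typical constants
 * (BLIST_BMAP_RADIX == 64, BLIST_META_RADIX == 16): for blocks == 1024,
 * the loop runs for radix 64 and 1024, giving nodes == 1 + 16 + 1 == 18,
 * which is the root, its first child meta node, and that child's 16
 * leaves.  The root's other 15 subtrees would never be visited, so no
 * memory is reserved for them, and bl_radix is left at 16384, the
 * capacity of a full tree of this height.
 */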

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	if (count > BLIST_MAX_ALLOC)
		panic("allocation too large");

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= count;
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}
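
/*
 * A minimal usage sketch (hypothetical sizes).  A newly created blist is
 * entirely allocated, so the caller frees the managed range before the
 * first allocation, just as the swap code and the test main() below do:
 *
 *	blist_t bl = blist_create(8192, M_WAITOK);
 *	daddr_t blk;
 *
 *	blist_free(bl, 0, 8192);	(initially mark all blocks free)
 *	blk = blist_alloc(bl, 16);	(16 contiguous blocks)
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, 16);
 *	blist_destroy(bl);
 */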

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency, such as freeing a free block, is
 *			found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	if (blkno < 0 || blkno + count > bl->bl_blocks)
		panic("freeing invalid range");
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	if (blkno < 0 || blkno + count > bl->bl_blocks)
		panic("filling invalid range");
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}
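
/*
 * For example, if blocks 10-19 are free and blocks 20-29 are already
 * allocated, blist_fill(bl, 10, 20) leaves all of 10-29 allocated and
 * returns 10, the number of blocks the call removed from the free set.
 */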

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};
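
/*
 * These Fibonacci numbers are the bucket boundaries of the gap-size
 * histogram below: a gap of size 's' is counted in the bucket with the
 * largest fib[i] <= s, so, e.g., a 10-block gap lands in the bucket
 * reported as "8 to 12".
 */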

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits',
 *                           as opposed to 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
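
/*
 * The closing arithmetic above maintains the invariant
 * sum(sizes) == num * avg + err with |err| < num, so avg tracks the
 * integer average without keeping a running sum that could overflow.
 * E.g., after gaps of sizes 3 and 6: num == 2, avg == 4, err == 1.
 */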

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}
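
/*
 * The leaf scan above finds gap boundaries arithmetically: in
 * diff = mask ^ (mask << 1), a bit is set at each position where a run of
 * free blocks begins or ends.  For mask == 0b0110, diff == 0b1010: bit 1
 * is the first block of the free run and bit 3 is the first block past
 * it, so update_gap_stats() is called once at each boundary.
 */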

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
 *
 *	'scan' is a leaf node, associated with a block containing 'blk'.
 *	The next leaf node could be adjacent, or several nodes away if the
 *	least common ancestor of 'scan' and its neighbor is several levels
 *	up.  Use 'blk' to determine how many meta-nodes lie between the
 *	leaves.  If the next leaf has enough initial bits set, clear them
 *	and clear the bits in the meta nodes on the path up to the least
 *	common ancestor to mark any subtrees made completely empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	blmeta_t *next;
	daddr_t skip;
	u_daddr_t radix;
	int digit;

	next = scan + 1;
	blk += BLIST_BMAP_RADIX;
	radix = BLIST_BMAP_RADIX;
	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
	    (next->bm_bitmap & 1) == 1) {
		next++;
		radix *= BLIST_META_RADIX;
	}
	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
		/*
		 * The next leaf doesn't have enough free blocks at the
		 * beginning to complete the spanning allocation.
		 */
		return (ENOMEM);
	}
	/* Clear the first 'count' bits in the next leaf to allocate. */
	next->bm_bitmap &= (u_daddr_t)-1 << count;

	/*
	 * Update bitmaps of next-ancestors, up to least common ancestor.
	 */
	skip = radix_to_skip(radix);
	while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) {
		(--next)->bm_bitmap ^= 1;
		radix /= BLIST_META_RADIX;
	}
	if (next->bm_bitmap == 0)
		scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit;
	return (0);
}

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->bm_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i + range1] are
		 * set in scan->bm_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts to
		 * 0, while preserving these invariants.  The updates to mask
		 * leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->bm_bitmap.  If more updates to mask cannot clear more
		 * bits, because mask is partitioned with all 0 bits preceding
		 * all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all
		 * the blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that
	 * one, and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a
		 * successful allocation depends on the next leaf providing
		 * some of the blocks.
		 */
		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
			/*
			 * The hint cannot be updated, because the same
			 * allocation request could be satisfied later, by this
			 * leaf, if the state of the next leaf changes, and
			 * without any changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
	}
	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}
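
/*
 * A worked instance of the doubling loop above: for count == 4,
 * count1 == 3 and num_shifts == 2.  The first iteration folds
 * mask &= mask >> 1 (range1 becomes 1), the second folds
 * mask &= mask >> 2 (range1 becomes 3), so exactly the bits i where
 * scan->bm_bitmap has 1s at [i, i + 3], a run of four, survive in mask.
 */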

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t bit, mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		bit = mask & -mask;
		digit = bitpos(bit);
		i = 1 + digit * skip;
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bit;
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bit) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated blocks from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	if (scan->bm_bitmap & mask)
		panic("freeing free block");
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}
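
/*
 * A concrete case, assuming a radix-1024 subtree over blocks 0-1023 with
 * 64-block leaves: blst_meta_free(scan, 100, 200, 1024) computes
 * endBlk == 300, sets bits 1-4 of scan->bm_bitmap via bitrange(1, 4),
 * and recurses into four leaves with the pieces 100-127, 128-191,
 * 192-255 and 256-299.
 */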

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLIST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t bit, mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		bit = mask & -mask;
		digit = bitpos(bit);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bit) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif