xref: /freebsd/sys/kern/subr_blist.c (revision c81495a621c461b3d3395a7c5b0e73458201c443)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  A radix
40  *	constant defines the size of the bitmaps contained in a leaf node
41  *	and the number of descendents of each of the meta (interior) nodes.
42  *	Each subtree is associated with a range of blocks.  The root of any
43  *	subtree stores a hint field that defines an upper bound on the size
44  *	of the largest allocation that can begin in the associated block
45  *	range.  A hint is an upper bound on a potential allocation, but not
46  *	necessarily a tight upper bound.
47  *
48  *	The bitmap field in each node directs the search for available blocks.
49  *	For a leaf node, a bit is set if the corresponding block is free.  For a
50  *	meta node, a bit is set if the corresponding subtree contains a free
51  *	block somewhere within it.  The search at a meta node considers only
52  *	children of that node that represent a range that includes a free block.
53  *
54  * 	The hinting greatly increases code efficiency for allocations while
55  *	the general radix structure optimizes both allocations and frees.  The
56  *	radix tree should be able to operate well no matter how much
57  *	fragmentation there is and no matter how large a bitmap is used.
58  *
59  *	The blist code wires all necessary memory at creation time.  Neither
60  *	allocations nor frees require interaction with the memory subsystem.
61  *	The non-blocking nature of allocations and frees is required by swap
62  *	code (vm/swap_pager.c).
63  *
64  *	LAYOUT: The radix tree is laid out recursively using a linear array.
65  *	Each meta node is immediately followed (laid out sequentially in
66  *	memory) by BLIST_RADIX lower-level nodes.  This is a recursive
67  *	structure but one that can be easily scanned through a very simple
68  *	'skip' calculation.  The memory allocation is only large enough to
69  *	cover the number of blocks requested at creation time.  Nodes that
70  *	represent blocks beyond that limit, nodes that would never be read
71  *	or written, are not allocated, so that the last of the
72  *	BLIST_RADIX lower-level nodes of a some nodes may not be allocated.
73  *
74  *	NOTE: the allocator cannot currently allocate more than
75  *	BLIST_RADIX blocks per call.  It will panic with 'allocation too
76  *	large' if you try.  This is an area that could use improvement.  The
77  *	radix is large enough that this restriction does not effect the swap
78  *	system, though.  Currently only the allocation code is affected by
79  *	this algorithmic unfeature.  The freeing code can handle arbitrary
80  *	ranges.
81  *
82  *	This code can be compiled stand-alone for debugging.
83  */
84 
85 #include <sys/cdefs.h>
86 __FBSDID("$FreeBSD$");
87 
88 #ifdef _KERNEL
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/lock.h>
93 #include <sys/kernel.h>
94 #include <sys/blist.h>
95 #include <sys/malloc.h>
96 #include <sys/sbuf.h>
97 #include <sys/proc.h>
98 #include <sys/mutex.h>
99 
100 #else
101 
102 #ifndef BLIST_NO_DEBUG
103 #define BLIST_DEBUG
104 #endif
105 
106 #include <sys/errno.h>
107 #include <sys/types.h>
108 #include <sys/malloc.h>
109 #include <sys/sbuf.h>
110 #include <assert.h>
111 #include <stdio.h>
112 #include <string.h>
113 #include <stddef.h>
114 #include <stdlib.h>
115 #include <stdarg.h>
116 #include <stdbool.h>
117 
118 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
119 #define malloc(a,b,c)	calloc(a, 1)
120 #define free(a,b)	free(a)
121 #define ummin(a,b)	((a) < (b) ? (a) : (b))
122 #define imin(a,b)	((a) < (b) ? (a) : (b))
123 #define KASSERT(a,b)	assert(a)
124 
125 #include <sys/blist.h>
126 
127 #endif
128 
129 /*
130  * static support functions
131  */
132 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
133     int *count, int maxcount);
134 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
135     int maxcount, u_daddr_t radix);
136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
138 		    u_daddr_t radix);
139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
140 		    blist_t dest, daddr_t count);
141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
143 		    u_daddr_t radix);
144 #ifndef _KERNEL
145 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
146 		    int tab);
147 #endif
148 
149 #ifdef _KERNEL
150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
151 #endif
152 
153 #define	BLIST_MASK	(BLIST_RADIX - 1)
154 
155 /*
156  * For a subtree that can represent the state of up to 'radix' blocks, the
157  * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
158  * is short for BLIST_RADIX, then for a tree of height h with L=m**h
159  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
160  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
161  * in the 'meta' functions that process subtrees.  Since integer division
162  * discards remainders, we can express this computation as
163  * skip = (m * m**h) / (m - 1)
164  * skip = (m * (radix / m)) / (m - 1)
165  * skip = radix / (m - 1)
166  * so that simple integer division by a constant can safely be used for the
167  * calculation.
168  */
169 static inline daddr_t
170 radix_to_skip(daddr_t radix)
171 {
172 
173 	return (radix / BLIST_MASK);
174 }
175 
176 /*
177  * Provide a mask with count bits set, starting as position n.
178  */
179 static inline u_daddr_t
180 bitrange(int n, int count)
181 {
182 
183 	return (((u_daddr_t)-1 << n) &
184 	    ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
185 }
186 
187 static inline int
188 bitpos(u_daddr_t mask)
189 {
190 
191 	_Static_assert(sizeof(long long) >= sizeof(mask),
192 	    "mask too big for ffsll()");
193 	return (ffsll(mask) - 1);
194 }
195 
196 /*
197  * blist_create() - create a blist capable of handling up to the specified
198  *		    number of blocks
199  *
200  *	blocks - must be greater than 0
201  * 	flags  - malloc flags
202  *
203  *	The smallest blist consists of a single leaf node capable of
204  *	managing BLIST_RADIX blocks.
205  */
206 blist_t
207 blist_create(daddr_t blocks, int flags)
208 {
209 	blist_t bl;
210 	u_daddr_t nodes, radix;
211 
212 	KASSERT(blocks > 0, ("invalid block count"));
213 
214 	/*
215 	 * Calculate the radix and node count used for scanning.
216 	 */
217 	nodes = 1;
218 	for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0;
219 	    radix *= BLIST_RADIX)
220 		nodes += 1 + (blocks - 1) / BLIST_RADIX / radix;
221 
222 	/*
223 	 * Include a sentinel node to ensure that cross-leaf scans stay within
224 	 * the bounds of the allocation.
225 	 */
226 	if (blocks % BLIST_RADIX == 0)
227 		nodes++;
228 
229 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
230 	    M_ZERO);
231 	if (bl == NULL)
232 		return (NULL);
233 
234 	bl->bl_blocks = blocks;
235 	bl->bl_radix = radix;
236 
237 #if defined(BLIST_DEBUG)
238 	printf(
239 		"BLIST representing %lld blocks (%lld MB of swap)"
240 		", requiring %lldK of ram\n",
241 		(long long)bl->bl_blocks,
242 		(long long)bl->bl_blocks * 4 / 1024,
243 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
244 	);
245 	printf("BLIST raw radix tree contains %lld records\n",
246 	    (long long)nodes);
247 #endif
248 
249 	return (bl);
250 }
251 
252 void
253 blist_destroy(blist_t bl)
254 {
255 
256 	free(bl, M_SWAP);
257 }
258 
259 /*
260  * blist_alloc() -   reserve space in the block bitmap.  Return the base
261  *		     of a contiguous region or SWAPBLK_NONE if space could
262  *		     not be allocated.
263  */
264 daddr_t
265 blist_alloc(blist_t bl, int *count, int maxcount)
266 {
267 	daddr_t blk, cursor;
268 
269 	KASSERT(*count <= maxcount,
270 	    ("invalid parameters %d > %d", *count, maxcount));
271 	KASSERT(*count <= BLIST_MAX_ALLOC,
272 	    ("minimum allocation too large: %d", *count));
273 
274 	/*
275 	 * This loop iterates at most twice.  An allocation failure in the
276 	 * first iteration leads to a second iteration only if the cursor was
277 	 * non-zero.  When the cursor is zero, an allocation failure will
278 	 * stop further iterations.
279 	 */
280 	for (cursor = bl->bl_cursor;; cursor = 0) {
281 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
282 		    bl->bl_radix);
283 		if (blk != SWAPBLK_NONE) {
284 			bl->bl_avail -= *count;
285 			bl->bl_cursor = blk + *count;
286 			if (bl->bl_cursor == bl->bl_blocks)
287 				bl->bl_cursor = 0;
288 			return (blk);
289 		}
290 		if (cursor == 0)
291 			return (SWAPBLK_NONE);
292 	}
293 }
294 
295 /*
296  * blist_avail() -	return the number of free blocks.
297  */
298 daddr_t
299 blist_avail(blist_t bl)
300 {
301 
302 	return (bl->bl_avail);
303 }
304 
305 /*
306  * blist_free() -	free up space in the block bitmap.  Return the base
307  *		     	of a contiguous region.
308  */
309 void
310 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
311 {
312 
313 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
314 	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
315 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
316 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
317 	bl->bl_avail += count;
318 }
319 
320 /*
321  * blist_fill() -	mark a region in the block bitmap as off-limits
322  *			to the allocator (i.e. allocate it), ignoring any
323  *			existing allocations.  Return the number of blocks
324  *			actually filled that were free before the call.
325  */
326 daddr_t
327 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
328 {
329 	daddr_t filled;
330 
331 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
332 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
333 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
334 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
335 	bl->bl_avail -= filled;
336 	return (filled);
337 }
338 
339 /*
340  * blist_resize() -	resize an existing radix tree to handle the
341  *			specified number of blocks.  This will reallocate
342  *			the tree and transfer the previous bitmap to the new
343  *			one.  When extending the tree you can specify whether
344  *			the new blocks are to left allocated or freed.
345  */
346 void
347 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
348 {
349     blist_t newbl = blist_create(count, flags);
350     blist_t save = *pbl;
351 
352     *pbl = newbl;
353     if (count > save->bl_blocks)
354 	    count = save->bl_blocks;
355     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
356 
357     /*
358      * If resizing upwards, should we free the new space or not?
359      */
360     if (freenew && count < newbl->bl_blocks) {
361 	    blist_free(newbl, count, newbl->bl_blocks - count);
362     }
363     blist_destroy(save);
364 }
365 
366 #ifdef BLIST_DEBUG
367 
368 /*
369  * blist_print()    - dump radix tree
370  */
371 void
372 blist_print(blist_t bl)
373 {
374 	printf("BLIST avail = %jd, cursor = %08jx {\n",
375 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
376 
377 	if (bl->bl_root->bm_bitmap != 0)
378 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
379 	printf("}\n");
380 }
381 
382 #endif
383 
384 static const u_daddr_t fib[] = {
385 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
386 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
387 	514229, 832040, 1346269, 2178309, 3524578,
388 };
389 
390 /*
391  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
392  */
393 struct gap_stats {
394 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
395 	daddr_t	num;		/* number of gaps observed */
396 	daddr_t	max;		/* largest gap size */
397 	daddr_t	avg;		/* average gap size */
398 	daddr_t	err;		/* sum - num * avg */
399 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
400 	int	max_bucket;	/* last histo elt with nonzero val */
401 };
402 
403 /*
404  * gap_stats_counting()    - is the state 'counting 1 bits'?
405  *                           or 'skipping 0 bits'?
406  */
407 static inline bool
408 gap_stats_counting(const struct gap_stats *stats)
409 {
410 
411 	return (stats->start != SWAPBLK_NONE);
412 }
413 
414 /*
415  * init_gap_stats()    - initialize stats on gap sizes
416  */
417 static inline void
418 init_gap_stats(struct gap_stats *stats)
419 {
420 
421 	bzero(stats, sizeof(*stats));
422 	stats->start = SWAPBLK_NONE;
423 }
424 
425 /*
426  * update_gap_stats()    - update stats on gap sizes
427  */
428 static void
429 update_gap_stats(struct gap_stats *stats, daddr_t posn)
430 {
431 	daddr_t size;
432 	int hi, lo, mid;
433 
434 	if (!gap_stats_counting(stats)) {
435 		stats->start = posn;
436 		return;
437 	}
438 	size = posn - stats->start;
439 	stats->start = SWAPBLK_NONE;
440 	if (size > stats->max)
441 		stats->max = size;
442 
443 	/*
444 	 * Find the fibonacci range that contains size,
445 	 * expecting to find it in an early range.
446 	 */
447 	lo = 0;
448 	hi = 1;
449 	while (hi < nitems(fib) && fib[hi] <= size) {
450 		lo = hi;
451 		hi *= 2;
452 	}
453 	if (hi >= nitems(fib))
454 		hi = nitems(fib);
455 	while (lo + 1 != hi) {
456 		mid = (lo + hi) >> 1;
457 		if (fib[mid] <= size)
458 			lo = mid;
459 		else
460 			hi = mid;
461 	}
462 	stats->histo[lo]++;
463 	if (lo > stats->max_bucket)
464 		stats->max_bucket = lo;
465 	stats->err += size - stats->avg;
466 	stats->num++;
467 	stats->avg += stats->err / stats->num;
468 	stats->err %= stats->num;
469 }
470 
471 /*
472  * dump_gap_stats()    - print stats on gap sizes
473  */
474 static inline void
475 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
476 {
477 	int i;
478 
479 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
480 	    (intmax_t)stats->num);
481 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
482 	sbuf_printf(s, "average maximal free range size: %jd\n",
483 	    (intmax_t)stats->avg);
484 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
485 	sbuf_printf(s, "               count  |  size range\n");
486 	sbuf_printf(s, "               -----  |  ----------\n");
487 	for (i = 0; i < stats->max_bucket; i++) {
488 		if (stats->histo[i] != 0) {
489 			sbuf_printf(s, "%20jd  |  ",
490 			    (intmax_t)stats->histo[i]);
491 			if (fib[i] != fib[i + 1] - 1)
492 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
493 				    (intmax_t)fib[i + 1] - 1);
494 			else
495 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
496 		}
497 	}
498 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
499 	if (stats->histo[i] > 1)
500 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
501 		    (intmax_t)stats->max);
502 	else
503 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
504 }
505 
506 /*
507  * blist_stats()    - dump radix tree stats
508  */
509 void
510 blist_stats(blist_t bl, struct sbuf *s)
511 {
512 	struct gap_stats gstats;
513 	struct gap_stats *stats = &gstats;
514 	daddr_t i, nodes, radix;
515 	u_daddr_t diff, mask;
516 	int digit;
517 
518 	init_gap_stats(stats);
519 	nodes = 0;
520 	radix = bl->bl_radix;
521 	for (i = 0; i < bl->bl_blocks; ) {
522 		/*
523 		 * Check for skippable subtrees starting at i.
524 		 */
525 		while (radix != 1) {
526 			if (bl->bl_root[nodes].bm_bitmap == 0) {
527 				if (gap_stats_counting(stats))
528 					update_gap_stats(stats, i);
529 				break;
530 			}
531 
532 			/*
533 			 * Skip subtree root.
534 			 */
535 			nodes++;
536 			radix /= BLIST_RADIX;
537 		}
538 		if (radix == 1) {
539 			/*
540 			 * Scan leaf.
541 			 */
542 			mask = bl->bl_root[nodes].bm_bitmap;
543 			diff = mask ^ (mask << 1);
544 			if (gap_stats_counting(stats))
545 				diff ^= 1;
546 			while (diff != 0) {
547 				digit = bitpos(diff);
548 				update_gap_stats(stats, i + digit);
549 				diff ^= bitrange(digit, 1);
550 			}
551 		}
552 		nodes += radix_to_skip(radix * BLIST_RADIX);
553 		i += radix * BLIST_RADIX;
554 
555 		/*
556 		 * Find max size subtree starting at i.
557 		 */
558 		for (radix = 1;
559 		    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
560 		    radix *= BLIST_RADIX)
561 			;
562 	}
563 	update_gap_stats(stats, i);
564 	dump_gap_stats(stats, s);
565 }
566 
567 /************************************************************************
568  *			  ALLOCATION SUPPORT FUNCTIONS			*
569  ************************************************************************
570  *
571  *	These support functions do all the actual work.  They may seem
572  *	rather longish, but that's because I've commented them up.  The
573  *	actual code is straight forward.
574  *
575  */
576 
577 /*
578  * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
579  *
580  *	'scan' is a leaf node, and its first block is at address 'start'.  The
581  *	next leaf node could be adjacent, or several nodes away if the least
582  *	common ancestor of 'scan' and its neighbor is several levels up.  Use
583  *	addresses to determine how many meta-nodes lie between the leaves.  If
584  *	sequence of leaves starting with the next one has enough initial bits
585  *	set, clear them and clear the bits in the meta nodes on the path up to
586  *	the least common ancestor to mark any subtrees made completely empty.
587  */
588 static int
589 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
590 {
591 	u_daddr_t radix;
592 	daddr_t blk;
593 	int avail, digit;
594 
595 	start += BLIST_RADIX;
596 	for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
597 		/* Skip meta-nodes, as long as they promise more free blocks. */
598 		radix = BLIST_RADIX;
599 		while (((++scan)->bm_bitmap & 1) == 1 &&
600 		    ((blk / radix) & BLIST_MASK) == 0)
601 			radix *= BLIST_RADIX;
602 		if (~scan->bm_bitmap != 0) {
603 			/*
604 			 * Either there is no next leaf with any free blocks,
605 			 * or we've reached the next leaf and found that some
606 			 * of its blocks are not free.  In the first case,
607 			 * bitpos() returns zero here.
608 			 */
609 			avail = blk - start + bitpos(~scan->bm_bitmap);
610 			if (avail < count || avail == 0) {
611 				/*
612 				 * There isn't a next leaf with enough free
613 				 * blocks at its beginning to bother
614 				 * allocating.
615 				 */
616 				return (avail);
617 			}
618 			maxcount = imin(avail, maxcount);
619 			if (maxcount % BLIST_RADIX == 0) {
620 				/*
621 				 * There was no next leaf.  Back scan up to
622 				 * last leaf.
623 				 */
624 				do {
625 					radix /= BLIST_RADIX;
626 					--scan;
627 				} while (radix != 1);
628 				blk -= BLIST_RADIX;
629 			}
630 		}
631 	}
632 
633 	/*
634 	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
635 	 * BLIST_RADIX bits to represent the allocation of those last blocks.
636 	 */
637 	if (maxcount % BLIST_RADIX != 0)
638 		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
639 	else
640 		scan->bm_bitmap = 0;
641 
642 	for (;;) {
643 		/* Back up over meta-nodes, clearing bits if necessary. */
644 		blk -= BLIST_RADIX;
645 		for (radix = BLIST_RADIX;
646 		    (digit = ((blk / radix) & BLIST_MASK)) == 0;
647 		    radix *= BLIST_RADIX) {
648 			if ((scan--)->bm_bitmap == 0)
649 				scan->bm_bitmap ^= 1;
650 		}
651 		if ((scan--)->bm_bitmap == 0)
652 			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
653 			    (u_daddr_t)1 << digit;
654 
655 		if (blk == start)
656 			break;
657 		/* Clear all the bits of this leaf. */
658 		scan->bm_bitmap = 0;
659 	}
660 	return (maxcount);
661 }
662 
663 /*
664  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
665  *
666  *	This function is the core of the allocator.  Its execution time is
667  *	proportional to log(count), plus height of the tree if the allocation
668  *	crosses a leaf boundary.
669  */
670 static daddr_t
671 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
672 {
673 	u_daddr_t mask;
674 	int bighint, count1, hi, lo, num_shifts;
675 
676 	count1 = *count - 1;
677 	num_shifts = fls(count1);
678 	mask = ~scan->bm_bitmap;
679 	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
680 		/*
681 		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
682 		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
683 		 * 0, while preserving this invariant.  The updates to mask
684 		 * leave fewer bits 0, but each bit that remains 0 represents a
685 		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
686 		 * more updates to mask cannot set more bits, because mask is
687 		 * partitioned with all 1 bits following all 0 bits, the loop
688 		 * terminates immediately.
689 		 */
690 		num_shifts--;
691 		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
692 	}
693 	bighint = count1 >> num_shifts;
694 	if (~mask == 0) {
695 		/*
696 		 * Update bighint.  There is no allocation bigger than
697 		 * count1 >> num_shifts starting in this leaf.
698 		 */
699 		scan->bm_bighint = bighint;
700 		return (SWAPBLK_NONE);
701 	}
702 
703 	/* Discard any candidates that appear before blk. */
704 	if ((blk & BLIST_MASK) != 0) {
705 		if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
706 			/* Grow bighint in case all discarded bits are set. */
707 			bighint += blk & BLIST_MASK;
708 			mask |= bitrange(0, blk & BLIST_MASK);
709 			if (~mask == 0) {
710 				scan->bm_bighint = bighint;
711 				return (SWAPBLK_NONE);
712 			}
713 		}
714 		blk -= blk & BLIST_MASK;
715 	}
716 
717 	/*
718 	 * The least significant set bit in mask marks the start of the first
719 	 * available range of sufficient size.  Find its position.
720 	 */
721 	lo = bitpos(~mask);
722 
723 	/*
724 	 * Find how much space is available starting at that position.
725 	 */
726 	if ((mask & (mask + 1)) != 0) {
727 		/* Count the 1 bits starting at position lo. */
728 		hi = bitpos(mask & (mask + 1)) + count1;
729 		if (maxcount < hi - lo)
730 			hi = lo + maxcount;
731 		*count = hi - lo;
732 		mask = ~bitrange(lo, *count);
733 	} else if (maxcount <= BLIST_RADIX - lo) {
734 		/* All the blocks we can use are available here. */
735 		hi = lo + maxcount;
736 		*count = maxcount;
737 		mask = ~bitrange(lo, *count);
738 		if (hi == BLIST_RADIX)
739 			scan->bm_bighint = bighint;
740 	} else {
741 		/* Check next leaf for some of the blocks we want or need. */
742 		count1 = *count - (BLIST_RADIX - lo);
743 		maxcount -= BLIST_RADIX - lo;
744 		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
745 		if (hi < count1)
746 			/*
747 			 * The next leaf cannot supply enough blocks to reach
748 			 * the minimum required allocation.  The hint cannot be
749 			 * updated, because the same allocation request could
750 			 * be satisfied later, by this leaf, if the state of
751 			 * the next leaf changes, and without any changes to
752 			 * this leaf.
753 			 */
754 			return (SWAPBLK_NONE);
755 		*count = BLIST_RADIX - lo + hi;
756 		scan->bm_bighint = bighint;
757 	}
758 
759 	/* Clear the allocated bits from this leaf. */
760 	scan->bm_bitmap &= mask;
761 	return (blk + lo);
762 }
763 
764 /*
765  * blist_meta_alloc() -	allocate at a meta in the radix tree.
766  *
767  *	Attempt to allocate at a meta node.  If we can't, we update
768  *	bighint and return a failure.  Updating bighint optimize future
769  *	calls that hit this node.  We have to check for our collapse cases
770  *	and we have a few optimizations strewn in as well.
771  */
772 static daddr_t
773 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
774     int maxcount, u_daddr_t radix)
775 {
776 	daddr_t blk, i, r, skip;
777 	u_daddr_t mask;
778 	bool scan_from_start;
779 	int digit;
780 
781 	if (radix == 1)
782 		return (blst_leaf_alloc(scan, cursor, count, maxcount));
783 	blk = cursor & -(radix * BLIST_RADIX);
784 	scan_from_start = (cursor == blk);
785 	skip = radix_to_skip(radix);
786 	mask = scan->bm_bitmap;
787 
788 	/* Discard any candidates that appear before cursor. */
789 	digit = (cursor / radix) & BLIST_MASK;
790 	mask &= (u_daddr_t)-1 << digit;
791 	if (mask == 0)
792 		return (SWAPBLK_NONE);
793 
794 	/*
795 	 * If the first try is for a block that includes the cursor, pre-undo
796 	 * the digit * radix offset in the first call; otherwise, ignore the
797 	 * cursor entirely.
798 	 */
799 	if (((mask >> digit) & 1) == 1)
800 		cursor -= digit * radix;
801 	else
802 		cursor = blk;
803 
804 	/*
805 	 * Examine the nonempty subtree associated with each bit set in mask.
806 	 */
807 	do {
808 		digit = bitpos(mask);
809 		i = 1 + digit * skip;
810 		if (*count <= scan[i].bm_bighint) {
811 			/*
812 			 * The allocation might fit beginning in the i'th subtree.
813 			 */
814 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
815 			    count, maxcount, radix / BLIST_RADIX);
816 			if (r != SWAPBLK_NONE) {
817 				if (scan[i].bm_bitmap == 0)
818 					scan->bm_bitmap ^= bitrange(digit, 1);
819 				return (r);
820 			}
821 		}
822 		cursor = blk;
823 	} while ((mask ^= bitrange(digit, 1)) != 0);
824 
825 	/*
826 	 * We couldn't allocate count in this subtree.  If the whole tree was
827 	 * scanned, and the last tree node is allocated, update bighint.
828 	 */
829 	if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
830 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
831 		scan->bm_bighint = *count - 1;
832 
833 	return (SWAPBLK_NONE);
834 }
835 
836 /*
837  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
838  *
839  */
840 static void
841 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
842 {
843 	u_daddr_t mask;
844 
845 	/*
846 	 * free some data in this bitmap
847 	 * mask=0000111111111110000
848 	 *          \_________/\__/
849 	 *		count   n
850 	 */
851 	mask = bitrange(blk & BLIST_MASK, count);
852 	KASSERT((scan->bm_bitmap & mask) == 0,
853 	    ("freeing free block: %jx, size %d, mask %jx",
854 	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
855 	scan->bm_bitmap |= mask;
856 }
857 
858 /*
859  * BLST_META_FREE() - free allocated blocks from radix tree meta info
860  *
861  *	This support routine frees a range of blocks from the bitmap.
862  *	The range must be entirely enclosed by this radix node.  If a
863  *	meta node, we break the range down recursively to free blocks
864  *	in subnodes (which means that this code can free an arbitrary
865  *	range whereas the allocation code cannot allocate an arbitrary
866  *	range).
867  */
868 static void
869 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
870 {
871 	daddr_t blk, endBlk, i, skip;
872 	int digit, endDigit;
873 
874 	/*
875 	 * We could probably do a better job here.  We are required to make
876 	 * bighint at least as large as the biggest allocable block of data.
877 	 * If we just shoehorn it, a little extra overhead will be incurred
878 	 * on the next allocation (but only that one typically).
879 	 */
880 	scan->bm_bighint = BLIST_MAX_ALLOC;
881 
882 	if (radix == 1)
883 		return (blst_leaf_free(scan, freeBlk, count));
884 
885 	endBlk = freeBlk + count;
886 	blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
887 	/*
888 	 * blk is first block past the end of the range of this meta node,
889 	 * or 0 in case of overflow.
890 	 */
891 	if (blk != 0)
892 		endBlk = ummin(endBlk, blk);
893 	skip = radix_to_skip(radix);
894 	blk = freeBlk & -radix;
895 	digit = (blk / radix) & BLIST_MASK;
896 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
897 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
898 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
899 		blk += radix;
900 		count = ummin(blk, endBlk) - freeBlk;
901 		blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
902 		freeBlk = blk;
903 	}
904 }
905 
906 /*
907  * BLST_COPY() - copy one radix tree to another
908  *
909  *	Locates free space in the source tree and frees it in the destination
910  *	tree.  The space may not already be free in the destination.
911  */
912 static void
913 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
914     daddr_t count)
915 {
916 	daddr_t endBlk, i, skip;
917 
918 	/*
919 	 * Leaf node
920 	 */
921 
922 	if (radix == 1) {
923 		u_daddr_t v = scan->bm_bitmap;
924 
925 		if (v == (u_daddr_t)-1) {
926 			blist_free(dest, blk, count);
927 		} else if (v != 0) {
928 			int i;
929 
930 			for (i = 0; i < count; ++i) {
931 				if (v & ((u_daddr_t)1 << i))
932 					blist_free(dest, blk + i, 1);
933 			}
934 		}
935 		return;
936 	}
937 
938 	/*
939 	 * Meta node
940 	 */
941 
942 	if (scan->bm_bitmap == 0) {
943 		/*
944 		 * Source all allocated, leave dest allocated
945 		 */
946 		return;
947 	}
948 
949 	endBlk = blk + count;
950 	skip = radix_to_skip(radix);
951 	for (i = 1; blk < endBlk; i += skip) {
952 		blk += radix;
953 		count = radix;
954 		if (blk >= endBlk)
955 			count -= blk - endBlk;
956 		blst_copy(&scan[i], blk - radix,
957 		    radix / BLIST_RADIX, dest, count);
958 	}
959 }
960 
961 /*
962  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
963  *
964  *	This routine allocates all blocks in the specified range
965  *	regardless of any existing allocations in that range.  Returns
966  *	the number of blocks allocated by the call.
967  */
968 static daddr_t
969 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
970 {
971 	daddr_t nblks;
972 	u_daddr_t mask;
973 
974 	mask = bitrange(blk & BLIST_MASK, count);
975 
976 	/* Count the number of blocks that we are allocating. */
977 	nblks = bitcount64(scan->bm_bitmap & mask);
978 
979 	scan->bm_bitmap &= ~mask;
980 	return (nblks);
981 }
982 
983 /*
984  * BLIST_META_FILL() -	allocate specific blocks at a meta node
985  *
986  *	This routine allocates the specified range of blocks,
987  *	regardless of any existing allocations in the range.  The
988  *	range must be within the extent of this node.  Returns the
989  *	number of blocks allocated by the call.
990  */
991 static daddr_t
992 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
993 {
994 	daddr_t blk, endBlk, i, nblks, skip;
995 	int digit;
996 
997 	if (radix == 1)
998 		return (blst_leaf_fill(scan, allocBlk, count));
999 
1000 	endBlk = allocBlk + count;
1001 	blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
1002 	/*
1003 	 * blk is first block past the end of the range of this meta node,
1004 	 * or 0 in case of overflow.
1005 	 */
1006 	if (blk != 0)
1007 		endBlk = ummin(endBlk, blk);
1008 	skip = radix_to_skip(radix);
1009 	blk = allocBlk & -radix;
1010 	nblks = 0;
1011 	while (blk < endBlk) {
1012 		digit = (blk / radix) & BLIST_MASK;
1013 		i = 1 + digit * skip;
1014 		blk += radix;
1015 		count = ummin(blk, endBlk) - allocBlk;
1016 		nblks += blst_meta_fill(&scan[i], allocBlk, count,
1017 		    radix / BLIST_RADIX);
1018 		if (scan[i].bm_bitmap == 0)
1019 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1020 		allocBlk = blk;
1021 	}
1022 	return (nblks);
1023 }
1024 
1025 #ifdef BLIST_DEBUG
1026 
1027 static void
1028 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1029 {
1030 	daddr_t skip;
1031 	u_daddr_t mask;
1032 	int digit;
1033 
1034 	if (radix == 1) {
1035 		printf(
1036 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1037 		    tab, tab, "",
1038 		    (long long)blk, (long long)BLIST_RADIX,
1039 		    (int)(1 + (BLIST_RADIX - 1) / 4),
1040 		    (long long)scan->bm_bitmap,
1041 		    (long long)scan->bm_bighint
1042 		);
1043 		return;
1044 	}
1045 
1046 	printf(
1047 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1048 	    tab, tab, "",
1049 	    (long long)blk, (long long)radix * BLIST_RADIX,
1050 	    (long long)radix * BLIST_RADIX,
1051 	    (int)(1 + (BLIST_RADIX - 1) / 4),
1052 	    (long long)scan->bm_bitmap,
1053 	    (long long)scan->bm_bighint
1054 	);
1055 
1056 	skip = radix_to_skip(radix);
1057 	tab += 4;
1058 
1059 	mask = scan->bm_bitmap;
1060 	/* Examine the nonempty subtree associated with each bit set in mask */
1061 	do {
1062 		digit = bitpos(mask);
1063 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1064 		    radix / BLIST_RADIX, tab);
1065 	} while ((mask ^= bitrange(digit, 1)) != 0);
1066 	tab -= 4;
1067 
1068 	printf(
1069 	    "%*.*s}\n",
1070 	    tab, tab, ""
1071 	);
1072 }
1073 
1074 #endif
1075 
1076 #ifdef BLIST_DEBUG
1077 
1078 int
1079 main(int ac, char **av)
1080 {
1081 	daddr_t size = BLIST_RADIX * BLIST_RADIX;
1082 	int i;
1083 	blist_t bl;
1084 	struct sbuf *s;
1085 
1086 	for (i = 1; i < ac; ++i) {
1087 		const char *ptr = av[i];
1088 		if (*ptr != '-') {
1089 			size = strtoll(ptr, NULL, 0);
1090 			continue;
1091 		}
1092 		ptr += 2;
1093 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1094 		exit(1);
1095 	}
1096 	bl = blist_create(size, M_WAITOK);
1097 	if (bl == NULL) {
1098 		fprintf(stderr, "blist_create failed\n");
1099 		exit(1);
1100 	}
1101 	blist_free(bl, 0, size);
1102 
1103 	for (;;) {
1104 		char buf[1024];
1105 		long long da = 0;
1106 		int count = 0, maxcount = 0;
1107 
1108 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1109 		    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
1110 		fflush(stdout);
1111 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1112 			break;
1113 		switch(buf[0]) {
1114 		case 'r':
1115 			if (sscanf(buf + 1, "%d", &count) == 1) {
1116 				blist_resize(&bl, count, 1, M_WAITOK);
1117 			} else {
1118 				printf("?\n");
1119 			}
1120 		case 'p':
1121 			blist_print(bl);
1122 			break;
1123 		case 's':
1124 			s = sbuf_new_auto();
1125 			blist_stats(bl, s);
1126 			sbuf_finish(s);
1127 			printf("%s", sbuf_data(s));
1128 			sbuf_delete(s);
1129 			break;
1130 		case 'a':
1131 			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1132 				daddr_t blk = blist_alloc(bl, &count, maxcount);
1133 				printf("    R=%08llx, c=%08d\n",
1134 				    (long long)blk, count);
1135 			} else {
1136 				printf("?\n");
1137 			}
1138 			break;
1139 		case 'f':
1140 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1141 				blist_free(bl, da, count);
1142 			} else {
1143 				printf("?\n");
1144 			}
1145 			break;
1146 		case 'l':
1147 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1148 				printf("    n=%jd\n",
1149 				    (intmax_t)blist_fill(bl, da, count));
1150 			} else {
1151 				printf("?\n");
1152 			}
1153 			break;
1154 		case '?':
1155 		case 'h':
1156 			puts(
1157 			    "p          -print\n"
1158 			    "s          -stats\n"
1159 			    "a %d %d    -allocate\n"
1160 			    "f %x %d    -free\n"
1161 			    "l %x %d    -fill\n"
1162 			    "r %d       -resize\n"
1163 			    "h/?        -help\n"
1164 			    "q          -quit"
1165 			);
1166 			break;
1167 		case 'q':
1168 			break;
1169 		default:
1170 			printf("?\n");
1171 			break;
1172 		}
1173 		if (buf[0] == 'q')
1174 			break;
1175 	}
1176 	return (0);
1177 }
1178 
1179 #endif
1180