/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The bitmap field in each node directs the search for available blocks.
 *	For a leaf node, a bit is set if the corresponding block is free.  For a
 *	meta node, a bit is set if the corresponding subtree contains a free
 *	block somewhere within it.  The search at a meta node considers only
 *	children of that node that represent a range that includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 *	structure, but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_META_RADIX lower-level nodes of some meta nodes may not be
 *	allocated.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */

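/*
 * Illustrative layout sketch (added for exposition, assuming the usual
 * 64-bit constants BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16): a
 * subtree covering radix = 1024 blocks occupies 17 consecutive array
 * slots, one meta node followed by its 16 leaves.  A subtree covering
 * radix = 16384 blocks is one meta node followed by 16 copies of that
 * 17-slot layout, so its children sit at indices 1, 18, 35, ... relative
 * to the subtree root; the stride (17 here) is the 'skip' value computed
 * by radix_to_skip() below.
 */
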
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
#define ummin(a,b)	((a) < (b) ? (a) : (b))

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
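
/*
 * Worked example (added for exposition, assuming BLIST_BMAP_RADIX = 64
 * and BLIST_META_RADIX = 16): for radix = 16384 the subtree has
 * L = 16384 / 64 = 256 = 16**2 leaves, so h = 2 and the node count is
 * 1 + 16 + 256 = 273.  The closed form above gives the same result:
 * 16384 / ((64 / 16) * 15) = 16384 / 60 = 273 by integer division.
 */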

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}

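/*
 * Example (added for exposition, assuming a 64-bit u_daddr_t):
 * bitrange(4, 8) computes (~0 << 4) & (~0 >> (64 - 12)), which is
 * 0x...fff0 & 0xfff = 0xff0, i.e. bits 4 through 11 set.
 */
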
/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}
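
/*
 * Example (added for exposition): bitpos(0x10) returns 4.  The binary
 * search narrows [lo, hi) from [0, 64), keeping at each probe the half in
 * which the shifted mask is still nonzero, and ends with lo = 4, hi = 5.
 */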

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	if (blocks == 0)
		panic("invalid block count");

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
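
/*
 * Sizing example (added for exposition, assuming BLIST_BMAP_RADIX = 64
 * and BLIST_META_RADIX = 16): blist_create(2048, ...) runs the loop
 * twice.  With radix = 64, nodes grows by 1 + 2047 / 64 = 32 to 33; with
 * radix = 1024, by 1 + 2047 / 1024 = 2 to 35.  The loop exits at
 * radix = 16384 > 2048, so 35 nodes are allocated and bl_radix becomes
 * 16384, the smallest subtree radix that covers all 2048 blocks.
 */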

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	if (count > BLIST_MAX_ALLOC)
		panic("allocation too large");

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= count;
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}
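
/*
 * Typical call pattern (an illustrative sketch, modeled on the standalone
 * test driver at the bottom of this file rather than on any particular
 * kernel consumer): a new blist starts fully allocated, so the creator
 * frees the usable range before handing out blocks.
 *
 *	blist_t bl = blist_create(nblocks, M_WAITOK);
 *	blist_free(bl, 0, nblocks);
 *	daddr_t blk = blist_alloc(bl, 16);
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, 16);
 *	blist_destroy(bl);
 */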

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	if (blkno < 0 || blkno + count > bl->bl_blocks)
		panic("freeing invalid range");
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	if (blkno < 0 || blkno + count > bl->bl_blocks)
		panic("filling invalid range");
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits' (as opposed
 *                           to 'skipping 0 bits')?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
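
/*
 * Note on the closing arithmetic above (added for exposition): avg and
 * err maintain the invariant err = sum - num * avg, i.e.
 * avg = floor(sum / num), without storing the full sum.  For gaps of
 * sizes 5 then 8: after the first, num = 1, avg = 5, err = 0; after the
 * second, err = 3, num = 2, avg = 5 + 3/2 = 6, err = 3 % 2 = 1, which is
 * floor(13 / 2) with remainder 1.
 */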

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
 *
 *	'scan' is a leaf node, associated with a block containing 'blk'.
 *	The next leaf node could be adjacent, or several nodes away if the
 *	least common ancestor of 'scan' and its neighbor is several levels
 *	up.  Use 'blk' to determine how many meta-nodes lie between the
 *	leaves.  If the next leaf has enough initial bits set, clear them
 *	and clear the bits in the meta nodes on the path up to the least
 *	common ancestor to mark any subtrees made completely empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	blmeta_t *next;
	daddr_t skip;
	u_daddr_t radix;
	int digit;

	next = scan + 1;
	blk += BLIST_BMAP_RADIX;
	radix = BLIST_BMAP_RADIX;
	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
	    (next->bm_bitmap & 1) == 1) {
		next++;
		radix *= BLIST_META_RADIX;
	}
	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
		/*
		 * The next leaf doesn't have enough free blocks at the
		 * beginning to complete the spanning allocation.
		 */
		return (ENOMEM);
	}
	/* Clear the first 'count' bits in the next leaf to allocate. */
	next->bm_bitmap &= (u_daddr_t)-1 << count;

	/*
	 * Update bitmaps of next-ancestors, up to least common ancestor.
	 */
	skip = radix_to_skip(radix);
	while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) {
		(--next)->bm_bitmap ^= 1;
		radix /= BLIST_META_RADIX;
	}
	if (next->bm_bitmap == 0)
		scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit;
	return (0);
}

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->bm_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i + range1] are
		 * set in scan->bm_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts
		 * to 0, while preserving these invariants.  The updates to
		 * mask leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->bm_bitmap.  If more updates to mask cannot clear more
		 * bits, because mask is partitioned with all 0 bits preceding
		 * all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all
		 * the blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that
	 * one, and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a
		 * successful allocation depends on the next leaf providing
		 * some of the blocks.
		 */
		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
			/*
			 * The hint cannot be updated, because the same
			 * allocation request could be satisfied later, by this
			 * leaf, if the state of the next leaf changes, and
			 * without any changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
	}
	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}
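
/*
 * Worked example of the mask-folding loop above (added for exposition):
 * with count = 3 and an 8-bit slice of the bitmap mask = 01110110 (free
 * runs of lengths 3 and 2), count1 = 2 and num_shifts = fls(2) = 2.  Pass
 * one ANDs mask with itself shifted right by 1: 01110110 & 00111011 =
 * 00110010.  Pass two shifts by 1 again: 00110010 & 00011001 = 00010000.
 * The surviving bit, position 4, is exactly the start of the only run of
 * 3 consecutive free blocks, and range1 ends at 2 = count1.
 */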

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t bit, mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		bit = mask & -mask;
		digit = bitpos(bit);
		i = 1 + digit * skip;
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bit;
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bit) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}
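
/*
 * Concrete index arithmetic (added for exposition, assuming
 * BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16): entering with
 * radix = 1024, the children each cover 64 blocks and
 * skip = radix_to_skip(64) = 1, so child 'digit' is the node at
 * scan[1 + digit].  Entering with radix = 16384, the children cover
 * 1024 blocks each, skip = radix_to_skip(1024) = 17, and child 'digit'
 * sits at scan[1 + digit * 17].
 */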

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	if (scan->bm_bitmap & mask)
		panic("freeing free block");
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}
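
/*
 * Note on the endBlk clamp above (added for exposition): with
 * radix = 1024 and freeBlk = 1000, (freeBlk + radix) & -radix = 1024,
 * the end of the aligned 1024-block region containing freeBlk, so the
 * per-child loop can never walk past the range covered by this node.
 */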

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t bit, mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		bit = mask & -mask;
		digit = bitpos(bit);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bit) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif
1147