/*-
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 * 	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking features of the blist code are used in the swap code
 *	(vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower-level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time even if it
 *	must be encompassed in a larger root-node radix.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */

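/*
 * For illustration (assuming the usual 64-bit constants, BLIST_BMAP_RADIX
 * == 64 and BLIST_META_RADIX == 16): a tree with radix 1024 occupies 17
 * contiguous blmeta_t slots, the meta node covering blocks [0, 1024) at
 * index 0 followed by 16 leaf bitmaps covering blocks [0, 64), [64, 128),
 * and so on.  A scan steps over an entire subtree by advancing its index
 * by the subtree's 'skip' value (17 here, 1 for a leaf).
 */
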
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
static __inline int imax(int a, int b) { return (a > b ? a : b); }

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
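
/*
 * Worked example (illustrative, assuming BLIST_BMAP_RADIX == 64 and
 * BLIST_META_RADIX == 16): radix_to_skip(1024) = 1024 / ((64 / 16) * 15) =
 * 1024 / 60 = 17, matching the closed form (16**2 - 1) / 15 = 17 for a
 * height-1 subtree: one meta node plus 16 leaves.
 */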

/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}
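
/*
 * For example, bitpos((u_daddr_t)1 << 42) == 42.  The binary search keeps
 * the set bit's position within [lo, hi) and halves that interval until
 * only lo remains; the ffsll() path, where available, computes the same
 * result directly.
 */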

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t nodes, radix;

	/*
	 * Calculate the radix field used for scanning.
	 */
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
	}
	nodes = 1 + blst_radix_init(NULL, radix, blocks);

	bl = malloc(sizeof(struct blist), M_SWAP, flags);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;
	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
	if (bl->bl_root == NULL) {
		free(bl, M_SWAP);
		return (NULL);
	}
	blst_radix_init(bl->bl_root, radix, blocks);

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{
	free(bl->bl_root, M_SWAP);
	free(bl, M_SWAP);
}
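
/*
 * A minimal usage sketch (illustrative only, not compiled in; assumes the
 * typical 64-bit radix constants and a context where M_WAITOK is allowed):
 *
 *	blist_t bl;
 *	daddr_t blk;
 *
 *	bl = blist_create(1000, M_WAITOK);	// radix rounds up to 1024
 *	blist_free(bl, 0, 1000);		// new blists start all-allocated
 *	blk = blist_alloc(bl, 16);		// reserve 16 contiguous blocks
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, 16);	// release them again
 *	blist_destroy(bl);
 *
 * A freshly created blist has every block marked allocated; callers free
 * the usable range first, as the debug main() below does.
 */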

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits', as opposed to
 *                           'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
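
/*
 * Two techniques above deserve a worked example (illustrative numbers):
 *
 * Bucket search: for size == 100, the galloping loop probes fib[1], fib[2],
 * fib[4], fib[8] == 55 (all <= 100), stops at fib[16], and the binary
 * search narrows (8, 16) down to lo == 9, since fib[9] == 89 <= 100 <
 * fib[10] == 144.
 *
 * Running average: avg and err together maintain the exact mean in integer
 * arithmetic, preserving the invariant avg * num + err == sum.  After gaps
 * of sizes 3 and 6: first err = 3, num = 1, avg = 3, err = 0; then
 * err = 6 - 3 = 3, num = 2, avg = 3 + 3/2 = 4, err = 3 % 2 = 1, and
 * 4 * 2 + 1 == 9, the true sum.
 */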

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].u.bmu_bitmap;
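			/*
			 * XORing the bitmap with a shifted copy of itself
			 * leaves a 1 at each position where a run of free
			 * (1) or allocated (0) bits begins.  For example,
			 * mask 0b00111100 gives diff 0b01000100: bit 2
			 * starts the free run and bit 6 the allocated run
			 * that follows it.
			 */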
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
		 * while preserving these invariants.  The updates to mask leave
		 * fewer bits set, but each bit that remains set represents a
		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
		 * If more updates to mask cannot clear more bits, because mask
		 * is partitioned with all 0 bits preceding all 1 bits, the loop
		 * terminates immediately.
		 */
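		/*
		 * Worked example (illustrative): for count == 3, count1 == 2
		 * and num_shifts == 2.  The first pass folds mask with a
		 * shift of 1 (range1 becomes 1), the second with a shift of
		 * 1 again (range1 becomes 2), so any bit that survives marks
		 * the start of 3 consecutive set bits in the original map.
		 */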
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all the
		 * blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a successful
		 * allocation depends on the next leaf providing some of the blocks.
		 */
		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
			/*
			 * The next leaf has a different meta-node parent, so it
			 * is not necessarily initialized.  Update bighint,
			 * comparing the range found at the end of mask to the
			 * largest earlier range that could have been made to
			 * vanish in the initial processing of mask.
			 */
			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
			return (SWAPBLK_NONE);
		}
		hi -= BLIST_BMAP_RADIX;
		if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
			/*
			 * The next leaf doesn't have enough free blocks at the
			 * beginning to complete the spanning allocation.  The
			 * hint cannot be updated, because the same allocation
			 * request could be satisfied later, by this leaf, if
			 * the state of the next leaf changes, and without any
			 * changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		}
		/* Clear the first 'hi' bits in the next leaf, allocating them. */
		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
		/* If this allocation uses all the bits, clear the hint. */
		if (mask == scan->u.bmu_bitmap)
			scan->bm_bighint = 0;
	}
	/* Clear the allocated bits from this leaf. */
	scan->u.bmu_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	blk = cursor & -radix;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int n;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
	if (scan->u.bmu_bitmap & mask)
		panic("freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
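/*
 * A sketch of the recursive decomposition (illustrative numbers, assuming
 * child radix 64 under a radix-1024 meta node): freeing count == 200
 * blocks at freeBlk == 100 starts in child 1 (blocks [64, 128)), frees
 * the v == 28 blocks [100, 128) there, and then walks whole and partial
 * pieces of children 2 through 4 until the count is exhausted.
 */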
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;
	int n;

	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}
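
/*
 * For instance (illustrative values): with blk % BLIST_BMAP_RADIX == 2 and
 * count == 3, the mask above selects bits 2 through 4.  If only bit 3 was
 * still free, bitcount64() reports nblks == 1: filling the range allocated
 * one previously free block, and the other two were already allocated.
 */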

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 */
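/*
 * Sizing sketch (illustrative, assuming BLIST_BMAP_RADIX == 64 and
 * BLIST_META_RADIX == 16): for count == 100, the radix rounds up to 1024,
 * but the recursion below touches only the root, two leaves for blocks
 * [0, 128), and a terminator in the third leaf slot, so blist_create()
 * allocates 1 + 3 == 4 blmeta_t structures rather than the full 17.
 */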
static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
{
	daddr_t i, memindex, next_skip, skip;

	memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return (memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; i < skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    radix);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    count);
			count = 0;
		} else {
			/*
			 * Add terminator and break out.  Make terminator bitmap
			 * zero to avoid a spanning leaf allocation that
			 * includes the terminator.
			 */
			if (scan) {
				scan[i].bm_bighint = (daddr_t)-1;
				scan[i].u.bmu_bitmap = 0;
			}
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return (memindex);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
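			/* FALLTHROUGH */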
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif