xref: /freebsd/sys/kern/subr_blist.c (revision da3086df1863e9aed45eb14e265c306f8bb27d80)
1 /*-
2  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
3  * Redistribution and use in source and binary forms, with or without
4  * modification, are permitted provided that the following conditions
5  * are met:
6  * 1. Redistributions of source code must retain the above copyright
7  *    notice, this list of conditions and the following disclaimer.
8  * 2. Redistributions in binary form must reproduce the above copyright
9  *    notice, this list of conditions and the following disclaimer in the
10  *    documentation and/or other materials provided with the distribution.
11  * 3. Neither the name of the University nor the names of its contributors
12  *    may be used to endorse or promote products derived from this software
13  *    without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
19  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
21  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 /*
28  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
29  *
30  *	This module implements a general bitmap allocator/deallocator.  The
31  *	allocator eats around 2 bits per 'block'.  The module does not
32  *	try to interpret the meaning of a 'block' other than to return
33  *	SWAPBLK_NONE on an allocation failure.
34  *
35  *	A radix tree is used to maintain the bitmap.  Two radix constants are
36  *	involved:  One for the bitmaps contained in the leaf nodes (typically
37  *	64), and one for the meta nodes (typically 16).  Both meta and leaf
38  *	nodes have a hint field.  This field gives us a hint as to the largest
39  *	free contiguous range of blocks under the node.  It may contain a
40  *	value that is too high, but will never contain a value that is too
41  *	low.  When the radix tree is searched, allocation failures in subtrees
42  *	update the hint.
43  *
44  *	The radix tree also implements two collapsed states for meta nodes:
45  *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
46  *	in either of these two states, all information contained underneath
47  *	the node is considered stale.  These states are used to optimize
48  *	allocation and freeing operations.
49  *
50  * 	The hinting greatly increases code efficiency for allocations while
51  *	the general radix structure optimizes both allocations and frees.  The
52  *	radix tree should be able to operate well no matter how much
53  *	fragmentation there is and no matter how large a bitmap is used.
54  *
55  *	The blist code wires all necessary memory at creation time.  Neither
56  *	allocations nor frees require interaction with the memory subsystem.
57  *	The non-blocking features of the blist code are used in the swap code
58  *	(vm/swap_pager.c).
59  *
60  *	LAYOUT: The radix tree is laid out recursively using a
61  *	linear array.  Each meta node is immediately followed (laid out
62  *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
63  *	is a recursive structure but one that can be easily scanned through
64  *	a very simple 'skip' calculation.  In order to support large radixes,
65  *	portions of the tree may reside outside our memory allocation.  We
66  *	handle this with an early-termination optimization (when bighint is
67  *	set to -1) on the scan.  The memory allocation is only large enough
68  *	to cover the number of blocks requested at creation time even if it
 *	must be encompassed by a larger root-node radix.
70  *
71  *	NOTE: the allocator cannot currently allocate more than
72  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
73  *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
75  *	system, though.  Currently only the allocation code is affected by
76  *	this algorithmic unfeature.  The freeing code can handle arbitrary
77  *	ranges.
78  *
79  *	This code can be compiled stand-alone for debugging.
80  */
81 
82 #include <sys/cdefs.h>
83 __FBSDID("$FreeBSD$");
84 
85 #ifdef _KERNEL
86 
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/lock.h>
90 #include <sys/kernel.h>
91 #include <sys/blist.h>
92 #include <sys/malloc.h>
93 #include <sys/sbuf.h>
94 #include <sys/proc.h>
95 #include <sys/mutex.h>
96 
97 #else
98 
99 #ifndef BLIST_NO_DEBUG
100 #define BLIST_DEBUG
101 #endif
102 
103 #include <sys/types.h>
104 #include <sys/malloc.h>
105 #include <sys/sbuf.h>
106 #include <stdio.h>
107 #include <string.h>
108 #include <stdlib.h>
109 #include <stdarg.h>
110 #include <stdbool.h>
111 
112 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
113 #define malloc(a,b,c)	calloc(a, 1)
114 #define free(a,b)	free(a)
115 #define CTASSERT(expr)
116 
117 #include <sys/blist.h>
118 
119 void panic(const char *ctl, ...);
120 
121 #endif
122 
123 /*
124  * static support functions
125  */
126 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
127 		    daddr_t cursor);
128 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
129 		    u_daddr_t radix);
130 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
131 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
132 		    u_daddr_t radix);
133 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
134 		    blist_t dest, daddr_t count);
135 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
136 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
137 		    u_daddr_t radix);
138 static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
139 #ifndef _KERNEL
140 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
141 		    int tab);
142 #endif
143 
144 #ifdef _KERNEL
145 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
146 #endif
147 
148 CTASSERT(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0);
149 #define	BLIST_META_MASK	(BLIST_META_RADIX - 1)
150 
151 /*
152  * For a subtree that can represent the state of up to 'radix' blocks, the
153  * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
154  * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
155  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
156  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
157  * in the 'meta' functions that process subtrees.  Since integer division
158  * discards remainders, we can express this computation as
159  * skip = (m * m**h) / (m - 1)
160  * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
161  * and since m divides BLIST_BMAP_RADIX, we can simplify further to
162  * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
163  * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
164  * so that simple integer division by a constant can safely be used for the
165  * calculation.
166  */
167 static inline daddr_t
168 radix_to_skip(daddr_t radix)
169 {
170 
171 	return (radix /
172 	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
173 }
174 
175 /*
176  * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
177  * Assumes that the argument has only one bit set.
178  */
179 static inline int
180 bitpos(u_daddr_t mask)
181 {
182 	int hi, lo, mid;
183 
184 	switch (sizeof(mask)) {
185 #ifdef HAVE_INLINE_FFSLL
186 	case sizeof(long long):
187 		return (ffsll(mask) - 1);
188 #endif
189 	default:
190 		lo = 0;
191 		hi = BLIST_BMAP_RADIX;
192 		while (lo + 1 < hi) {
193 			mid = (lo + hi) >> 1;
194 			if ((mask >> mid) != 0)
195 				lo = mid;
196 			else
197 				hi = mid;
198 		}
199 		return (lo);
200 	}
201 }
202 
/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.  Returns NULL if either
 *	allocation fails (possible when 'flags' permits malloc failure).
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t nodes, radix;

	/*
	 * Calculate the radix field used for scanning: the smallest
	 * BLIST_META_RADIX-power multiple of BLIST_BMAP_RADIX that
	 * covers 'blocks'.
	 */
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
	}
	/*
	 * First pass with a NULL scan pointer only sizes the tree; the
	 * call after the allocation below initializes the nodes.
	 */
	nodes = 1 + blst_radix_init(NULL, radix, blocks);

	bl = malloc(sizeof(struct blist), M_SWAP, flags);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;
	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
	if (bl->bl_root == NULL) {
		free(bl, M_SWAP);
		return (NULL);
	}
	blst_radix_init(bl->bl_root, radix, blocks);

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
256 
257 void
258 blist_destroy(blist_t bl)
259 {
260 	free(bl->bl_root, M_SWAP);
261 	free(bl, M_SWAP);
262 }
263 
/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			/*
			 * Advance the cursor past the allocation so the
			 * next search resumes there, wrapping to 0 at the
			 * end of the managed range.
			 */
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			/* Retry once from the beginning of the bitmap. */
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}
293 
294 /*
295  * blist_avail() -	return the number of free blocks.
296  */
297 daddr_t
298 blist_avail(blist_t bl)
299 {
300 
301 	if (bl->bl_radix == BLIST_BMAP_RADIX)
302 		return (bitcount64(bl->bl_root->u.bmu_bitmap));
303 	else
304 		return (bl->bl_root->u.bmu_avail);
305 }
306 
/*
 * blist_free() -	free up space in the block bitmap.  The range
 *			[blkno, blkno + count) must currently be allocated;
 *			the underlying leaf code panics if an inconsistency
 *			(double free) is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}
318 
/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}
331 
332 /*
333  * blist_resize() -	resize an existing radix tree to handle the
334  *			specified number of blocks.  This will reallocate
335  *			the tree and transfer the previous bitmap to the new
336  *			one.  When extending the tree you can specify whether
337  *			the new blocks are to left allocated or freed.
338  */
339 void
340 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
341 {
342     blist_t newbl = blist_create(count, flags);
343     blist_t save = *pbl;
344 
345     *pbl = newbl;
346     if (count > save->bl_blocks)
347 	    count = save->bl_blocks;
348     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
349 
350     /*
351      * If resizing upwards, should we free the new space or not?
352      */
353     if (freenew && count < newbl->bl_blocks) {
354 	    blist_free(newbl, count, newbl->bl_blocks - count);
355     }
356     blist_destroy(save);
357 }
358 
359 #ifdef BLIST_DEBUG
360 
/*
 * blist_print()    - dump the radix tree for debugging (compiled only
 *		      when BLIST_DEBUG is defined).
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}
371 
372 #endif
373 
/*
 * Bucket boundaries for the gap-size histogram below: a gap of 'size'
 * blocks is counted in bucket i when fib[i] <= size < fib[i + 1].
 */
static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};
392 
/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *                           or 'skipping 0 bits'?
 *
 *	True while a gap is "open": its first free block has been seen
 *	but its end has not yet been reached.
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}
403 
/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	/* SWAPBLK_NONE means "no gap currently open". */
	stats->start = SWAPBLK_NONE;
}
414 
/*
 * update_gap_stats()    - update stats on gap sizes
 *
 *	Called at each transition between allocated and free blocks.
 *	The first call (with the gap's start position) opens a gap; the
 *	matching second call closes it and records its size.
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		/* Start of a gap; just remember where it begins. */
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the fibonacci range that contains size,
	 * expecting to find it in an early range.  Gallop (doubling
	 * 'hi') to bracket the bucket, then binary search within the
	 * bracket.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	/*
	 * Integer-only running average: 'err' carries the accumulated
	 * remainder (sum - num * avg), so 'avg' tracks the true mean
	 * without floating point.
	 */
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
460 
/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	/*
	 * Buckets below max_bucket have fixed fibonacci boundaries
	 * [fib[i], fib[i + 1]); print each nonempty one.
	 */
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	/*
	 * The last bucket (i == max_bucket here) is open-ended; its
	 * effective upper bound is the largest gap actually observed.
	 */
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}
495 
/*
 * blist_stats()    - dump radix tree stats
 *
 *	Walks the whole tree, locating maximal runs of free blocks
 *	("gaps") while exploiting the collapsed ALL-ALLOCATED/ALL-FREE
 *	meta-node states to skip entire subtrees, then writes a report
 *	into the supplied sbuf.
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	/*
	 * Bias 'i' by bl_radix so subtree alignment can be tested with
	 * simple modular arithmetic below.
	 */
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				/* ALL-ALLOCATED: any open gap ends here. */
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				/* ALL-FREE: open a gap if none is open. */
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.  'diff' has a bit set at each
			 * position where the bitmap transitions between
			 * free and allocated runs.
			 */
			mask = bl->bl_root[nodes].u.bmu_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	/* Close any gap still open at the end of the bitmap. */
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}
559 
560 /************************************************************************
561  *			  ALLOCATION SUPPORT FUNCTIONS			*
562  ************************************************************************
563  *
564  *	These support functions do all the actual work.  They may seem
565  *	rather longish, but that's because I've commented them up.  The
566  *	actual code is straight forward.
567  *
568  */
569 
/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
{
	u_daddr_t mask;
	int count1, lo, num_shifts, range1, range_ext;

	if (count == BLIST_BMAP_RADIX) {
		/*
		 * Optimize allocation of BLIST_BMAP_RADIX bits.  If this wasn't
		 * a special case, then forming the final value of 'mask' below
		 * would require special handling to avoid an invalid left shift
		 * when count equals the number of bits in mask.
		 */
		if (~scan->u.bmu_bitmap != 0) {
			scan->bm_bighint = BLIST_BMAP_RADIX - 1;
			return (SWAPBLK_NONE);
		}
		/* A full-leaf allocation must start at the leaf itself. */
		if (cursor != blk)
			return (SWAPBLK_NONE);
		scan->u.bmu_bitmap = 0;
		scan->bm_bighint = 0;
		return (blk);
	}
	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while (mask != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
		 * while preserving these invariants.  The updates to mask leave
		 * fewer bits set, but each bit that remains set represents a
		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		mask &= mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/*
	 * Discard any candidates that appear before the cursor.
	 */
	lo = cursor - blk;
	mask &= ~(u_daddr_t)0 << lo;

	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	/*
	 * Set in mask exactly the bits being allocated, and clear them from
	 * the set of available bits.
	 */
	mask = (mask << count) - mask;
	scan->u.bmu_bitmap &= ~mask;
	return (blk + lo);
}
652 
/*
 * BLST_META_ALLOC() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	/* 'blk' is the first block covered by this subtree. */
	blk = cursor & -radix;
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, blk, count, cursor));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	/* Begin scanning at the subtree containing the cursor. */
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 * The hint is only safe to reduce when the entire subtree was
	 * searched, i.e. the scan began at this subtree's first block.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}
743 
744 /*
745  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
746  *
747  */
748 static void
749 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
750 {
751 	/*
752 	 * free some data in this bitmap
753 	 *
754 	 * e.g.
755 	 *	0000111111111110000
756 	 *          \_________/\__/
757 	 *		v        n
758 	 */
759 	int n = blk & (BLIST_BMAP_RADIX - 1);
760 	u_daddr_t mask;
761 
762 	mask = ((u_daddr_t)-1 << n) &
763 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
764 
765 	if (scan->u.bmu_bitmap & mask)
766 		panic("blst_radix_free: freeing free block");
767 	scan->u.bmu_bitmap |= mask;
768 
769 	/*
770 	 * We could probably do a better job here.  We are required to make
771 	 * bighint at least as large as the biggest contiguous block of
772 	 * data.  If we just shoehorn it, a little extra overhead will
773 	 * be incured on the next allocation (but only that one typically).
774 	 */
775 	scan->bm_bighint = BLIST_BMAP_RADIX;
776 }
777 
/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix)  {
			/*
			 * The children of an ALL-ALLOCATED node are stale;
			 * reset them to all-allocated before descending.
			 */
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		/* Free only the portion that lies in the i'th subtree. */
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}
861 
/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			/* Entire leaf free: free the whole span at once. */
			blist_free(dest, blk, count);
		} else if (v != 0) {
			/* NB: this 'i' shadows the outer daddr_t 'i'. */
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}


	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		/* A terminator means no more initialized subtrees. */
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}
936 
937 /*
938  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
939  *
940  *	This routine allocates all blocks in the specified range
941  *	regardless of any existing allocations in that range.  Returns
942  *	the number of blocks allocated by the call.
943  */
944 static daddr_t
945 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
946 {
947 	int n = blk & (BLIST_BMAP_RADIX - 1);
948 	daddr_t nblks;
949 	u_daddr_t mask;
950 
951 	mask = ((u_daddr_t)-1 << n) &
952 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
953 
954 	/* Count the number of blocks that we are allocating. */
955 	nblks = bitcount64(scan->u.bmu_bitmap & mask);
956 
957 	scan->u.bmu_bitmap &= ~mask;
958 	return (nblks);
959 }
960 
/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0)  {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		/* Fill only the portion that lies in the i'th subtree. */
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}
1038 
1039 /*
1040  * BLST_RADIX_INIT() - initialize radix tree
1041  *
1042  *	Initialize our meta structures and bitmaps and calculate the exact
1043  *	amount of space required to manage 'count' blocks - this space may
1044  *	be considerably less than the calculated radix due to the large
1045  *	RADIX values we use.
1046  */
1047 static daddr_t
1048 blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
1049 {
1050 	daddr_t i, memindex, next_skip, skip;
1051 
1052 	memindex = 0;
1053 
1054 	/*
1055 	 * Leaf node
1056 	 */
1057 
1058 	if (radix == BLIST_BMAP_RADIX) {
1059 		if (scan) {
1060 			scan->bm_bighint = 0;
1061 			scan->u.bmu_bitmap = 0;
1062 		}
1063 		return (memindex);
1064 	}
1065 
1066 	/*
1067 	 * Meta node.  If allocating the entire object we can special
1068 	 * case it.  However, we need to figure out how much memory
1069 	 * is required to manage 'count' blocks, so we continue on anyway.
1070 	 */
1071 
1072 	if (scan) {
1073 		scan->bm_bighint = 0;
1074 		scan->u.bmu_avail = 0;
1075 	}
1076 
1077 	skip = radix_to_skip(radix);
1078 	next_skip = skip / BLIST_META_RADIX;
1079 	radix /= BLIST_META_RADIX;
1080 
1081 	for (i = 1; i < skip; i += next_skip) {
1082 		if (count >= radix) {
1083 			/*
1084 			 * Allocate the entire object
1085 			 */
1086 			memindex = i +
1087 			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
1088 			    radix);
1089 			count -= radix;
1090 		} else if (count > 0) {
1091 			/*
1092 			 * Allocate a partial object
1093 			 */
1094 			memindex = i +
1095 			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
1096 			    count);
1097 			count = 0;
1098 		} else {
1099 			/*
1100 			 * Add terminator and break out
1101 			 */
1102 			if (scan)
1103 				scan[i].bm_bighint = (daddr_t)-1;
1104 			break;
1105 		}
1106 	}
1107 	if (memindex < i)
1108 		memindex = i;
1109 	return (memindex);
1110 }
1111 
1112 #ifdef BLIST_DEBUG
1113 
1114 static void
1115 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1116 {
1117 	daddr_t i, next_skip, skip;
1118 
1119 	if (radix == BLIST_BMAP_RADIX) {
1120 		printf(
1121 		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
1122 		    tab, tab, "",
1123 		    (long long)blk, (long long)radix,
1124 		    (long long)scan->u.bmu_bitmap,
1125 		    (long long)scan->bm_bighint
1126 		);
1127 		return;
1128 	}
1129 
1130 	if (scan->u.bmu_avail == 0) {
1131 		printf(
1132 		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
1133 		    tab, tab, "",
1134 		    (long long)blk,
1135 		    (long long)radix
1136 		);
1137 		return;
1138 	}
1139 	if (scan->u.bmu_avail == radix) {
1140 		printf(
1141 		    "%*.*s(%08llx,%lld) ALL FREE\n",
1142 		    tab, tab, "",
1143 		    (long long)blk,
1144 		    (long long)radix
1145 		);
1146 		return;
1147 	}
1148 
1149 	printf(
1150 	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
1151 	    tab, tab, "",
1152 	    (long long)blk, (long long)radix,
1153 	    (long long)scan->u.bmu_avail,
1154 	    (long long)radix,
1155 	    (long long)scan->bm_bighint
1156 	);
1157 
1158 	skip = radix_to_skip(radix);
1159 	next_skip = skip / BLIST_META_RADIX;
1160 	radix /= BLIST_META_RADIX;
1161 	tab += 4;
1162 
1163 	for (i = 1; i < skip; i += next_skip) {
1164 		if (scan[i].bm_bighint == (daddr_t)-1) {
1165 			printf(
1166 			    "%*.*s(%08llx,%lld): Terminator\n",
1167 			    tab, tab, "",
1168 			    (long long)blk, (long long)radix
1169 			);
1170 			break;
1171 		}
1172 		blst_radix_print(&scan[i], blk, radix, tab);
1173 		blk += radix;
1174 	}
1175 	tab -= 4;
1176 
1177 	printf(
1178 	    "%*.*s}\n",
1179 	    tab, tab, ""
1180 	);
1181 }
1182 
1183 #endif
1184 
1185 #ifdef BLIST_DEBUG
1186 
1187 int
1188 main(int ac, char **av)
1189 {
1190 	int size = 1024;
1191 	int i;
1192 	blist_t bl;
1193 	struct sbuf *s;
1194 
1195 	for (i = 1; i < ac; ++i) {
1196 		const char *ptr = av[i];
1197 		if (*ptr != '-') {
1198 			size = strtol(ptr, NULL, 0);
1199 			continue;
1200 		}
1201 		ptr += 2;
1202 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1203 		exit(1);
1204 	}
1205 	bl = blist_create(size, M_WAITOK);
1206 	blist_free(bl, 0, size);
1207 
1208 	for (;;) {
1209 		char buf[1024];
1210 		long long da = 0;
1211 		long long count = 0;
1212 
1213 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1214 		    (long long)size, (long long)bl->bl_radix);
1215 		fflush(stdout);
1216 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1217 			break;
1218 		switch(buf[0]) {
1219 		case 'r':
1220 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1221 				blist_resize(&bl, count, 1, M_WAITOK);
1222 			} else {
1223 				printf("?\n");
1224 			}
1225 		case 'p':
1226 			blist_print(bl);
1227 			break;
1228 		case 's':
1229 			s = sbuf_new_auto();
1230 			blist_stats(bl, s);
1231 			sbuf_finish(s);
1232 			printf("%s", sbuf_data(s));
1233 			sbuf_delete(s);
1234 			break;
1235 		case 'a':
1236 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1237 				daddr_t blk = blist_alloc(bl, count);
1238 				printf("    R=%08llx\n", (long long)blk);
1239 			} else {
1240 				printf("?\n");
1241 			}
1242 			break;
1243 		case 'f':
1244 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1245 				blist_free(bl, da, count);
1246 			} else {
1247 				printf("?\n");
1248 			}
1249 			break;
1250 		case 'l':
1251 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1252 				printf("    n=%jd\n",
1253 				    (intmax_t)blist_fill(bl, da, count));
1254 			} else {
1255 				printf("?\n");
1256 			}
1257 			break;
1258 		case '?':
1259 		case 'h':
1260 			puts(
1261 			    "p          -print\n"
1262 			    "s          -stats\n"
1263 			    "a %d       -allocate\n"
1264 			    "f %x %d    -free\n"
1265 			    "l %x %d    -fill\n"
1266 			    "r %d       -resize\n"
1267 			    "h/?        -help"
1268 			);
1269 			break;
1270 		default:
1271 			printf("?\n");
1272 			break;
1273 		}
1274 	}
1275 	return(0);
1276 }
1277 
/*
 * panic() - user-space stand-in for the kernel panic(): print the
 * formatted message and a newline on stderr, then exit with failure.
 */
void
panic(const char *ctl, ...)
{
	va_list ap;

	va_start(ap, ctl);
	vfprintf(stderr, ctl, ap);
	va_end(ap);
	fprintf(stderr, "\n");
	exit(1);
}
1289 
1290 #endif
1291