/*-
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree is used to maintain the bitmap.  Two radix constants are
 *	involved:  One for the bitmaps contained in the leaf nodes (typically
 *	32), and one for the meta nodes (typically 16).  Both meta and leaf
 *	nodes have a hint field.  This field gives us a hint as to the largest
 *	free contiguous range of blocks under the node.  It may contain a
 *	value that is too high, but will never contain a value that is too
 *	low.  When the radix tree is searched, allocation failures in subtrees
 *	update the hint.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking features of the blist code are used in the swap code
 *	(vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time even if it
 *	must be encompassed in a larger root-node radix.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
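
/*
 *	LAYOUT EXAMPLE (illustrative only; it assumes a leaf radix of 32 and a
 *	meta radix of 16, the 'typical' values mentioned above; the real
 *	constants come from sys/blist.h):
 *
 *	A tree covering 512 blocks has radix 512 and skip 16: the root meta
 *	node sits at index 0 and its 16 leaf children at indices 1-16.  A
 *	tree covering 8192 blocks has radix 8192 and skip (16 + 1) * 16 = 272:
 *	next_skip is 272 / 16 = 17, so the root's children sit at indices
 *	1, 18, 35, ..., each one a meta node immediately followed by its own
 *	16 leaves.
 */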

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */

static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
		    daddr_t radix, daddr_t skip, daddr_t cursor);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
					daddr_t radix, int skip, daddr_t blk);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
				daddr_t skip, blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
				daddr_t radix, int skip, daddr_t blk);
static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix,
						int skip, daddr_t count);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk,
					daddr_t radix, int skip, int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
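
/*
 *	For example (illustrative only, again assuming a leaf radix of 32 and
 *	a meta radix of 16): a request for 1000 blocks enters the loop below
 *	with radix 32 and skip 0, which the first pass grows to radix 512,
 *	skip 16 and the second pass to radix 8192, skip 272, at which point
 *	radix >= blocks and the loop stops.  The tree can then describe up to
 *	8192 blocks, but blst_radix_init() sizes the node array for only the
 *	first 1000.
 */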

blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t nodes, radix;
	int skip = 0;

	/*
	 * Calculate radix and skip field used for scanning.
	 */
	radix = BLIST_BMAP_RADIX;

	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
		skip = (skip + 1) * BLIST_META_RADIX;
	}

	bl = malloc(sizeof(struct blist), M_SWAP, flags);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_skip = skip;
	bl->bl_cursor = 0;
	nodes = 1 + blst_radix_init(NULL, radix, bl->bl_skip, blocks);
	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
	if (bl->bl_root == NULL) {
		free(bl, M_SWAP);
		return (NULL);
	}
	blst_radix_init(bl->bl_root, radix, bl->bl_skip, blocks);

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{
	free(bl->bl_root, M_SWAP);
	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
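
/*
 *	Minimal usage sketch (illustrative only, not taken from an in-tree
 *	consumer; error handling is elided).  A freshly created blist has
 *	every block marked allocated, so the caller first frees the range it
 *	wants to manage:
 *
 *		blist_t bl = blist_create(1024, M_WAITOK);
 *		blist_free(bl, 0, 1024);
 *		daddr_t blk = blist_alloc(bl, 16);
 *		if (blk != SWAPBLK_NONE)
 *			blist_free(bl, blk, 16);
 *		blist_destroy(bl);
 */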

daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blk = blst_leaf_alloc(bl->bl_root, 0, count);
		else
			blk = blst_meta_alloc(bl->bl_root, 0, count,
			    bl->bl_radix, bl->bl_skip, bl->bl_cursor);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}

/*
 * blist_avail() -	return the number of free blocks.
 */

daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency is found.
 */

void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{
	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blst_leaf_free(bl->bl_root, blkno, count);
		else
			blst_meta_free(bl->bl_root, blkno, count,
			    bl->bl_radix, bl->bl_skip, 0);
	}
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */

daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			filled = blst_leaf_fill(bl->bl_root, blkno, count);
		else
			filled = blst_meta_fill(bl->bl_root, blkno, count,
			    bl->bl_radix, bl->bl_skip, 0);
		return (filled);
	}
	return (0);
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */

void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
    blist_t newbl = blist_create(count, flags);
    blist_t save = *pbl;

    *pbl = newbl;
    if (count > save->bl_blocks)
	    count = save->bl_blocks;
    blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);

    /*
     * If resizing upwards, should we free the new space or not?
     */
    if (freenew && count < newbl->bl_blocks) {
	    blist_free(newbl, count, newbl->bl_blocks - count);
    }
    blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */

void
blist_print(blist_t bl)
{
	printf("BLIST {\n");
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
	printf("}\n");
}

#endif

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the 1 block
 *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
 *	somewhat slower.  The 1 block allocation case is log2 and extremely
 *	quick.
 */
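
/*
 *	Illustrative trace of the count == 1 case below, which is a binary
 *	search for the lowest free (set) bit.  Assume a 32-bit bitmap whose
 *	lowest free bit is bit 4: the 16- and 8-bit masks still cover bit 4,
 *	so (orig & mask) is non-zero and r stays 0.  The 4-bit mask covers
 *	only allocated bits, so r becomes 4 and orig is shifted right by 4,
 *	moving the free bit to position 0.  The 2- and 1-bit masks then cover
 *	that free bit, r stays 4, and blk + 4 is returned.
 */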

static daddr_t
blst_leaf_alloc(
	blmeta_t *scan,
	daddr_t blk,
	int count
) {
	u_daddr_t orig = scan->u.bmu_bitmap;

	if (orig == 0) {
		/*
		 * Optimize the bitmap all-allocated case.  Also, the
		 * count == 1 case assumes at least 1 bit is free in the
		 * bitmap, so we have to take care of this case here.
		 */
		scan->bm_bighint = 0;
		return(SWAPBLK_NONE);
	}
	if (count == 1) {
		/*
		 * Optimized code to allocate one bit out of the bitmap
		 */
		u_daddr_t mask;
		int j = BLIST_BMAP_RADIX/2;
		int r = 0;

		mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);

		while (j) {
			if ((orig & mask) == 0) {
			    r += j;
			    orig >>= j;
			}
			j >>= 1;
			mask >>= j;
		}
		scan->u.bmu_bitmap &= ~((u_daddr_t)1 << r);
		return(blk + r);
	}
	if (count <= BLIST_BMAP_RADIX) {
		/*
		 * non-optimized code to allocate N bits out of the bitmap.
		 * The more bits, the faster the code runs.  It will run
		 * the slowest allocating 2 bits, but since there aren't any
		 * memory ops in the core loop (or shouldn't be, anyway),
		 * you probably won't notice the difference.
		 */
		int j;
		int n = BLIST_BMAP_RADIX - count;
		u_daddr_t mask;

		mask = (u_daddr_t)-1 >> n;

		for (j = 0; j <= n; ++j) {
			if ((orig & mask) == mask) {
				scan->u.bmu_bitmap &= ~mask;
				return(blk + j);
			}
			mask = (mask << 1);
		}
	}
	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	scan->bm_bighint = count - 1;
	return(SWAPBLK_NONE);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
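
/*
 *	The cursor handling below is easiest to follow with numbers
 *	(illustrative only, assuming the child radix works out to 512): with
 *	blk == 0 and cursor == 1200, child is 1200 / 512 == 2, so the scan
 *	skips the first two subtrees and starts with the one covering blocks
 *	1024-1535, passing the cursor down so that child in turn starts at
 *	the right offset.
 */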

static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
    daddr_t skip, daddr_t cursor)
{
	daddr_t i, next_skip, r;
	int child;
	bool scan_from_start;

	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit in the i'th subtree.
			 */
			if (next_skip == 1) {
				r = blst_leaf_alloc(&scan[i], blk, count);
			} else {
				r = blst_meta_alloc(&scan[i], blk, count,
				    radix, next_skip - 1, cursor > blk ?
				    cursor : blk);
			}
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */

static void
blst_leaf_free(
	blmeta_t *scan,
	daddr_t blk,
	int count
) {
	/*
	 * free some data in this bitmap
	 *
	 * e.g.
	 *	0000111111111110000
	 *          \_________/\__/
	 *		v        n
	 */
	int n = blk & (BLIST_BMAP_RADIX - 1);
	u_daddr_t mask;

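	/*
	 * Build a mask with 'count' consecutive one bits starting at bit
	 * 'n'.  Illustrative example, assuming a 32-bit bitmap: n == 4 and
	 * count == 3 produce 0x70, i.e. bits 4-6.
	 */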
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	if (scan->u.bmu_bitmap & mask)
		panic("blst_radix_free: freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
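
/*
 *	For example (illustrative only, with a child radix of 32): freeing
 *	60 blocks starting at block 40 under such a meta node begins at the
 *	child covering blocks 32-63 and frees 24 blocks there, then frees all
 *	32 blocks of the child covering 64-95, and finally the remaining 4
 *	blocks in the child covering 96-127.
 */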

static void
blst_meta_free(
	blmeta_t *scan,
	daddr_t freeBlk,
	daddr_t count,
	daddr_t radix,
	int skip,
	daddr_t blk
) {
	int i;
	int next_skip = ((u_int)skip / BLIST_META_RADIX);

#if 0
	printf("free (%llx,%lld) FROM (%llx,%lld)\n",
	    (long long)freeBlk, (long long)count,
	    (long long)blk, (long long)radix
	);
#endif

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix)  {
			for (i = 1; i <= skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	radix /= BLIST_META_RADIX;

	i = (freeBlk - blk) / radix;
	blk += i * radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < freeBlk + count) {
		daddr_t v;

		v = blk + radix - freeBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (daddr_t)-1)
			panic("blst_meta_free: freeing unexpected range");

		if (next_skip == 1) {
			blst_leaf_free(&scan[i], freeBlk, v);
		} else {
			blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
		}
		if (scan->bm_bighint < scan[i].bm_bighint)
		    scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space must not already be free in the destination.
 */

static void blst_copy(
	blmeta_t *scan,
	daddr_t blk,
	daddr_t radix,
	daddr_t skip,
	blist_t dest,
	daddr_t count
) {
	int next_skip;
	int i;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}


	radix /= BLIST_META_RADIX;
	next_skip = ((u_int)skip / BLIST_META_RADIX);

	for (i = 1; count && i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(
			    &scan[i],
			    blk,
			    radix,
			    next_skip - 1,
			    dest,
			    radix
			);
			count -= radix;
		} else {
			if (count) {
				blst_copy(
				    &scan[i],
				    blk,
				    radix,
				    next_skip - 1,
				    dest,
				    count
				);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */

static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	int n = blk & (BLIST_BMAP_RADIX - 1);
	daddr_t nblks;
	u_daddr_t mask;

	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(
	blmeta_t *scan,
	daddr_t allocBlk,
	daddr_t count,
	daddr_t radix,
	int skip,
	daddr_t blk
) {
	int i;
	int next_skip = ((u_int)skip / BLIST_META_RADIX);
	daddr_t nblks = 0;

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this meta node.
		 */
		panic("allocation too large");
	}
	if (count == radix || scan->u.bmu_avail == 0)  {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return nblks;
	}

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (next_skip == 1) {
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
				scan[i].bm_bighint = BLIST_BMAP_RADIX;
			} else {
				scan[i].bm_bighint = radix;
				scan[i].u.bmu_avail = radix;
			}
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	i = (allocBlk - blk) / radix;
	blk += i * radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < allocBlk + count) {
		daddr_t v;

		v = blk + radix - allocBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (daddr_t)-1)
			panic("blst_meta_fill: filling unexpected range");

		if (next_skip == 1) {
			nblks += blst_leaf_fill(&scan[i], allocBlk, v);
		} else {
			nblks += blst_meta_fill(&scan[i], allocBlk, v,
			    radix, next_skip - 1, blk);
		}
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return nblks;
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 */
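
/*
 *	For example (illustrative only, leaf radix 32, meta radix 16): a
 *	blist created for 1000 blocks gets a radix-8192 tree, which fully
 *	populated has 273 nodes (the root plus its skip of 272).  The blist
 *	ends up allocating only 36: the root, two radix-512 subtrees of 17
 *	nodes each, and one terminator slot.
 */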

static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
{
	int i;
	int next_skip;
	daddr_t memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return(memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	radix /= BLIST_META_RADIX;
	next_skip = ((u_int)skip / BLIST_META_RADIX);

	for (i = 1; i <= skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    radix
			);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    count
			);
			count = 0;
		} else {
			/*
			 * Add terminator and break out
			 */
			if (scan)
				scan[i].bm_bighint = (daddr_t)-1;
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return(memindex);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
{
	int i;
	int next_skip;
	int lastState = 0;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	next_skip = ((u_int)skip / BLIST_META_RADIX);
	tab += 4;

	for (i = 1; i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			lastState = 0;
			break;
		}
		blst_radix_print(
		    &scan[i],
		    blk,
		    radix,
		    next_skip - 1,
		    tab
		);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			break;
		case 'p':
			blist_print(bl);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return(0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif