xref: /freebsd/sys/kern/subr_blist.c (revision 7660b554bc59a07be0431c17e0e33815818baa69)
1 /*-
2  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
3  * Redistribution and use in source and binary forms, with or without
4  * modification, are permitted provided that the following conditions
5  * are met:
6  * 1. Redistributions of source code must retain the above copyright
7  *    notice, this list of conditions and the following disclaimer.
8  * 2. Redistributions in binary form must reproduce the above copyright
9  *    notice, this list of conditions and the following disclaimer in the
10  *    documentation and/or other materials provided with the distribution.
11  * 4. Neither the name of the University nor the names of its contributors
12  *    may be used to endorse or promote products derived from this software
13  *    without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
19  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
21  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 /*
28  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
29  *
30  *	This module implements a general bitmap allocator/deallocator.  The
31  *	allocator eats around 2 bits per 'block'.  The module does not
32  *	try to interpret the meaning of a 'block' other then to return
33  *	SWAPBLK_NONE on an allocation failure.
34  *
35  *	A radix tree is used to maintain the bitmap.  Two radix constants are
36  *	involved:  One for the bitmaps contained in the leaf nodes (typically
37  *	32), and one for the meta nodes (typically 16).  Both meta and leaf
38  *	nodes have a hint field.  This field gives us a hint as to the largest
39  *	free contiguous range of blocks under the node.  It may contain a
40  *	value that is too high, but will never contain a value that is too
41  *	low.  When the radix tree is searched, allocation failures in subtrees
42  *	update the hint.
43  *
44  *	The radix tree also implements two collapsed states for meta nodes:
45  *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
46  *	in either of these two states, all information contained underneath
47  *	the node is considered stale.  These states are used to optimize
48  *	allocation and freeing operations.
49  *
50  * 	The hinting greatly increases code efficiency for allocations while
51  *	the general radix structure optimizes both allocations and frees.  The
52  *	radix tree should be able to operate well no matter how much
53  *	fragmentation there is and no matter how large a bitmap is used.
54  *
55  *	Unlike the rlist code, the blist code wires all necessary memory at
56  *	creation time.  Neither allocations nor frees require interaction with
57  *	the memory subsystem.  In contrast, the rlist code may allocate memory
58  *	on an rlist_free() call.  The non-blocking features of the blist code
59  *	are used to great advantage in the swap code (vm/nswap_pager.c).  The
60  *	rlist code uses a little less overall memory then the blist code (but
61  *	due to swap interleaving not all that much less), but the blist code
62  *	scales much, much better.
63  *
64  *	LAYOUT: The radix tree is layed out recursively using a
65  *	linear array.  Each meta node is immediately followed (layed out
66  *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
67  *	is a recursive structure but one that can be easily scanned through
68  *	a very simple 'skip' calculation.  In order to support large radixes,
69  *	portions of the tree may reside outside our memory allocation.  We
70  *	handle this with an early-termination optimization (when bighint is
71  *	set to -1) on the scan.  The memory allocation is only large enough
72  *	to cover the number of blocks requested at creation time even if it
73  *	must be encompassed in larger root-node radix.
74  *
75  *	NOTE: the allocator cannot currently allocate more then
76  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
77  *	large' if you try.  This is an area that could use improvement.  The
78  *	radix is large enough that this restriction does not effect the swap
79  *	system, though.  Currently only the allocation code is effected by
80  *	this algorithmic unfeature.  The freeing code can handle arbitrary
81  *	ranges.
82  *
83  *	This code can be compiled stand-alone for debugging.
84  */
85 
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88 
89 #ifdef _KERNEL
90 
91 #include <sys/param.h>
92 #include <sys/systm.h>
93 #include <sys/lock.h>
94 #include <sys/kernel.h>
95 #include <sys/blist.h>
96 #include <sys/malloc.h>
97 #include <sys/proc.h>
98 #include <sys/mutex.h>
99 #include <vm/vm.h>
100 #include <vm/vm_object.h>
101 #include <vm/vm_kern.h>
102 #include <vm/vm_extern.h>
103 #include <vm/vm_page.h>
104 
105 #else
106 
107 #ifndef BLIST_NO_DEBUG
108 #define BLIST_DEBUG
109 #endif
110 
111 #define SWAPBLK_NONE ((daddr_t)-1)
112 
113 #include <sys/types.h>
114 #include <stdio.h>
115 #include <string.h>
116 #include <stdlib.h>
117 #include <stdarg.h>
118 
119 #define malloc(a,b,c)	calloc(a, 1)
120 #define free(a,b)	free(a)
121 
122 typedef unsigned int u_daddr_t;
123 
124 #include <sys/blist.h>
125 
126 void panic(const char *ctl, ...);
127 
128 #endif
129 
130 /*
131  * static support functions
132  */
133 
134 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
135 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk,
136 				daddr_t count, daddr_t radix, int skip);
137 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
138 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
139 					daddr_t radix, int skip, daddr_t blk);
140 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
141 				daddr_t skip, blist_t dest, daddr_t count);
142 static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
143 static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
144 				daddr_t radix, int skip, daddr_t blk);
145 static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix,
146 						int skip, daddr_t count);
147 #ifndef _KERNEL
148 static void	blst_radix_print(blmeta_t *scan, daddr_t blk,
149 					daddr_t radix, int skip, int tab);
150 #endif
151 
152 #ifdef _KERNEL
153 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
154 #endif
155 
156 /*
157  * blist_create() - create a blist capable of handling up to the specified
158  *		    number of blocks
159  *
160  *	blocks must be greater then 0
161  *
162  *	The smallest blist consists of a single leaf node capable of
163  *	managing BLIST_BMAP_RADIX blocks.
164  */
165 
166 blist_t
167 blist_create(daddr_t blocks)
168 {
169 	blist_t bl;
170 	int radix;
171 	int skip = 0;
172 
173 	/*
174 	 * Calculate radix and skip field used for scanning.
175 	 */
176 	radix = BLIST_BMAP_RADIX;
177 
178 	while (radix < blocks) {
179 		radix *= BLIST_META_RADIX;
180 		skip = (skip + 1) * BLIST_META_RADIX;
181 	}
182 
183 	bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK | M_ZERO);
184 
185 	bl->bl_blocks = blocks;
186 	bl->bl_radix = radix;
187 	bl->bl_skip = skip;
188 	bl->bl_rootblks = 1 +
189 	    blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
190 	bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK);
191 
192 #if defined(BLIST_DEBUG)
193 	printf(
194 		"BLIST representing %lld blocks (%lld MB of swap)"
195 		", requiring %lldK of ram\n",
196 		(long long)bl->bl_blocks,
197 		(long long)bl->bl_blocks * 4 / 1024,
198 		(long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
199 	);
200 	printf("BLIST raw radix tree contains %lld records\n",
201 	    (long long)bl->bl_rootblks);
202 #endif
203 	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
204 
205 	return(bl);
206 }
207 
208 void
209 blist_destroy(blist_t bl)
210 {
211 	free(bl->bl_root, M_SWAP);
212 	free(bl, M_SWAP);
213 }
214 
215 /*
216  * blist_alloc() - reserve space in the block bitmap.  Return the base
217  *		     of a contiguous region or SWAPBLK_NONE if space could
218  *		     not be allocated.
219  */
220 
221 daddr_t
222 blist_alloc(blist_t bl, daddr_t count)
223 {
224 	daddr_t blk = SWAPBLK_NONE;
225 
226 	if (bl) {
227 		if (bl->bl_radix == BLIST_BMAP_RADIX)
228 			blk = blst_leaf_alloc(bl->bl_root, 0, count);
229 		else
230 			blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
231 		if (blk != SWAPBLK_NONE)
232 			bl->bl_free -= count;
233 	}
234 	return(blk);
235 }
236 
237 /*
238  * blist_free() -	free up space in the block bitmap.  Return the base
239  *		     	of a contiguous region.  Panic if an inconsistancy is
240  *			found.
241  */
242 
243 void
244 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
245 {
246 	if (bl) {
247 		if (bl->bl_radix == BLIST_BMAP_RADIX)
248 			blst_leaf_free(bl->bl_root, blkno, count);
249 		else
250 			blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
251 		bl->bl_free += count;
252 	}
253 }
254 
255 /*
256  * blist_fill() -	mark a region in the block bitmap as off-limits
257  *			to the allocator (i.e. allocate it), ignoring any
258  *			existing allocations.  Return the number of blocks
259  *			actually filled that were free before the call.
260  */
261 
262 int
263 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
264 {
265 	int filled;
266 
267 	if (bl) {
268 		if (bl->bl_radix == BLIST_BMAP_RADIX)
269 			filled = blst_leaf_fill(bl->bl_root, blkno, count);
270 		else
271 			filled = blst_meta_fill(bl->bl_root, blkno, count,
272 			    bl->bl_radix, bl->bl_skip, 0);
273 		bl->bl_free -= filled;
274 		return filled;
275 	} else
276 		return 0;
277 }
278 
279 /*
280  * blist_resize() -	resize an existing radix tree to handle the
281  *			specified number of blocks.  This will reallocate
282  *			the tree and transfer the previous bitmap to the new
283  *			one.  When extending the tree you can specify whether
284  *			the new blocks are to left allocated or freed.
285  */
286 
287 void
288 blist_resize(blist_t *pbl, daddr_t count, int freenew)
289 {
290     blist_t newbl = blist_create(count);
291     blist_t save = *pbl;
292 
293     *pbl = newbl;
294     if (count > save->bl_blocks)
295 	    count = save->bl_blocks;
296     blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
297 
298     /*
299      * If resizing upwards, should we free the new space or not?
300      */
301     if (freenew && count < newbl->bl_blocks) {
302 	    blist_free(newbl, count, newbl->bl_blocks - count);
303     }
304     blist_destroy(save);
305 }
306 
307 #ifdef BLIST_DEBUG
308 
309 /*
310  * blist_print()    - dump radix tree
311  */
312 
313 void
314 blist_print(blist_t bl)
315 {
316 	printf("BLIST {\n");
317 	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
318 	printf("}\n");
319 }
320 
321 #endif
322 
323 /************************************************************************
324  *			  ALLOCATION SUPPORT FUNCTIONS			*
325  ************************************************************************
326  *
327  *	These support functions do all the actual work.  They may seem
328  *	rather longish, but that's because I've commented them up.  The
329  *	actual code is straight forward.
330  *
331  */
332 
333 /*
334  * blist_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
335  *
336  *	This is the core of the allocator and is optimized for the 1 block
337  *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
338  *	somewhat slower.  The 1 block allocation case is log2 and extremely
339  *	quick.
340  */
341 
342 static daddr_t
343 blst_leaf_alloc(
344 	blmeta_t *scan,
345 	daddr_t blk,
346 	int count
347 ) {
348 	u_daddr_t orig = scan->u.bmu_bitmap;
349 
350 	if (orig == 0) {
351 		/*
352 		 * Optimize bitmap all-allocated case.  Also, count = 1
353 		 * case assumes at least 1 bit is free in the bitmap, so
354 		 * we have to take care of this case here.
355 		 */
356 		scan->bm_bighint = 0;
357 		return(SWAPBLK_NONE);
358 	}
359 	if (count == 1) {
360 		/*
361 		 * Optimized code to allocate one bit out of the bitmap
362 		 */
363 		u_daddr_t mask;
364 		int j = BLIST_BMAP_RADIX/2;
365 		int r = 0;
366 
367 		mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
368 
369 		while (j) {
370 			if ((orig & mask) == 0) {
371 			    r += j;
372 			    orig >>= j;
373 			}
374 			j >>= 1;
375 			mask >>= j;
376 		}
377 		scan->u.bmu_bitmap &= ~(1 << r);
378 		return(blk + r);
379 	}
380 	if (count <= BLIST_BMAP_RADIX) {
381 		/*
382 		 * non-optimized code to allocate N bits out of the bitmap.
383 		 * The more bits, the faster the code runs.  It will run
384 		 * the slowest allocating 2 bits, but since there aren't any
385 		 * memory ops in the core loop (or shouldn't be, anyway),
386 		 * you probably won't notice the difference.
387 		 */
388 		int j;
389 		int n = BLIST_BMAP_RADIX - count;
390 		u_daddr_t mask;
391 
392 		mask = (u_daddr_t)-1 >> n;
393 
394 		for (j = 0; j <= n; ++j) {
395 			if ((orig & mask) == mask) {
396 				scan->u.bmu_bitmap &= ~mask;
397 				return(blk + j);
398 			}
399 			mask = (mask << 1);
400 		}
401 	}
402 	/*
403 	 * We couldn't allocate count in this subtree, update bighint.
404 	 */
405 	scan->bm_bighint = count - 1;
406 	return(SWAPBLK_NONE);
407 }
408 
409 /*
410  * blist_meta_alloc() -	allocate at a meta in the radix tree.
411  *
412  *	Attempt to allocate at a meta node.  If we can't, we update
413  *	bighint and return a failure.  Updating bighint optimize future
414  *	calls that hit this node.  We have to check for our collapse cases
415  *	and we have a few optimizations strewn in as well.
416  */
417 
418 static daddr_t
419 blst_meta_alloc(
420 	blmeta_t *scan,
421 	daddr_t blk,
422 	daddr_t count,
423 	daddr_t radix,
424 	int skip
425 ) {
426 	int i;
427 	int next_skip = ((u_int)skip / BLIST_META_RADIX);
428 
429 	if (scan->u.bmu_avail == 0)  {
430 		/*
431 		 * ALL-ALLOCATED special case
432 		 */
433 		scan->bm_bighint = count;
434 		return(SWAPBLK_NONE);
435 	}
436 
437 	if (scan->u.bmu_avail == radix) {
438 		radix /= BLIST_META_RADIX;
439 
440 		/*
441 		 * ALL-FREE special case, initialize uninitialize
442 		 * sublevel.
443 		 */
444 		for (i = 1; i <= skip; i += next_skip) {
445 			if (scan[i].bm_bighint == (daddr_t)-1)
446 				break;
447 			if (next_skip == 1) {
448 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
449 				scan[i].bm_bighint = BLIST_BMAP_RADIX;
450 			} else {
451 				scan[i].bm_bighint = radix;
452 				scan[i].u.bmu_avail = radix;
453 			}
454 		}
455 	} else {
456 		radix /= BLIST_META_RADIX;
457 	}
458 
459 	for (i = 1; i <= skip; i += next_skip) {
460 		if (count <= scan[i].bm_bighint) {
461 			/*
462 			 * count fits in object
463 			 */
464 			daddr_t r;
465 			if (next_skip == 1) {
466 				r = blst_leaf_alloc(&scan[i], blk, count);
467 			} else {
468 				r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
469 			}
470 			if (r != SWAPBLK_NONE) {
471 				scan->u.bmu_avail -= count;
472 				if (scan->bm_bighint > scan->u.bmu_avail)
473 					scan->bm_bighint = scan->u.bmu_avail;
474 				return(r);
475 			}
476 		} else if (scan[i].bm_bighint == (daddr_t)-1) {
477 			/*
478 			 * Terminator
479 			 */
480 			break;
481 		} else if (count > radix) {
482 			/*
483 			 * count does not fit in object even if it were
484 			 * complete free.
485 			 */
486 			panic("blist_meta_alloc: allocation too large");
487 		}
488 		blk += radix;
489 	}
490 
491 	/*
492 	 * We couldn't allocate count in this subtree, update bighint.
493 	 */
494 	if (scan->bm_bighint >= count)
495 		scan->bm_bighint = count - 1;
496 	return(SWAPBLK_NONE);
497 }
498 
499 /*
500  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
501  *
502  */
503 
504 static void
505 blst_leaf_free(
506 	blmeta_t *scan,
507 	daddr_t blk,
508 	int count
509 ) {
510 	/*
511 	 * free some data in this bitmap
512 	 *
513 	 * e.g.
514 	 *	0000111111111110000
515 	 *          \_________/\__/
516 	 *		v        n
517 	 */
518 	int n = blk & (BLIST_BMAP_RADIX - 1);
519 	u_daddr_t mask;
520 
521 	mask = ((u_daddr_t)-1 << n) &
522 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
523 
524 	if (scan->u.bmu_bitmap & mask)
525 		panic("blst_radix_free: freeing free block");
526 	scan->u.bmu_bitmap |= mask;
527 
528 	/*
529 	 * We could probably do a better job here.  We are required to make
530 	 * bighint at least as large as the biggest contiguous block of
531 	 * data.  If we just shoehorn it, a little extra overhead will
532 	 * be incured on the next allocation (but only that one typically).
533 	 */
534 	scan->bm_bighint = BLIST_BMAP_RADIX;
535 }
536 
537 /*
538  * BLST_META_FREE() - free allocated blocks from radix tree meta info
539  *
540  *	This support routine frees a range of blocks from the bitmap.
541  *	The range must be entirely enclosed by this radix node.  If a
542  *	meta node, we break the range down recursively to free blocks
543  *	in subnodes (which means that this code can free an arbitrary
544  *	range whereas the allocation code cannot allocate an arbitrary
545  *	range).
546  */
547 
548 static void
549 blst_meta_free(
550 	blmeta_t *scan,
551 	daddr_t freeBlk,
552 	daddr_t count,
553 	daddr_t radix,
554 	int skip,
555 	daddr_t blk
556 ) {
557 	int i;
558 	int next_skip = ((u_int)skip / BLIST_META_RADIX);
559 
560 #if 0
561 	printf("FREE (%llx,%lld) FROM (%llx,%lld)\n",
562 	    (long long)freeBlk, (long long)count,
563 	    (long long)blk, (long long)radix
564 	);
565 #endif
566 
567 	if (scan->u.bmu_avail == 0) {
568 		/*
569 		 * ALL-ALLOCATED special case, with possible
570 		 * shortcut to ALL-FREE special case.
571 		 */
572 		scan->u.bmu_avail = count;
573 		scan->bm_bighint = count;
574 
575 		if (count != radix)  {
576 			for (i = 1; i <= skip; i += next_skip) {
577 				if (scan[i].bm_bighint == (daddr_t)-1)
578 					break;
579 				scan[i].bm_bighint = 0;
580 				if (next_skip == 1) {
581 					scan[i].u.bmu_bitmap = 0;
582 				} else {
583 					scan[i].u.bmu_avail = 0;
584 				}
585 			}
586 			/* fall through */
587 		}
588 	} else {
589 		scan->u.bmu_avail += count;
590 		/* scan->bm_bighint = radix; */
591 	}
592 
593 	/*
594 	 * ALL-FREE special case.
595 	 */
596 
597 	if (scan->u.bmu_avail == radix)
598 		return;
599 	if (scan->u.bmu_avail > radix)
600 		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
601 		    (long long)count, (long long)scan->u.bmu_avail,
602 		    (long long)radix);
603 
604 	/*
605 	 * Break the free down into its components
606 	 */
607 
608 	radix /= BLIST_META_RADIX;
609 
610 	i = (freeBlk - blk) / radix;
611 	blk += i * radix;
612 	i = i * next_skip + 1;
613 
614 	while (i <= skip && blk < freeBlk + count) {
615 		daddr_t v;
616 
617 		v = blk + radix - freeBlk;
618 		if (v > count)
619 			v = count;
620 
621 		if (scan->bm_bighint == (daddr_t)-1)
622 			panic("blst_meta_free: freeing unexpected range");
623 
624 		if (next_skip == 1) {
625 			blst_leaf_free(&scan[i], freeBlk, v);
626 		} else {
627 			blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
628 		}
629 		if (scan->bm_bighint < scan[i].bm_bighint)
630 		    scan->bm_bighint = scan[i].bm_bighint;
631 		count -= v;
632 		freeBlk += v;
633 		blk += radix;
634 		i += next_skip;
635 	}
636 }
637 
638 /*
639  * BLIST_RADIX_COPY() - copy one radix tree to another
640  *
641  *	Locates free space in the source tree and frees it in the destination
642  *	tree.  The space may not already be free in the destination.
643  */
644 
645 static void blst_copy(
646 	blmeta_t *scan,
647 	daddr_t blk,
648 	daddr_t radix,
649 	daddr_t skip,
650 	blist_t dest,
651 	daddr_t count
652 ) {
653 	int next_skip;
654 	int i;
655 
656 	/*
657 	 * Leaf node
658 	 */
659 
660 	if (radix == BLIST_BMAP_RADIX) {
661 		u_daddr_t v = scan->u.bmu_bitmap;
662 
663 		if (v == (u_daddr_t)-1) {
664 			blist_free(dest, blk, count);
665 		} else if (v != 0) {
666 			int i;
667 
668 			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
669 				if (v & (1 << i))
670 					blist_free(dest, blk + i, 1);
671 			}
672 		}
673 		return;
674 	}
675 
676 	/*
677 	 * Meta node
678 	 */
679 
680 	if (scan->u.bmu_avail == 0) {
681 		/*
682 		 * Source all allocated, leave dest allocated
683 		 */
684 		return;
685 	}
686 	if (scan->u.bmu_avail == radix) {
687 		/*
688 		 * Source all free, free entire dest
689 		 */
690 		if (count < radix)
691 			blist_free(dest, blk, count);
692 		else
693 			blist_free(dest, blk, radix);
694 		return;
695 	}
696 
697 
698 	radix /= BLIST_META_RADIX;
699 	next_skip = ((u_int)skip / BLIST_META_RADIX);
700 
701 	for (i = 1; count && i <= skip; i += next_skip) {
702 		if (scan[i].bm_bighint == (daddr_t)-1)
703 			break;
704 
705 		if (count >= radix) {
706 			blst_copy(
707 			    &scan[i],
708 			    blk,
709 			    radix,
710 			    next_skip - 1,
711 			    dest,
712 			    radix
713 			);
714 			count -= radix;
715 		} else {
716 			if (count) {
717 				blst_copy(
718 				    &scan[i],
719 				    blk,
720 				    radix,
721 				    next_skip - 1,
722 				    dest,
723 				    count
724 				);
725 			}
726 			count = 0;
727 		}
728 		blk += radix;
729 	}
730 }
731 
732 /*
733  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
734  *
735  *	This routine allocates all blocks in the specified range
736  *	regardless of any existing allocations in that range.  Returns
737  *	the number of blocks allocated by the call.
738  */
739 
740 static int
741 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
742 {
743 	int n = blk & (BLIST_BMAP_RADIX - 1);
744 	int nblks;
745 	u_daddr_t mask, bitmap;
746 
747 	mask = ((u_daddr_t)-1 << n) &
748 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
749 
750 	/* Count the number of blocks we're about to allocate */
751 	bitmap = scan->u.bmu_bitmap & mask;
752 	for (nblks = 0; bitmap != 0; nblks++)
753 		bitmap &= bitmap - 1;
754 
755 	scan->u.bmu_bitmap &= ~mask;
756 	return nblks;
757 }
758 
759 /*
760  * BLIST_META_FILL() -	allocate specific blocks at a meta node
761  *
762  *	This routine allocates the specified range of blocks,
763  *	regardless of any existing allocations in the range.  The
764  *	range must be within the extent of this node.  Returns the
765  *	number of blocks allocated by the call.
766  */
767 static int
768 blst_meta_fill(
769 	blmeta_t *scan,
770 	daddr_t allocBlk,
771 	daddr_t count,
772 	daddr_t radix,
773 	int skip,
774 	daddr_t blk
775 ) {
776 	int i;
777 	int next_skip = ((u_int)skip / BLIST_META_RADIX);
778 	int nblks = 0;
779 
780 	if (count == radix || scan->u.bmu_avail == 0)  {
781 		/*
782 		 * ALL-ALLOCATED special case
783 		 */
784 		nblks = scan->u.bmu_avail;
785 		scan->u.bmu_avail = 0;
786 		scan->bm_bighint = count;
787 		return nblks;
788 	}
789 
790 	if (scan->u.bmu_avail == radix) {
791 		radix /= BLIST_META_RADIX;
792 
793 		/*
794 		 * ALL-FREE special case, initialize sublevel
795 		 */
796 		for (i = 1; i <= skip; i += next_skip) {
797 			if (scan[i].bm_bighint == (daddr_t)-1)
798 				break;
799 			if (next_skip == 1) {
800 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
801 				scan[i].bm_bighint = BLIST_BMAP_RADIX;
802 			} else {
803 				scan[i].bm_bighint = radix;
804 				scan[i].u.bmu_avail = radix;
805 			}
806 		}
807 	} else {
808 		radix /= BLIST_META_RADIX;
809 	}
810 
811 	if (count > radix)
812 		panic("blist_meta_fill: allocation too large");
813 
814 	i = (allocBlk - blk) / radix;
815 	blk += i * radix;
816 	i = i * next_skip + 1;
817 
818 	while (i <= skip && blk < allocBlk + count) {
819 		daddr_t v;
820 
821 		v = blk + radix - allocBlk;
822 		if (v > count)
823 			v = count;
824 
825 		if (scan->bm_bighint == (daddr_t)-1)
826 			panic("blst_meta_fill: filling unexpected range");
827 
828 		if (next_skip == 1) {
829 			nblks += blst_leaf_fill(&scan[i], allocBlk, v);
830 		} else {
831 			nblks += blst_meta_fill(&scan[i], allocBlk, v,
832 			    radix, next_skip - 1, blk);
833 		}
834 		count -= v;
835 		allocBlk += v;
836 		blk += radix;
837 		i += next_skip;
838 	}
839 	scan->u.bmu_avail -= nblks;
840 	return nblks;
841 }
842 
843 /*
844  * BLST_RADIX_INIT() - initialize radix tree
845  *
846  *	Initialize our meta structures and bitmaps and calculate the exact
847  *	amount of space required to manage 'count' blocks - this space may
848  *	be considerably less then the calculated radix due to the large
849  *	RADIX values we use.
850  */
851 
852 static daddr_t
853 blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
854 {
855 	int i;
856 	int next_skip;
857 	daddr_t memindex = 0;
858 
859 	/*
860 	 * Leaf node
861 	 */
862 
863 	if (radix == BLIST_BMAP_RADIX) {
864 		if (scan) {
865 			scan->bm_bighint = 0;
866 			scan->u.bmu_bitmap = 0;
867 		}
868 		return(memindex);
869 	}
870 
871 	/*
872 	 * Meta node.  If allocating the entire object we can special
873 	 * case it.  However, we need to figure out how much memory
874 	 * is required to manage 'count' blocks, so we continue on anyway.
875 	 */
876 
877 	if (scan) {
878 		scan->bm_bighint = 0;
879 		scan->u.bmu_avail = 0;
880 	}
881 
882 	radix /= BLIST_META_RADIX;
883 	next_skip = ((u_int)skip / BLIST_META_RADIX);
884 
885 	for (i = 1; i <= skip; i += next_skip) {
886 		if (count >= radix) {
887 			/*
888 			 * Allocate the entire object
889 			 */
890 			memindex = i + blst_radix_init(
891 			    ((scan) ? &scan[i] : NULL),
892 			    radix,
893 			    next_skip - 1,
894 			    radix
895 			);
896 			count -= radix;
897 		} else if (count > 0) {
898 			/*
899 			 * Allocate a partial object
900 			 */
901 			memindex = i + blst_radix_init(
902 			    ((scan) ? &scan[i] : NULL),
903 			    radix,
904 			    next_skip - 1,
905 			    count
906 			);
907 			count = 0;
908 		} else {
909 			/*
910 			 * Add terminator and break out
911 			 */
912 			if (scan)
913 				scan[i].bm_bighint = (daddr_t)-1;
914 			break;
915 		}
916 	}
917 	if (memindex < i)
918 		memindex = i;
919 	return(memindex);
920 }
921 
922 #ifdef BLIST_DEBUG
923 
924 static void
925 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
926 {
927 	int i;
928 	int next_skip;
929 	int lastState = 0;
930 
931 	if (radix == BLIST_BMAP_RADIX) {
932 		printf(
933 		    "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n",
934 		    tab, tab, "",
935 		    (long long)blk, (long long)radix,
936 		    (long long)scan->u.bmu_bitmap,
937 		    (long long)scan->bm_bighint
938 		);
939 		return;
940 	}
941 
942 	if (scan->u.bmu_avail == 0) {
943 		printf(
944 		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
945 		    tab, tab, "",
946 		    (long long)blk,
947 		    (long long)radix
948 		);
949 		return;
950 	}
951 	if (scan->u.bmu_avail == radix) {
952 		printf(
953 		    "%*.*s(%08llx,%lld) ALL FREE\n",
954 		    tab, tab, "",
955 		    (long long)blk,
956 		    (long long)radix
957 		);
958 		return;
959 	}
960 
961 	printf(
962 	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
963 	    tab, tab, "",
964 	    (long long)blk, (long long)radix,
965 	    (long long)scan->u.bmu_avail,
966 	    (long long)radix,
967 	    (long long)scan->bm_bighint
968 	);
969 
970 	radix /= BLIST_META_RADIX;
971 	next_skip = ((u_int)skip / BLIST_META_RADIX);
972 	tab += 4;
973 
974 	for (i = 1; i <= skip; i += next_skip) {
975 		if (scan[i].bm_bighint == (daddr_t)-1) {
976 			printf(
977 			    "%*.*s(%08llx,%lld): Terminator\n",
978 			    tab, tab, "",
979 			    (long long)blk, (long long)radix
980 			);
981 			lastState = 0;
982 			break;
983 		}
984 		blst_radix_print(
985 		    &scan[i],
986 		    blk,
987 		    radix,
988 		    next_skip - 1,
989 		    tab
990 		);
991 		blk += radix;
992 	}
993 	tab -= 4;
994 
995 	printf(
996 	    "%*.*s}\n",
997 	    tab, tab, ""
998 	);
999 }
1000 
1001 #endif
1002 
1003 #ifdef BLIST_DEBUG
1004 
1005 int
1006 main(int ac, char **av)
1007 {
1008 	int size = 1024;
1009 	int i;
1010 	blist_t bl;
1011 
1012 	for (i = 1; i < ac; ++i) {
1013 		const char *ptr = av[i];
1014 		if (*ptr != '-') {
1015 			size = strtol(ptr, NULL, 0);
1016 			continue;
1017 		}
1018 		ptr += 2;
1019 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1020 		exit(1);
1021 	}
1022 	bl = blist_create(size);
1023 	blist_free(bl, 0, size);
1024 
1025 	for (;;) {
1026 		char buf[1024];
1027 		daddr_t da = 0;
1028 		daddr_t count = 0;
1029 
1030 
1031 		printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
1032 		    (long long)size, (long long)bl->bl_radix);
1033 		fflush(stdout);
1034 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1035 			break;
1036 		switch(buf[0]) {
1037 		case 'r':
1038 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1039 				blist_resize(&bl, count, 1);
1040 			} else {
1041 				printf("?\n");
1042 			}
1043 		case 'p':
1044 			blist_print(bl);
1045 			break;
1046 		case 'a':
1047 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1048 				daddr_t blk = blist_alloc(bl, count);
1049 				printf("    R=%08llx\n", (long long)blk);
1050 			} else {
1051 				printf("?\n");
1052 			}
1053 			break;
1054 		case 'f':
1055 			if (sscanf(buf + 1, "%llx %lld",
1056 			    (long long *)&da, (long long *)&count) == 2) {
1057 				blist_free(bl, da, count);
1058 			} else {
1059 				printf("?\n");
1060 			}
1061 			break;
1062 		case 'l':
1063 			if (sscanf(buf + 1, "%llx %lld",
1064 			    (long long *)&da, (long long *)&count) == 2) {
1065 				printf("    n=%d\n",
1066 				    blist_fill(bl, da, count));
1067 			} else {
1068 				printf("?\n");
1069 			}
1070 			break;
1071 		case '?':
1072 		case 'h':
1073 			puts(
1074 			    "p          -print\n"
1075 			    "a %d       -allocate\n"
1076 			    "f %x %d    -free\n"
1077 			    "l %x %d    -fill\n"
1078 			    "r %d       -resize\n"
1079 			    "h/?        -help"
1080 			);
1081 			break;
1082 		default:
1083 			printf("?\n");
1084 			break;
1085 		}
1086 	}
1087 	return(0);
1088 }
1089 
1090 void
1091 panic(const char *ctl, ...)
1092 {
1093 	va_list va;
1094 
1095 	va_start(va, ctl);
1096 	vfprintf(stderr, ctl, va);
1097 	fprintf(stderr, "\n");
1098 	va_end(va);
1099 	exit(1);
1100 }
1101 
1102 #endif
1103 
1104