xref: /freebsd/sys/kern/subr_blist.c (revision 23833df4831a6f41aa39e952fba524edfb8cec6d)
1 /*-
2  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
3  * Redistribution and use in source and binary forms, with or without
4  * modification, are permitted provided that the following conditions
5  * are met:
6  * 1. Redistributions of source code must retain the above copyright
7  *    notice, this list of conditions and the following disclaimer.
8  * 2. Redistributions in binary form must reproduce the above copyright
9  *    notice, this list of conditions and the following disclaimer in the
10  *    documentation and/or other materials provided with the distribution.
11  * 3. Neither the name of the University nor the names of its contributors
12  *    may be used to endorse or promote products derived from this software
13  *    without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
19  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
21  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 /*
28  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
29  *
30  *	This module implements a general bitmap allocator/deallocator.  The
31  *	allocator eats around 2 bits per 'block'.  The module does not
32  *	try to interpret the meaning of a 'block' other than to return
33  *	SWAPBLK_NONE on an allocation failure.
34  *
35  *	A radix tree is used to maintain the bitmap.  Two radix constants are
36  *	involved:  One for the bitmaps contained in the leaf nodes (typically
37  *	64), and one for the meta nodes (typically 16).  Both meta and leaf
38  *	nodes have a hint field.  This field gives us a hint as to the largest
39  *	free contiguous range of blocks under the node.  It may contain a
40  *	value that is too high, but will never contain a value that is too
41  *	low.  When the radix tree is searched, allocation failures in subtrees
42  *	update the hint.
43  *
44  *	The radix tree also implements two collapsed states for meta nodes:
45  *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
46  *	in either of these two states, all information contained underneath
47  *	the node is considered stale.  These states are used to optimize
48  *	allocation and freeing operations.
49  *
50  * 	The hinting greatly increases code efficiency for allocations while
51  *	the general radix structure optimizes both allocations and frees.  The
52  *	radix tree should be able to operate well no matter how much
53  *	fragmentation there is and no matter how large a bitmap is used.
54  *
55  *	The blist code wires all necessary memory at creation time.  Neither
56  *	allocations nor frees require interaction with the memory subsystem.
57  *	The non-blocking features of the blist code are used in the swap code
58  *	(vm/swap_pager.c).
59  *
60  *	LAYOUT: The radix tree is laid out recursively using a
61  *	linear array.  Each meta node is immediately followed (laid out
62  *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
63  *	is a recursive structure but one that can be easily scanned through
64  *	a very simple 'skip' calculation.  In order to support large radixes,
65  *	portions of the tree may reside outside our memory allocation.  We
66  *	handle this with an early-termination optimization (when bighint is
67  *	set to -1) on the scan.  The memory allocation is only large enough
68  *	to cover the number of blocks requested at creation time even if it
69  *	must be encompassed in larger root-node radix.
70  *
71  *	NOTE: the allocator cannot currently allocate more than
72  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
73  *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
75  *	system, though.  Currently only the allocation code is affected by
76  *	this algorithmic unfeature.  The freeing code can handle arbitrary
77  *	ranges.
78  *
79  *	This code can be compiled stand-alone for debugging.
80  */
81 
82 #include <sys/cdefs.h>
83 __FBSDID("$FreeBSD$");
84 
85 #ifdef _KERNEL
86 
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/lock.h>
90 #include <sys/kernel.h>
91 #include <sys/blist.h>
92 #include <sys/malloc.h>
93 #include <sys/proc.h>
94 #include <sys/mutex.h>
95 
96 #else
97 
98 #ifndef BLIST_NO_DEBUG
99 #define BLIST_DEBUG
100 #endif
101 
102 #include <sys/types.h>
103 #include <sys/malloc.h>
104 #include <stdio.h>
105 #include <string.h>
106 #include <stdlib.h>
107 #include <stdarg.h>
108 #include <stdbool.h>
109 
110 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
111 #define malloc(a,b,c)	calloc(a, 1)
112 #define free(a,b)	free(a)
113 
114 #include <sys/blist.h>
115 
116 void panic(const char *ctl, ...);
117 
118 #endif
119 
120 /*
121  * static support functions
122  */
123 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
124 		    daddr_t cursor);
125 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
126 		    daddr_t radix, daddr_t skip, daddr_t cursor);
127 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
128 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
129 		    daddr_t radix, daddr_t skip, daddr_t blk);
130 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
131 		    daddr_t skip, blist_t dest, daddr_t count);
132 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
133 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
134 		    daddr_t radix, daddr_t skip, daddr_t blk);
135 static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip,
136 		    daddr_t count);
137 #ifndef _KERNEL
138 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
139 		    daddr_t skip, int tab);
140 #endif
141 
142 #ifdef _KERNEL
143 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
144 #endif
145 
146 /*
147  * blist_create() - create a blist capable of handling up to the specified
148  *		    number of blocks
149  *
150  *	blocks - must be greater than 0
151  * 	flags  - malloc flags
152  *
153  *	The smallest blist consists of a single leaf node capable of
154  *	managing BLIST_BMAP_RADIX blocks.
155  */
156 blist_t
157 blist_create(daddr_t blocks, int flags)
158 {
159 	blist_t bl;
160 	daddr_t nodes, radix, skip;
161 
162 	/*
163 	 * Calculate radix and skip field used for scanning.
164 	 */
165 	radix = BLIST_BMAP_RADIX;
166 	skip = 0;
167 	while (radix < blocks) {
168 		radix *= BLIST_META_RADIX;
169 		skip = (skip + 1) * BLIST_META_RADIX;
170 	}
171 	nodes = 1 + blst_radix_init(NULL, radix, skip, blocks);
172 
173 	bl = malloc(sizeof(struct blist), M_SWAP, flags);
174 	if (bl == NULL)
175 		return (NULL);
176 
177 	bl->bl_blocks = blocks;
178 	bl->bl_radix = radix;
179 	bl->bl_skip = skip;
180 	bl->bl_cursor = 0;
181 	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
182 	if (bl->bl_root == NULL) {
183 		free(bl, M_SWAP);
184 		return (NULL);
185 	}
186 	blst_radix_init(bl->bl_root, radix, skip, blocks);
187 
188 #if defined(BLIST_DEBUG)
189 	printf(
190 		"BLIST representing %lld blocks (%lld MB of swap)"
191 		", requiring %lldK of ram\n",
192 		(long long)bl->bl_blocks,
193 		(long long)bl->bl_blocks * 4 / 1024,
194 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
195 	);
196 	printf("BLIST raw radix tree contains %lld records\n",
197 	    (long long)nodes);
198 #endif
199 
200 	return (bl);
201 }
202 
/*
 * blist_destroy() -	release all memory associated with a blist created
 *			by blist_create().
 */
void
blist_destroy(blist_t bl)
{
	/* Release the radix tree node array, then the descriptor itself. */
	free(bl->bl_root, M_SWAP);
	free(bl, M_SWAP);
}
209 
210 /*
211  * blist_alloc() -   reserve space in the block bitmap.  Return the base
212  *		     of a contiguous region or SWAPBLK_NONE if space could
213  *		     not be allocated.
214  */
215 daddr_t
216 blist_alloc(blist_t bl, daddr_t count)
217 {
218 	daddr_t blk;
219 
220 	/*
221 	 * This loop iterates at most twice.  An allocation failure in the
222 	 * first iteration leads to a second iteration only if the cursor was
223 	 * non-zero.  When the cursor is zero, an allocation failure will
224 	 * reduce the hint, stopping further iterations.
225 	 */
226 	while (count <= bl->bl_root->bm_bighint) {
227 		blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix,
228 		    bl->bl_skip, bl->bl_cursor);
229 		if (blk != SWAPBLK_NONE) {
230 			bl->bl_cursor = blk + count;
231 			return (blk);
232 		} else if (bl->bl_cursor != 0)
233 			bl->bl_cursor = 0;
234 	}
235 	return (SWAPBLK_NONE);
236 }
237 
238 /*
239  * blist_avail() -	return the number of free blocks.
240  */
241 daddr_t
242 blist_avail(blist_t bl)
243 {
244 
245 	if (bl->bl_radix == BLIST_BMAP_RADIX)
246 		return (bitcount64(bl->bl_root->u.bmu_bitmap));
247 	else
248 		return (bl->bl_root->u.bmu_avail);
249 }
250 
/*
 * blist_free() -	free up space in the block bitmap.  The range must
 *			have been previously allocated.  Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	/* The root node spans the whole bitmap; recurse from it. */
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
}
262 
/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	/* The root node spans the whole bitmap; recurse from it. */
	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix,
	    bl->bl_skip, 0));
}
276 
277 /*
278  * blist_resize() -	resize an existing radix tree to handle the
279  *			specified number of blocks.  This will reallocate
280  *			the tree and transfer the previous bitmap to the new
281  *			one.  When extending the tree you can specify whether
282  *			the new blocks are to left allocated or freed.
283  */
284 void
285 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
286 {
287     blist_t newbl = blist_create(count, flags);
288     blist_t save = *pbl;
289 
290     *pbl = newbl;
291     if (count > save->bl_blocks)
292 	    count = save->bl_blocks;
293     blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
294 
295     /*
296      * If resizing upwards, should we free the new space or not?
297      */
298     if (freenew && count < newbl->bl_blocks) {
299 	    blist_free(newbl, count, newbl->bl_blocks - count);
300     }
301     blist_destroy(save);
302 }
303 
304 #ifdef BLIST_DEBUG
305 
/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	/* Recursively dump the whole tree starting at the root node. */
	printf("BLIST {\n");
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
	printf("}\n");
}
316 
317 #endif
318 
319 /************************************************************************
320  *			  ALLOCATION SUPPORT FUNCTIONS			*
321  ************************************************************************
322  *
323  *	These support functions do all the actual work.  They may seem
324  *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
326  *
327  */
328 
/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + log2(BLIST_BMAP_RADIX).
 *
 *	A set bit in the leaf bitmap means the block is free.  Returns the
 *	first block of the allocated run, or SWAPBLK_NONE on failure (in
 *	which case bm_bighint may be lowered).
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
{
	u_daddr_t mask;
	int count1, hi, lo, mid, num_shifts, range1, range_ext;

	if (count == BLIST_BMAP_RADIX) {
		/*
		 * Optimize allocation of BLIST_BMAP_RADIX bits.  If this wasn't
		 * a special case, then forming the final value of 'mask' below
		 * would require special handling to avoid an invalid left shift
		 * when count equals the number of bits in mask.
		 */
		if (~scan->u.bmu_bitmap != 0) {
			/* Not all bits free: record the best possible hint. */
			scan->bm_bighint = BLIST_BMAP_RADIX - 1;
			return (SWAPBLK_NONE);
		}
		/* A full-leaf allocation must begin exactly at the cursor. */
		if (cursor != blk)
			return (SWAPBLK_NONE);
		scan->u.bmu_bitmap = 0;
		scan->bm_bighint = 0;
		return (blk);
	}
	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while (mask != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
		 * while preserving these invariants.  The updates to mask leave
		 * fewer bits set, but each bit that remains set represents a
		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		mask &= mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/*
	 * Discard any candidates that appear before the cursor.
	 */
	lo = cursor - blk;
	mask &= ~(u_daddr_t)0 << lo;

	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then perform a binary search to find its position.
	 */
	mask &= -mask;
	hi = BLIST_BMAP_RADIX - count1;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if ((mask >> mid) != 0)
			lo = mid;
		else
			hi = mid;
	}

	/*
	 * Set in mask exactly the bits being allocated, and clear them from
	 * the set of available bits.
	 */
	mask = (mask << count) - mask;
	scan->u.bmu_bitmap &= ~mask;
	return (blk + lo);
}
418 
/*
 * blst_meta_alloc() -	allocate at a meta in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
    daddr_t skip, daddr_t cursor)
{
	daddr_t i, next_skip, r;
	int child;
	bool scan_from_start;

	/* At leaf level, defer to the bitmap allocator. */
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, blk, count, cursor));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	/* Start the scan at the child containing the cursor. */
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i], blk, count, radix,
			    next_skip - 1, cursor > blk ? cursor : blk);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 * Only lower the hint when the scan covered the entire node; a
	 * cursor-truncated scan may have skipped a fitting range.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}
508 
509 /*
510  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
511  *
512  */
513 static void
514 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
515 {
516 	/*
517 	 * free some data in this bitmap
518 	 *
519 	 * e.g.
520 	 *	0000111111111110000
521 	 *          \_________/\__/
522 	 *		v        n
523 	 */
524 	int n = blk & (BLIST_BMAP_RADIX - 1);
525 	u_daddr_t mask;
526 
527 	mask = ((u_daddr_t)-1 << n) &
528 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
529 
530 	if (scan->u.bmu_bitmap & mask)
531 		panic("blst_radix_free: freeing free block");
532 	scan->u.bmu_bitmap |= mask;
533 
534 	/*
535 	 * We could probably do a better job here.  We are required to make
536 	 * bighint at least as large as the biggest contiguous block of
537 	 * data.  If we just shoehorn it, a little extra overhead will
538 	 * be incured on the next allocation (but only that one typically).
539 	 */
540 	scan->bm_bighint = BLIST_BMAP_RADIX;
541 }
542 
/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
    daddr_t skip, daddr_t blk)
{
	daddr_t i, next_skip, v;
	int child;

	/* A terminator marks space beyond the created bitmap. */
	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix)  {
			/*
			 * The children were stale while this node was in the
			 * ALL-ALLOCATED state; reset them all to empty before
			 * descending into a partial free.
			 */
			for (i = 1; i <= skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	radix /= BLIST_META_RADIX;

	/* Locate the first child overlapping the range being freed. */
	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i <= skip && blk < freeBlk + count) {
		/* v is the portion of the range falling within this child. */
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
		/* Propagate the child's (possibly raised) hint upward. */
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}
625 
/*
 * BLIST_RADIX_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
    blist_t dest, daddr_t count)
{
	daddr_t i, next_skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			/* Fully free leaf: free the whole span at once. */
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			/* Partially free: free each set (free) bit singly. */
			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}


	radix /= BLIST_META_RADIX;
	next_skip = skip / BLIST_META_RADIX;

	/* Partially free meta node: recurse into each child. */
	for (i = 1; count && i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, next_skip - 1, dest,
			    radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, next_skip - 1,
				    dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}
701 
702 /*
703  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
704  *
705  *	This routine allocates all blocks in the specified range
706  *	regardless of any existing allocations in that range.  Returns
707  *	the number of blocks allocated by the call.
708  */
709 static daddr_t
710 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
711 {
712 	int n = blk & (BLIST_BMAP_RADIX - 1);
713 	daddr_t nblks;
714 	u_daddr_t mask;
715 
716 	mask = ((u_daddr_t)-1 << n) &
717 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
718 
719 	/* Count the number of blocks that we are allocating. */
720 	nblks = bitcount64(scan->u.bmu_bitmap & mask);
721 
722 	scan->u.bmu_bitmap &= ~mask;
723 	return (nblks);
724 }
725 
/*
 * BLIST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
    daddr_t skip, daddr_t blk)
{
	daddr_t i, nblks, next_skip, v;
	int child;

	/* A terminator marks space beyond the created bitmap. */
	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0)  {
		/*
		 * ALL-ALLOCATED special case: either the range covers the
		 * whole node, or the node was already fully allocated.
		 * Either way the node ends up with no free blocks, and
		 * whatever was still free is what this call allocated.
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	/* Locate the first child overlapping the range being filled. */
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i <= skip && blk < allocBlk + count) {
		/* v is the portion of the range falling within this child. */
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix,
		    next_skip - 1, blk);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}
803 
/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 *
 *	Called with scan == NULL as a pure sizing pass (returns the node
 *	count without touching memory), and again with the real array to
 *	perform the initialization.
 */
static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
{
	daddr_t i, memindex, next_skip;

	memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			/* Leaves start fully allocated (all bits clear). */
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return (memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		/* Meta nodes also start with nothing free. */
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	radix /= BLIST_META_RADIX;
	next_skip = skip / BLIST_META_RADIX;

	for (i = 1; i <= skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    next_skip - 1, radix);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    next_skip - 1, count);
			count = 0;
		} else {
			/*
			 * Add terminator and break out
			 */
			if (scan)
				scan[i].bm_bighint = (daddr_t)-1;
			break;
		}
	}
	/* Account for this node itself (and any terminator slot). */
	if (memindex < i)
		memindex = i;
	return (memindex);
}
875 
876 #ifdef BLIST_DEBUG
877 
/*
 * blst_radix_print() -	recursively dump one subtree (debug builds only),
 *			indenting by 'tab' spaces per level.
 */
static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
    int tab)
{
	daddr_t i, next_skip;

	/* Leaf: print the raw bitmap and its hint. */
	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	/* Collapsed meta states: children are stale, don't descend. */
	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	next_skip = skip / BLIST_META_RADIX;
	tab += 4;

	/* Recurse into each child until a terminator is reached. */
	for (i = 1; i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, next_skip - 1, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}
946 
947 #endif
948 
949 #ifdef BLIST_DEBUG
950 
951 int
952 main(int ac, char **av)
953 {
954 	int size = 1024;
955 	int i;
956 	blist_t bl;
957 
958 	for (i = 1; i < ac; ++i) {
959 		const char *ptr = av[i];
960 		if (*ptr != '-') {
961 			size = strtol(ptr, NULL, 0);
962 			continue;
963 		}
964 		ptr += 2;
965 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
966 		exit(1);
967 	}
968 	bl = blist_create(size, M_WAITOK);
969 	blist_free(bl, 0, size);
970 
971 	for (;;) {
972 		char buf[1024];
973 		long long da = 0;
974 		long long count = 0;
975 
976 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
977 		    (long long)size, (long long)bl->bl_radix);
978 		fflush(stdout);
979 		if (fgets(buf, sizeof(buf), stdin) == NULL)
980 			break;
981 		switch(buf[0]) {
982 		case 'r':
983 			if (sscanf(buf + 1, "%lld", &count) == 1) {
984 				blist_resize(&bl, count, 1, M_WAITOK);
985 			} else {
986 				printf("?\n");
987 			}
988 		case 'p':
989 			blist_print(bl);
990 			break;
991 		case 'a':
992 			if (sscanf(buf + 1, "%lld", &count) == 1) {
993 				daddr_t blk = blist_alloc(bl, count);
994 				printf("    R=%08llx\n", (long long)blk);
995 			} else {
996 				printf("?\n");
997 			}
998 			break;
999 		case 'f':
1000 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1001 				blist_free(bl, da, count);
1002 			} else {
1003 				printf("?\n");
1004 			}
1005 			break;
1006 		case 'l':
1007 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1008 				printf("    n=%jd\n",
1009 				    (intmax_t)blist_fill(bl, da, count));
1010 			} else {
1011 				printf("?\n");
1012 			}
1013 			break;
1014 		case '?':
1015 		case 'h':
1016 			puts(
1017 			    "p          -print\n"
1018 			    "a %d       -allocate\n"
1019 			    "f %x %d    -free\n"
1020 			    "l %x %d    -fill\n"
1021 			    "r %d       -resize\n"
1022 			    "h/?        -help"
1023 			);
1024 			break;
1025 		default:
1026 			printf("?\n");
1027 			break;
1028 		}
1029 	}
1030 	return(0);
1031 }
1032 
/*
 * panic() -	userland stand-in for the kernel panic(): report the
 *		formatted message on stderr and terminate.
 */
void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	va_end(va);
	fprintf(stderr, "\n");
	exit(1);
}
1044 
1045 #endif
1046