/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 * 	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking features of the blist code are used in the swap code
 *	(vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower-level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation (see the worked example after this
 *	comment).  In order to support large radixes, portions of the tree
 *	may reside outside our memory allocation.  We handle this with an
 *	early-termination optimization (when bighint is set to -1) on the
 *	scan.  The memory allocation is only large enough to cover the
 *	number of blocks requested at creation time, even if those blocks
 *	must be encompassed by a larger root-node radix.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
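
/*
 * Worked layout example (a sketch, assuming the usual configuration of a
 * 64-bit u_daddr_t, so BLIST_BMAP_RADIX = 64, and BLIST_META_RADIX = 16):
 * a tree with radix 1024 covers up to 1024 blocks with 1024 / 64 = 16
 * leaves under a single root meta node.  In the linear array the root
 * occupies slot 0 and its 16 leaves occupy slots 1..16, so stepping from
 * one subtree root to the next at this level skips 17 slots.  With radix
 * 16384 there is one more level: root at slot 0, each child meta node
 * followed immediately by its own 16 leaves (slots 1..17, 18..34, ...),
 * for 1 + 16 * (1 + 16) = 273 slots in all.
 */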

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
static __inline int imax(int a, int b) { return (a > b ? a : b); }

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
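
/*
 * Continuing the layout example (a sketch, still assuming
 * BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16): radix_to_skip(1024)
 * computes 1024 / ((64 / 16) * 15) = 1024 / 60 = 17, matching the 1 + 16
 * nodes of a one-level subtree, and radix_to_skip(16384) computes
 * 16384 / 60 = 273, matching 1 + 16 + 256 nodes.  The remainders the
 * integer divisions discard are exactly what the derivation above
 * accounts for.
 */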

/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}
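
/*
 * For example, bitpos((u_daddr_t)1 << 4) returns 4 by either path.  An
 * observation, not a contract: given a mask with more than one bit set,
 * the ffsll path would report the lowest set bit while the binary search
 * would report the highest, which is why callers must pass a power of 2.
 */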

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t i, last_block;
	u_daddr_t nodes, radix, skip;
	int digit;

	if (blocks == 0)
		panic("invalid block count");

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	last_block = blocks - 1;
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		if (((last_block / radix + 1) & BLIST_META_MASK) != 0)
			/*
			 * We must widen the blist to avoid partially
			 * filled nodes.
			 */
			last_block |= radix - 1;
		radix *= BLIST_META_RADIX;
	}

	/*
	 * Count the meta-nodes in the expanded tree, including the final
	 * terminator, from the bottom level up to the root.
	 */
	nodes = 1;
	if (radix - blocks >= BLIST_BMAP_RADIX)
		nodes++;
	last_block /= BLIST_BMAP_RADIX;
	while (last_block > 0) {
		nodes += last_block + 1;
		last_block /= BLIST_META_RADIX;
	}
	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;

	/*
	 * Initialize the empty tree by filling in root values, then initialize
	 * just the terminators in the rest of the tree.
	 */
	bl->bl_root[0].bm_bighint = 0;
	if (radix == BLIST_BMAP_RADIX)
		bl->bl_root[0].u.bmu_bitmap = 0;
	else
		bl->bl_root[0].u.bmu_avail = 0;
	last_block = blocks - 1;
	i = 0;
	while (radix > BLIST_BMAP_RADIX) {
		radix /= BLIST_META_RADIX;
		skip = radix_to_skip(radix);
		digit = last_block / radix;
		i += 1 + digit * skip;
		if (digit != BLIST_META_MASK) {
			/*
			 * Add a terminator.
			 */
			bl->bl_root[i + skip].bm_bighint = (daddr_t)-1;
			bl->bl_root[i + skip].u.bmu_bitmap = 0;
		}
		last_block %= radix;
	}

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}
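
/*
 * A minimal usage sketch of the public interface (illustrative only; the
 * 2048-block size and the function name below are arbitrary): a new blist
 * starts with every block allocated, so the whole range is freed before
 * any blist_alloc() can succeed.
 */
#if 0
static void
blist_example(void)
{
	blist_t bl;
	daddr_t blk;

	bl = blist_create(2048, M_WAITOK);
	blist_free(bl, 0, 2048);	/* make all 2048 blocks allocatable */
	blk = blist_alloc(bl, 16);	/* carve out 16 contiguous blocks */
	if (blk != SWAPBLK_NONE)
		blist_free(bl, blk, 16);
	blist_destroy(bl);
}
#endif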

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  The range must
 *			be entirely allocated.  Panic if an inconsistency is
 *			found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits' or
 *                           'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;

	/*
	 * Maintain a running average in integer arithmetic.  The invariant
	 * sum = num * avg + err is preserved across updates, so no
	 * precision is lost to rounding.
	 */
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
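
/*
 * A worked instance of the update above (a sketch): after gaps of size 3
 * and then 4, the first update leaves num = 1, avg = 3, err = 0; the
 * second computes err = 0 + 4 - 3 = 1, num = 2, avg = 3 + 1 / 2 = 3, and
 * err = 1 % 2 = 1, so num * avg + err = 7, the exact sum of gap sizes.
 */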

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].u.bmu_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i + range1] are
		 * set in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts to
		 * 0, while preserving these invariants.  The updates to mask
		 * leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->u.bmu_bitmap.  If more updates to mask cannot clear
		 * more bits, because mask is partitioned with all 0 bits
		 * preceding all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all
		 * the blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that
	 * one, and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a
		 * successful allocation depends on the next leaf providing
		 * some of the blocks.
		 */
		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
			/*
			 * The next leaf has a different meta-node parent, so
			 * it is not necessarily initialized.  Update bighint,
			 * comparing the range found at the end of mask to the
			 * largest earlier range that could have been made to
			 * vanish in the initial processing of mask.
			 */
			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
			return (SWAPBLK_NONE);
		}
		hi -= BLIST_BMAP_RADIX;
		if (((scan[1].u.bmu_bitmap + 1) &
		    ~((u_daddr_t)-1 << hi)) != 0) {
			/*
			 * The next leaf doesn't have enough free blocks at
			 * the beginning to complete the spanning allocation.
			 * The hint cannot be updated, because the same
			 * allocation request could be satisfied later, by
			 * this leaf, if the state of the next leaf changes,
			 * and without any changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		}
		/* Clear the first 'hi' bits in the next leaf, allocating them. */
		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
		/* If this allocation uses all the bits, clear the hint. */
		if (mask == scan->u.bmu_bitmap)
			scan->bm_bighint = 0;
	}
	/* Clear the allocated bits from this leaf. */
	scan->u.bmu_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}
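
/*
 * A worked pass of the mask-folding loop above (a sketch): for count = 5,
 * count1 = 4 and num_shifts = 3.  The three iterations apply
 * mask &= mask >> 1, mask &= mask >> 1, and mask &= mask >> 2 (range_ext
 * takes the values 1, 1, and 2 as the bits of count1 are consumed), after
 * which range1 = 4 and any bit i still set in mask means bits [i, i + 4]
 * were all set originally, i.e. a run of at least 5 free blocks starting
 * at i.
 */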

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	blk = cursor & -radix;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int n;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
	if (scan->u.bmu_bitmap & mask)
		panic("freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;
	int n;

	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
				size = count;
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif