xref: /freebsd/sys/kern/subr_blist.c (revision 4d65a7c6951cea0333f1a0c1b32c38489cdfa6c5)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  A radix
40  *	constant defines the size of the bitmaps contained in a leaf node
41  *	and the number of descendents of each of the meta (interior) nodes.
42  *	Each subtree is associated with a range of blocks.  The root of any
43  *	subtree stores a hint field that defines an upper bound on the size
44  *	of the largest allocation that can begin in the associated block
45  *	range.  A hint is an upper bound on a potential allocation, but not
46  *	necessarily a tight upper bound.
47  *
48  *	The bitmap field in each node directs the search for available blocks.
49  *	For a leaf node, a bit is set if the corresponding block is free.  For a
50  *	meta node, a bit is set if the corresponding subtree contains a free
51  *	block somewhere within it.  The search at a meta node considers only
52  *	children of that node that represent a range that includes a free block.
53  *
54  * 	The hinting greatly increases code efficiency for allocations while
55  *	the general radix structure optimizes both allocations and frees.  The
56  *	radix tree should be able to operate well no matter how much
57  *	fragmentation there is and no matter how large a bitmap is used.
58  *
59  *	The blist code wires all necessary memory at creation time.  Neither
60  *	allocations nor frees require interaction with the memory subsystem.
61  *	The non-blocking nature of allocations and frees is required by swap
62  *	code (vm/swap_pager.c).
63  *
64  *	LAYOUT: The radix tree is laid out recursively using a linear array.
65  *	Each meta node is immediately followed (laid out sequentially in
66  *	memory) by BLIST_RADIX lower-level nodes.  This is a recursive
67  *	structure but one that can be easily scanned through a very simple
68  *	'skip' calculation.  The memory allocation is only large enough to
69  *	cover the number of blocks requested at creation time.  Nodes that
70  *	represent blocks beyond that limit, nodes that would never be read
71  *	or written, are not allocated, so that the last of the
72  *	BLIST_RADIX lower-level nodes of a some nodes may not be allocated.
73  *
74  *	NOTE: the allocator cannot currently allocate more than
75  *	BLIST_RADIX blocks per call.  It will panic with 'allocation too
76  *	large' if you try.  This is an area that could use improvement.  The
77  *	radix is large enough that this restriction does not effect the swap
78  *	system, though.  Currently only the allocation code is affected by
79  *	this algorithmic unfeature.  The freeing code can handle arbitrary
80  *	ranges.
81  *
82  *	This code can be compiled stand-alone for debugging.
83  */
84 
85 #include <sys/cdefs.h>
86 #ifdef _KERNEL
87 
88 #include <sys/param.h>
89 #include <sys/systm.h>
90 #include <sys/lock.h>
91 #include <sys/kernel.h>
92 #include <sys/blist.h>
93 #include <sys/malloc.h>
94 #include <sys/sbuf.h>
95 #include <sys/proc.h>
96 #include <sys/mutex.h>
97 
98 #else
99 
100 #ifndef BLIST_NO_DEBUG
101 #define BLIST_DEBUG
102 #endif
103 
104 #include <sys/errno.h>
105 #include <sys/types.h>
106 #include <sys/malloc.h>
107 #include <sys/sbuf.h>
108 #include <assert.h>
109 #include <stdio.h>
110 #include <string.h>
111 #include <stddef.h>
112 #include <stdlib.h>
113 #include <stdarg.h>
114 #include <stdbool.h>
115 
116 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
117 #define malloc(a,b,c)	calloc(a, 1)
118 #define free(a,b)	free(a)
119 #define ummin(a,b)	((a) < (b) ? (a) : (b))
120 #define imin(a,b)	((a) < (b) ? (a) : (b))
121 #define KASSERT(a,b)	assert(a)
122 
123 #include <sys/blist.h>
124 
125 #endif
126 
127 /*
128  * static support functions
129  */
130 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
131     int *count, int maxcount);
132 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
133     int maxcount, u_daddr_t radix);
134 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
135 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
136 		    u_daddr_t radix);
137 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
138 		    blist_t dest, daddr_t count);
139 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
140 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
141 		    u_daddr_t radix);
142 #ifndef _KERNEL
143 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
144 		    int tab);
145 #endif
146 
147 #ifdef _KERNEL
148 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
149 #endif
150 
151 #define	BLIST_MASK	(BLIST_RADIX - 1)
152 
153 /*
154  * For a subtree that can represent the state of up to 'radix' blocks, the
155  * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
156  * is short for BLIST_RADIX, then for a tree of height h with L=m**h
157  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
158  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
159  * in the 'meta' functions that process subtrees.  Since integer division
160  * discards remainders, we can express this computation as
161  * skip = (m * m**h) / (m - 1)
162  * skip = (m * (radix / m)) / (m - 1)
163  * skip = radix / (m - 1)
164  * so that simple integer division by a constant can safely be used for the
165  * calculation.
166  */
167 static inline daddr_t
168 radix_to_skip(daddr_t radix)
169 {
170 
171 	return (radix / BLIST_MASK);
172 }
173 
174 /*
175  * Provide a mask with count bits set, starting as position n.
176  */
177 static inline u_daddr_t
178 bitrange(int n, int count)
179 {
180 
181 	return (((u_daddr_t)-1 << n) &
182 	    ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
183 }
184 
185 static inline int
186 bitpos(u_daddr_t mask)
187 {
188 
189 	_Static_assert(sizeof(long long) >= sizeof(mask),
190 	    "mask too big for ffsll()");
191 	return (ffsll(mask) - 1);
192 }
193 
194 /*
195  * blist_create() - create a blist capable of handling up to the specified
196  *		    number of blocks
197  *
198  *	blocks - must be greater than 0
199  * 	flags  - malloc flags
200  *
201  *	The smallest blist consists of a single leaf node capable of
202  *	managing BLIST_RADIX blocks.
203  */
204 blist_t
205 blist_create(daddr_t blocks, int flags)
206 {
207 	blist_t bl;
208 	u_daddr_t nodes, radix;
209 
210 	KASSERT(blocks > 0, ("invalid block count"));
211 
212 	/*
213 	 * Calculate the radix and node count used for scanning.
214 	 */
215 	nodes = 1;
216 	for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0;
217 	    radix *= BLIST_RADIX)
218 		nodes += 1 + (blocks - 1) / BLIST_RADIX / radix;
219 
220 	/*
221 	 * Include a sentinel node to ensure that cross-leaf scans stay within
222 	 * the bounds of the allocation.
223 	 */
224 	if (blocks % BLIST_RADIX == 0)
225 		nodes++;
226 
227 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
228 	    M_ZERO);
229 	if (bl == NULL)
230 		return (NULL);
231 
232 	bl->bl_blocks = blocks;
233 	bl->bl_radix = radix;
234 
235 #if defined(BLIST_DEBUG)
236 	printf(
237 		"BLIST representing %lld blocks (%lld MB of swap)"
238 		", requiring %lldK of ram\n",
239 		(long long)bl->bl_blocks,
240 		(long long)bl->bl_blocks * 4 / 1024,
241 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
242 	);
243 	printf("BLIST raw radix tree contains %lld records\n",
244 	    (long long)nodes);
245 #endif
246 
247 	return (bl);
248 }
249 
250 void
251 blist_destroy(blist_t bl)
252 {
253 
254 	free(bl, M_SWAP);
255 }
256 
257 /*
258  * blist_alloc() -   reserve space in the block bitmap.  Return the base
259  *		     of a contiguous region or SWAPBLK_NONE if space could
260  *		     not be allocated.
261  */
262 daddr_t
263 blist_alloc(blist_t bl, int *count, int maxcount)
264 {
265 	daddr_t blk, cursor;
266 
267 	KASSERT(*count <= maxcount,
268 	    ("invalid parameters %d > %d", *count, maxcount));
269 	KASSERT(*count <= BLIST_MAX_ALLOC,
270 	    ("minimum allocation too large: %d", *count));
271 
272 	/*
273 	 * This loop iterates at most twice.  An allocation failure in the
274 	 * first iteration leads to a second iteration only if the cursor was
275 	 * non-zero.  When the cursor is zero, an allocation failure will
276 	 * stop further iterations.
277 	 */
278 	for (cursor = bl->bl_cursor;; cursor = 0) {
279 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
280 		    bl->bl_radix);
281 		if (blk != SWAPBLK_NONE) {
282 			bl->bl_avail -= *count;
283 			bl->bl_cursor = blk + *count;
284 			if (bl->bl_cursor == bl->bl_blocks)
285 				bl->bl_cursor = 0;
286 			return (blk);
287 		}
288 		if (cursor == 0)
289 			return (SWAPBLK_NONE);
290 	}
291 }
292 
293 /*
294  * blist_avail() -	return the number of free blocks.
295  */
296 daddr_t
297 blist_avail(blist_t bl)
298 {
299 
300 	return (bl->bl_avail);
301 }
302 
303 /*
304  * blist_free() -	free up space in the block bitmap.  Return the base
305  *		     	of a contiguous region.
306  */
307 void
308 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
309 {
310 
311 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
312 	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
313 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
314 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
315 	bl->bl_avail += count;
316 }
317 
318 /*
319  * blist_fill() -	mark a region in the block bitmap as off-limits
320  *			to the allocator (i.e. allocate it), ignoring any
321  *			existing allocations.  Return the number of blocks
322  *			actually filled that were free before the call.
323  */
324 daddr_t
325 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
326 {
327 	daddr_t filled;
328 
329 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
330 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
331 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
332 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
333 	bl->bl_avail -= filled;
334 	return (filled);
335 }
336 
337 /*
338  * blist_resize() -	resize an existing radix tree to handle the
339  *			specified number of blocks.  This will reallocate
340  *			the tree and transfer the previous bitmap to the new
341  *			one.  When extending the tree you can specify whether
342  *			the new blocks are to left allocated or freed.
343  */
344 void
345 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
346 {
347     blist_t newbl = blist_create(count, flags);
348     blist_t save = *pbl;
349 
350     *pbl = newbl;
351     if (count > save->bl_blocks)
352 	    count = save->bl_blocks;
353     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
354 
355     /*
356      * If resizing upwards, should we free the new space or not?
357      */
358     if (freenew && count < newbl->bl_blocks) {
359 	    blist_free(newbl, count, newbl->bl_blocks - count);
360     }
361     blist_destroy(save);
362 }
363 
364 #ifdef BLIST_DEBUG
365 
366 /*
367  * blist_print()    - dump radix tree
368  */
369 void
370 blist_print(blist_t bl)
371 {
372 	printf("BLIST avail = %jd, cursor = %08jx {\n",
373 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
374 
375 	if (bl->bl_root->bm_bitmap != 0)
376 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
377 	printf("}\n");
378 }
379 
380 #endif
381 
382 static const u_daddr_t fib[] = {
383 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
384 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
385 	514229, 832040, 1346269, 2178309, 3524578,
386 };
387 
388 /*
389  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
390  */
391 struct gap_stats {
392 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
393 	daddr_t	num;		/* number of gaps observed */
394 	daddr_t	max;		/* largest gap size */
395 	daddr_t	avg;		/* average gap size */
396 	daddr_t	err;		/* sum - num * avg */
397 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
398 	int	max_bucket;	/* last histo elt with nonzero val */
399 };
400 
401 /*
402  * gap_stats_counting()    - is the state 'counting 1 bits'?
403  *                           or 'skipping 0 bits'?
404  */
405 static inline bool
406 gap_stats_counting(const struct gap_stats *stats)
407 {
408 
409 	return (stats->start != SWAPBLK_NONE);
410 }
411 
412 /*
413  * init_gap_stats()    - initialize stats on gap sizes
414  */
415 static inline void
416 init_gap_stats(struct gap_stats *stats)
417 {
418 
419 	bzero(stats, sizeof(*stats));
420 	stats->start = SWAPBLK_NONE;
421 }
422 
423 /*
424  * update_gap_stats()    - update stats on gap sizes
425  */
426 static void
427 update_gap_stats(struct gap_stats *stats, daddr_t posn)
428 {
429 	daddr_t size;
430 	int hi, lo, mid;
431 
432 	if (!gap_stats_counting(stats)) {
433 		stats->start = posn;
434 		return;
435 	}
436 	size = posn - stats->start;
437 	stats->start = SWAPBLK_NONE;
438 	if (size > stats->max)
439 		stats->max = size;
440 
441 	/*
442 	 * Find the fibonacci range that contains size,
443 	 * expecting to find it in an early range.
444 	 */
445 	lo = 0;
446 	hi = 1;
447 	while (hi < nitems(fib) && fib[hi] <= size) {
448 		lo = hi;
449 		hi *= 2;
450 	}
451 	if (hi >= nitems(fib))
452 		hi = nitems(fib);
453 	while (lo + 1 != hi) {
454 		mid = (lo + hi) >> 1;
455 		if (fib[mid] <= size)
456 			lo = mid;
457 		else
458 			hi = mid;
459 	}
460 	stats->histo[lo]++;
461 	if (lo > stats->max_bucket)
462 		stats->max_bucket = lo;
463 	stats->err += size - stats->avg;
464 	stats->num++;
465 	stats->avg += stats->err / stats->num;
466 	stats->err %= stats->num;
467 }
468 
469 /*
470  * dump_gap_stats()    - print stats on gap sizes
471  */
472 static inline void
473 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
474 {
475 	int i;
476 
477 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
478 	    (intmax_t)stats->num);
479 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
480 	sbuf_printf(s, "average maximal free range size: %jd\n",
481 	    (intmax_t)stats->avg);
482 	sbuf_cat(s, "number of maximal free ranges of different sizes:\n");
483 	sbuf_cat(s, "               count  |  size range\n");
484 	sbuf_cat(s, "               -----  |  ----------\n");
485 	for (i = 0; i < stats->max_bucket; i++) {
486 		if (stats->histo[i] != 0) {
487 			sbuf_printf(s, "%20jd  |  ",
488 			    (intmax_t)stats->histo[i]);
489 			if (fib[i] != fib[i + 1] - 1)
490 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
491 				    (intmax_t)fib[i + 1] - 1);
492 			else
493 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
494 		}
495 	}
496 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
497 	if (stats->histo[i] > 1)
498 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
499 		    (intmax_t)stats->max);
500 	else
501 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
502 }
503 
504 /*
505  * blist_stats()    - dump radix tree stats
506  */
507 void
508 blist_stats(blist_t bl, struct sbuf *s)
509 {
510 	struct gap_stats gstats;
511 	struct gap_stats *stats = &gstats;
512 	daddr_t i, nodes, radix;
513 	u_daddr_t diff, mask;
514 	int digit;
515 
516 	init_gap_stats(stats);
517 	nodes = 0;
518 	radix = bl->bl_radix;
519 	for (i = 0; i < bl->bl_blocks; ) {
520 		/*
521 		 * Check for skippable subtrees starting at i.
522 		 */
523 		while (radix != 1) {
524 			if (bl->bl_root[nodes].bm_bitmap == 0) {
525 				if (gap_stats_counting(stats))
526 					update_gap_stats(stats, i);
527 				break;
528 			}
529 
530 			/*
531 			 * Skip subtree root.
532 			 */
533 			nodes++;
534 			radix /= BLIST_RADIX;
535 		}
536 		if (radix == 1) {
537 			/*
538 			 * Scan leaf.
539 			 */
540 			mask = bl->bl_root[nodes].bm_bitmap;
541 			diff = mask ^ (mask << 1);
542 			if (gap_stats_counting(stats))
543 				diff ^= 1;
544 			while (diff != 0) {
545 				digit = bitpos(diff);
546 				update_gap_stats(stats, i + digit);
547 				diff ^= bitrange(digit, 1);
548 			}
549 		}
550 		nodes += radix_to_skip(radix * BLIST_RADIX);
551 		i += radix * BLIST_RADIX;
552 
553 		/*
554 		 * Find max size subtree starting at i.
555 		 */
556 		for (radix = 1;
557 		    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
558 		    radix *= BLIST_RADIX)
559 			;
560 	}
561 	update_gap_stats(stats, i);
562 	dump_gap_stats(stats, s);
563 }
564 
565 /************************************************************************
566  *			  ALLOCATION SUPPORT FUNCTIONS			*
567  ************************************************************************
568  *
569  *	These support functions do all the actual work.  They may seem
570  *	rather longish, but that's because I've commented them up.  The
571  *	actual code is straight forward.
572  *
573  */
574 
575 /*
576  * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
577  *
578  *	'scan' is a leaf node, and its first block is at address 'start'.  The
579  *	next leaf node could be adjacent, or several nodes away if the least
580  *	common ancestor of 'scan' and its neighbor is several levels up.  Use
581  *	addresses to determine how many meta-nodes lie between the leaves.  If
582  *	sequence of leaves starting with the next one has enough initial bits
583  *	set, clear them and clear the bits in the meta nodes on the path up to
584  *	the least common ancestor to mark any subtrees made completely empty.
585  */
586 static int
587 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
588 {
589 	u_daddr_t radix;
590 	daddr_t blk;
591 	int avail, digit;
592 
593 	start += BLIST_RADIX;
594 	for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
595 		/* Skip meta-nodes, as long as they promise more free blocks. */
596 		radix = BLIST_RADIX;
597 		while (((++scan)->bm_bitmap & 1) == 1 &&
598 		    ((blk / radix) & BLIST_MASK) == 0)
599 			radix *= BLIST_RADIX;
600 		if (~scan->bm_bitmap != 0) {
601 			/*
602 			 * Either there is no next leaf with any free blocks,
603 			 * or we've reached the next leaf and found that some
604 			 * of its blocks are not free.  In the first case,
605 			 * bitpos() returns zero here.
606 			 */
607 			avail = blk - start + bitpos(~scan->bm_bitmap);
608 			if (avail < count || avail == 0) {
609 				/*
610 				 * There isn't a next leaf with enough free
611 				 * blocks at its beginning to bother
612 				 * allocating.
613 				 */
614 				return (avail);
615 			}
616 			maxcount = imin(avail, maxcount);
617 			if (maxcount % BLIST_RADIX == 0) {
618 				/*
619 				 * There was no next leaf.  Back scan up to
620 				 * last leaf.
621 				 */
622 				do {
623 					radix /= BLIST_RADIX;
624 					--scan;
625 				} while (radix != 1);
626 				blk -= BLIST_RADIX;
627 			}
628 		}
629 	}
630 
631 	/*
632 	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
633 	 * BLIST_RADIX bits to represent the allocation of those last blocks.
634 	 */
635 	if (maxcount % BLIST_RADIX != 0)
636 		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
637 	else
638 		scan->bm_bitmap = 0;
639 
640 	for (;;) {
641 		/* Back up over meta-nodes, clearing bits if necessary. */
642 		blk -= BLIST_RADIX;
643 		for (radix = BLIST_RADIX;
644 		    (digit = ((blk / radix) & BLIST_MASK)) == 0;
645 		    radix *= BLIST_RADIX) {
646 			if ((scan--)->bm_bitmap == 0)
647 				scan->bm_bitmap ^= 1;
648 		}
649 		if ((scan--)->bm_bitmap == 0)
650 			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
651 			    (u_daddr_t)1 << digit;
652 
653 		if (blk == start)
654 			break;
655 		/* Clear all the bits of this leaf. */
656 		scan->bm_bitmap = 0;
657 	}
658 	return (maxcount);
659 }
660 
661 /*
662  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
663  *
664  *	This function is the core of the allocator.  Its execution time is
665  *	proportional to log(count), plus height of the tree if the allocation
666  *	crosses a leaf boundary.
667  */
668 static daddr_t
669 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
670 {
671 	u_daddr_t mask;
672 	int bighint, count1, hi, lo, num_shifts;
673 
674 	count1 = *count - 1;
675 	num_shifts = fls(count1);
676 	mask = ~scan->bm_bitmap;
677 	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
678 		/*
679 		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
680 		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
681 		 * 0, while preserving this invariant.  The updates to mask
682 		 * leave fewer bits 0, but each bit that remains 0 represents a
683 		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
684 		 * more updates to mask cannot set more bits, because mask is
685 		 * partitioned with all 1 bits following all 0 bits, the loop
686 		 * terminates immediately.
687 		 */
688 		num_shifts--;
689 		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
690 	}
691 	bighint = count1 >> num_shifts;
692 	if (~mask == 0) {
693 		/*
694 		 * Update bighint.  There is no allocation bigger than
695 		 * count1 >> num_shifts starting in this leaf.
696 		 */
697 		scan->bm_bighint = bighint;
698 		return (SWAPBLK_NONE);
699 	}
700 
701 	/* Discard any candidates that appear before blk. */
702 	if ((blk & BLIST_MASK) != 0) {
703 		if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
704 			/* Grow bighint in case all discarded bits are set. */
705 			bighint += blk & BLIST_MASK;
706 			mask |= bitrange(0, blk & BLIST_MASK);
707 			if (~mask == 0) {
708 				scan->bm_bighint = bighint;
709 				return (SWAPBLK_NONE);
710 			}
711 		}
712 		blk -= blk & BLIST_MASK;
713 	}
714 
715 	/*
716 	 * The least significant set bit in mask marks the start of the first
717 	 * available range of sufficient size.  Find its position.
718 	 */
719 	lo = bitpos(~mask);
720 
721 	/*
722 	 * Find how much space is available starting at that position.
723 	 */
724 	if ((mask & (mask + 1)) != 0) {
725 		/* Count the 1 bits starting at position lo. */
726 		hi = bitpos(mask & (mask + 1)) + count1;
727 		if (maxcount < hi - lo)
728 			hi = lo + maxcount;
729 		*count = hi - lo;
730 		mask = ~bitrange(lo, *count);
731 	} else if (maxcount <= BLIST_RADIX - lo) {
732 		/* All the blocks we can use are available here. */
733 		hi = lo + maxcount;
734 		*count = maxcount;
735 		mask = ~bitrange(lo, *count);
736 		if (hi == BLIST_RADIX)
737 			scan->bm_bighint = bighint;
738 	} else {
739 		/* Check next leaf for some of the blocks we want or need. */
740 		count1 = *count - (BLIST_RADIX - lo);
741 		maxcount -= BLIST_RADIX - lo;
742 		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
743 		if (hi < count1)
744 			/*
745 			 * The next leaf cannot supply enough blocks to reach
746 			 * the minimum required allocation.  The hint cannot be
747 			 * updated, because the same allocation request could
748 			 * be satisfied later, by this leaf, if the state of
749 			 * the next leaf changes, and without any changes to
750 			 * this leaf.
751 			 */
752 			return (SWAPBLK_NONE);
753 		*count = BLIST_RADIX - lo + hi;
754 		scan->bm_bighint = bighint;
755 	}
756 
757 	/* Clear the allocated bits from this leaf. */
758 	scan->bm_bitmap &= mask;
759 	return (blk + lo);
760 }
761 
762 /*
763  * blist_meta_alloc() -	allocate at a meta in the radix tree.
764  *
765  *	Attempt to allocate at a meta node.  If we can't, we update
766  *	bighint and return a failure.  Updating bighint optimize future
767  *	calls that hit this node.  We have to check for our collapse cases
768  *	and we have a few optimizations strewn in as well.
769  */
770 static daddr_t
771 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
772     int maxcount, u_daddr_t radix)
773 {
774 	daddr_t blk, i, r, skip;
775 	u_daddr_t mask;
776 	bool scan_from_start;
777 	int digit;
778 
779 	if (radix == 1)
780 		return (blst_leaf_alloc(scan, cursor, count, maxcount));
781 	blk = cursor & -(radix * BLIST_RADIX);
782 	scan_from_start = (cursor == blk);
783 	skip = radix_to_skip(radix);
784 	mask = scan->bm_bitmap;
785 
786 	/* Discard any candidates that appear before cursor. */
787 	digit = (cursor / radix) & BLIST_MASK;
788 	mask &= (u_daddr_t)-1 << digit;
789 	if (mask == 0)
790 		return (SWAPBLK_NONE);
791 
792 	/*
793 	 * If the first try is for a block that includes the cursor, pre-undo
794 	 * the digit * radix offset in the first call; otherwise, ignore the
795 	 * cursor entirely.
796 	 */
797 	if (((mask >> digit) & 1) == 1)
798 		cursor -= digit * radix;
799 	else
800 		cursor = blk;
801 
802 	/*
803 	 * Examine the nonempty subtree associated with each bit set in mask.
804 	 */
805 	do {
806 		digit = bitpos(mask);
807 		i = 1 + digit * skip;
808 		if (*count <= scan[i].bm_bighint) {
809 			/*
810 			 * The allocation might fit beginning in the i'th subtree.
811 			 */
812 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
813 			    count, maxcount, radix / BLIST_RADIX);
814 			if (r != SWAPBLK_NONE) {
815 				if (scan[i].bm_bitmap == 0)
816 					scan->bm_bitmap ^= bitrange(digit, 1);
817 				return (r);
818 			}
819 		}
820 		cursor = blk;
821 	} while ((mask ^= bitrange(digit, 1)) != 0);
822 
823 	/*
824 	 * We couldn't allocate count in this subtree.  If the whole tree was
825 	 * scanned, and the last tree node is allocated, update bighint.
826 	 */
827 	if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
828 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
829 		scan->bm_bighint = *count - 1;
830 
831 	return (SWAPBLK_NONE);
832 }
833 
834 /*
835  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
836  *
837  */
838 static void
839 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
840 {
841 	u_daddr_t mask;
842 
843 	/*
844 	 * free some data in this bitmap
845 	 * mask=0000111111111110000
846 	 *          \_________/\__/
847 	 *		count   n
848 	 */
849 	mask = bitrange(blk & BLIST_MASK, count);
850 	KASSERT((scan->bm_bitmap & mask) == 0,
851 	    ("freeing free block: %jx, size %d, mask %jx",
852 	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
853 	scan->bm_bitmap |= mask;
854 }
855 
856 /*
857  * BLST_META_FREE() - free allocated blocks from radix tree meta info
858  *
859  *	This support routine frees a range of blocks from the bitmap.
860  *	The range must be entirely enclosed by this radix node.  If a
861  *	meta node, we break the range down recursively to free blocks
862  *	in subnodes (which means that this code can free an arbitrary
863  *	range whereas the allocation code cannot allocate an arbitrary
864  *	range).
865  */
866 static void
867 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
868 {
869 	daddr_t blk, endBlk, i, skip;
870 	int digit, endDigit;
871 
872 	/*
873 	 * We could probably do a better job here.  We are required to make
874 	 * bighint at least as large as the biggest allocable block of data.
875 	 * If we just shoehorn it, a little extra overhead will be incurred
876 	 * on the next allocation (but only that one typically).
877 	 */
878 	scan->bm_bighint = BLIST_MAX_ALLOC;
879 
880 	if (radix == 1)
881 		return (blst_leaf_free(scan, freeBlk, count));
882 
883 	endBlk = freeBlk + count;
884 	blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
885 	/*
886 	 * blk is first block past the end of the range of this meta node,
887 	 * or 0 in case of overflow.
888 	 */
889 	if (blk != 0)
890 		endBlk = ummin(endBlk, blk);
891 	skip = radix_to_skip(radix);
892 	blk = freeBlk & -radix;
893 	digit = (blk / radix) & BLIST_MASK;
894 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
895 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
896 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
897 		blk += radix;
898 		count = ummin(blk, endBlk) - freeBlk;
899 		blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
900 		freeBlk = blk;
901 	}
902 }
903 
904 /*
905  * BLST_COPY() - copy one radix tree to another
906  *
907  *	Locates free space in the source tree and frees it in the destination
908  *	tree.  The space may not already be free in the destination.
909  */
910 static void
911 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
912     daddr_t count)
913 {
914 	daddr_t endBlk, i, skip;
915 
916 	/*
917 	 * Leaf node
918 	 */
919 
920 	if (radix == 1) {
921 		u_daddr_t v = scan->bm_bitmap;
922 
923 		if (v == (u_daddr_t)-1) {
924 			blist_free(dest, blk, count);
925 		} else if (v != 0) {
926 			int i;
927 
928 			for (i = 0; i < count; ++i) {
929 				if (v & ((u_daddr_t)1 << i))
930 					blist_free(dest, blk + i, 1);
931 			}
932 		}
933 		return;
934 	}
935 
936 	/*
937 	 * Meta node
938 	 */
939 
940 	if (scan->bm_bitmap == 0) {
941 		/*
942 		 * Source all allocated, leave dest allocated
943 		 */
944 		return;
945 	}
946 
947 	endBlk = blk + count;
948 	skip = radix_to_skip(radix);
949 	for (i = 1; blk < endBlk; i += skip) {
950 		blk += radix;
951 		count = radix;
952 		if (blk >= endBlk)
953 			count -= blk - endBlk;
954 		blst_copy(&scan[i], blk - radix,
955 		    radix / BLIST_RADIX, dest, count);
956 	}
957 }
958 
959 /*
960  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
961  *
962  *	This routine allocates all blocks in the specified range
963  *	regardless of any existing allocations in that range.  Returns
964  *	the number of blocks allocated by the call.
965  */
966 static daddr_t
967 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
968 {
969 	daddr_t nblks;
970 	u_daddr_t mask;
971 
972 	mask = bitrange(blk & BLIST_MASK, count);
973 
974 	/* Count the number of blocks that we are allocating. */
975 	nblks = bitcount64(scan->bm_bitmap & mask);
976 
977 	scan->bm_bitmap &= ~mask;
978 	return (nblks);
979 }
980 
981 /*
982  * BLIST_META_FILL() -	allocate specific blocks at a meta node
983  *
984  *	This routine allocates the specified range of blocks,
985  *	regardless of any existing allocations in the range.  The
986  *	range must be within the extent of this node.  Returns the
987  *	number of blocks allocated by the call.
988  */
989 static daddr_t
990 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
991 {
992 	daddr_t blk, endBlk, i, nblks, skip;
993 	int digit;
994 
995 	if (radix == 1)
996 		return (blst_leaf_fill(scan, allocBlk, count));
997 
998 	endBlk = allocBlk + count;
999 	blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
1000 	/*
1001 	 * blk is first block past the end of the range of this meta node,
1002 	 * or 0 in case of overflow.
1003 	 */
1004 	if (blk != 0)
1005 		endBlk = ummin(endBlk, blk);
1006 	skip = radix_to_skip(radix);
1007 	blk = allocBlk & -radix;
1008 	nblks = 0;
1009 	while (blk < endBlk) {
1010 		digit = (blk / radix) & BLIST_MASK;
1011 		i = 1 + digit * skip;
1012 		blk += radix;
1013 		count = ummin(blk, endBlk) - allocBlk;
1014 		nblks += blst_meta_fill(&scan[i], allocBlk, count,
1015 		    radix / BLIST_RADIX);
1016 		if (scan[i].bm_bitmap == 0)
1017 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1018 		allocBlk = blk;
1019 	}
1020 	return (nblks);
1021 }
1022 
1023 #ifdef BLIST_DEBUG
1024 
1025 static void
1026 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1027 {
1028 	daddr_t skip;
1029 	u_daddr_t mask;
1030 	int digit;
1031 
1032 	if (radix == 1) {
1033 		printf(
1034 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1035 		    tab, tab, "",
1036 		    (long long)blk, (long long)BLIST_RADIX,
1037 		    (int)(1 + (BLIST_RADIX - 1) / 4),
1038 		    (long long)scan->bm_bitmap,
1039 		    (long long)scan->bm_bighint
1040 		);
1041 		return;
1042 	}
1043 
1044 	printf(
1045 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1046 	    tab, tab, "",
1047 	    (long long)blk, (long long)radix * BLIST_RADIX,
1048 	    (long long)radix * BLIST_RADIX,
1049 	    (int)(1 + (BLIST_RADIX - 1) / 4),
1050 	    (long long)scan->bm_bitmap,
1051 	    (long long)scan->bm_bighint
1052 	);
1053 
1054 	skip = radix_to_skip(radix);
1055 	tab += 4;
1056 
1057 	mask = scan->bm_bitmap;
1058 	/* Examine the nonempty subtree associated with each bit set in mask */
1059 	do {
1060 		digit = bitpos(mask);
1061 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1062 		    radix / BLIST_RADIX, tab);
1063 	} while ((mask ^= bitrange(digit, 1)) != 0);
1064 	tab -= 4;
1065 
1066 	printf(
1067 	    "%*.*s}\n",
1068 	    tab, tab, ""
1069 	);
1070 }
1071 
1072 #endif
1073 
1074 #ifdef BLIST_DEBUG
1075 
1076 int
1077 main(int ac, char **av)
1078 {
1079 	daddr_t size = BLIST_RADIX * BLIST_RADIX;
1080 	int i;
1081 	blist_t bl;
1082 	struct sbuf *s;
1083 
1084 	for (i = 1; i < ac; ++i) {
1085 		const char *ptr = av[i];
1086 		if (*ptr != '-') {
1087 			size = strtoll(ptr, NULL, 0);
1088 			continue;
1089 		}
1090 		ptr += 2;
1091 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1092 		exit(1);
1093 	}
1094 	bl = blist_create(size, M_WAITOK);
1095 	if (bl == NULL) {
1096 		fprintf(stderr, "blist_create failed\n");
1097 		exit(1);
1098 	}
1099 	blist_free(bl, 0, size);
1100 
1101 	for (;;) {
1102 		char buf[1024];
1103 		long long da = 0;
1104 		int count = 0, maxcount = 0;
1105 
1106 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1107 		    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
1108 		fflush(stdout);
1109 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1110 			break;
1111 		switch(buf[0]) {
1112 		case 'r':
1113 			if (sscanf(buf + 1, "%d", &count) == 1) {
1114 				blist_resize(&bl, count, 1, M_WAITOK);
1115 			} else {
1116 				printf("?\n");
1117 			}
1118 		case 'p':
1119 			blist_print(bl);
1120 			break;
1121 		case 's':
1122 			s = sbuf_new_auto();
1123 			blist_stats(bl, s);
1124 			sbuf_finish(s);
1125 			printf("%s", sbuf_data(s));
1126 			sbuf_delete(s);
1127 			break;
1128 		case 'a':
1129 			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1130 				daddr_t blk = blist_alloc(bl, &count, maxcount);
1131 				printf("    R=%08llx, c=%08d\n",
1132 				    (long long)blk, count);
1133 			} else {
1134 				printf("?\n");
1135 			}
1136 			break;
1137 		case 'f':
1138 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1139 				blist_free(bl, da, count);
1140 			} else {
1141 				printf("?\n");
1142 			}
1143 			break;
1144 		case 'l':
1145 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1146 				printf("    n=%jd\n",
1147 				    (intmax_t)blist_fill(bl, da, count));
1148 			} else {
1149 				printf("?\n");
1150 			}
1151 			break;
1152 		case '?':
1153 		case 'h':
1154 			puts(
1155 			    "p          -print\n"
1156 			    "s          -stats\n"
1157 			    "a %d %d    -allocate\n"
1158 			    "f %x %d    -free\n"
1159 			    "l %x %d    -fill\n"
1160 			    "r %d       -resize\n"
1161 			    "h/?        -help\n"
1162 			    "q          -quit"
1163 			);
1164 			break;
1165 		case 'q':
1166 			break;
1167 		default:
1168 			printf("?\n");
1169 			break;
1170 		}
1171 		if (buf[0] == 'q')
1172 			break;
1173 	}
1174 	return (0);
1175 }
1176 
1177 #endif
1178