xref: /freebsd/sys/kern/subr_blist.c (revision 6132212808e8dccedc9e5d85fea4390c2f38059a)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  A radix
40  *	constant defines the size of the bitmaps contained in a leaf node
41  *	and the number of descendents of each of the meta (interior) nodes.
42  *	Each subtree is associated with a range of blocks.  The root of any
43  *	subtree stores a hint field that defines an upper bound on the size
44  *	of the largest allocation that can begin in the associated block
45  *	range.  A hint is an upper bound on a potential allocation, but not
46  *	necessarily a tight upper bound.
47  *
48  *	The bitmap field in each node directs the search for available blocks.
49  *	For a leaf node, a bit is set if the corresponding block is free.  For a
50  *	meta node, a bit is set if the corresponding subtree contains a free
51  *	block somewhere within it.  The search at a meta node considers only
52  *	children of that node that represent a range that includes a free block.
53  *
54  * 	The hinting greatly increases code efficiency for allocations while
55  *	the general radix structure optimizes both allocations and frees.  The
56  *	radix tree should be able to operate well no matter how much
57  *	fragmentation there is and no matter how large a bitmap is used.
58  *
59  *	The blist code wires all necessary memory at creation time.  Neither
60  *	allocations nor frees require interaction with the memory subsystem.
61  *	The non-blocking nature of allocations and frees is required by swap
62  *	code (vm/swap_pager.c).
63  *
64  *	LAYOUT: The radix tree is laid out recursively using a linear array.
65  *	Each meta node is immediately followed (laid out sequentially in
66  *	memory) by BLIST_RADIX lower-level nodes.  This is a recursive
67  *	structure but one that can be easily scanned through a very simple
68  *	'skip' calculation.  The memory allocation is only large enough to
69  *	cover the number of blocks requested at creation time.  Nodes that
70  *	represent blocks beyond that limit, nodes that would never be read
71  *	or written, are not allocated, so that the last of the
72  *	BLIST_RADIX lower-level nodes of a some nodes may not be allocated.
73  *
74  *	NOTE: the allocator cannot currently allocate more than
75  *	BLIST_RADIX blocks per call.  It will panic with 'allocation too
76  *	large' if you try.  This is an area that could use improvement.  The
77  *	radix is large enough that this restriction does not effect the swap
78  *	system, though.  Currently only the allocation code is affected by
79  *	this algorithmic unfeature.  The freeing code can handle arbitrary
80  *	ranges.
81  *
82  *	This code can be compiled stand-alone for debugging.
83  */
84 
85 #include <sys/cdefs.h>
86 __FBSDID("$FreeBSD$");
87 
88 #ifdef _KERNEL
89 
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/lock.h>
93 #include <sys/kernel.h>
94 #include <sys/blist.h>
95 #include <sys/malloc.h>
96 #include <sys/sbuf.h>
97 #include <sys/proc.h>
98 #include <sys/mutex.h>
99 
100 #else
101 
102 #ifndef BLIST_NO_DEBUG
103 #define BLIST_DEBUG
104 #endif
105 
106 #include <sys/errno.h>
107 #include <sys/types.h>
108 #include <sys/malloc.h>
109 #include <sys/sbuf.h>
110 #include <assert.h>
111 #include <stdio.h>
112 #include <string.h>
113 #include <stddef.h>
114 #include <stdlib.h>
115 #include <stdarg.h>
116 #include <stdbool.h>
117 
118 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
119 #define malloc(a,b,c)	calloc(a, 1)
120 #define free(a,b)	free(a)
121 #define ummin(a,b)	((a) < (b) ? (a) : (b))
122 #define imin(a,b)	((a) < (b) ? (a) : (b))
123 #define KASSERT(a,b)	assert(a)
124 
125 #include <sys/blist.h>
126 
127 #endif
128 
129 /*
130  * static support functions
131  */
132 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
133     int *count, int maxcount);
134 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
135     int maxcount, u_daddr_t radix);
136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
138 		    u_daddr_t radix);
139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
140 		    blist_t dest, daddr_t count);
141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
143 		    u_daddr_t radix);
144 #ifndef _KERNEL
145 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
146 		    int tab);
147 #endif
148 
149 #ifdef _KERNEL
150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
151 #endif
152 
153 #define	BLIST_MASK	(BLIST_RADIX - 1)
154 
155 /*
156  * For a subtree that can represent the state of up to 'radix' blocks, the
157  * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
158  * is short for BLIST_RADIX, then for a tree of height h with L=m**h
159  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
160  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
161  * in the 'meta' functions that process subtrees.  Since integer division
162  * discards remainders, we can express this computation as
163  * skip = (m * m**h) / (m - 1)
164  * skip = (m * (radix / m)) / (m - 1)
165  * skip = radix / (m - 1)
166  * so that simple integer division by a constant can safely be used for the
167  * calculation.
168  */
169 static inline daddr_t
170 radix_to_skip(daddr_t radix)
171 {
172 
173 	return (radix / BLIST_MASK);
174 }
175 
176 /*
177  * Provide a mask with count bits set, starting as position n.
178  */
179 static inline u_daddr_t
180 bitrange(int n, int count)
181 {
182 
183 	return (((u_daddr_t)-1 << n) &
184 	    ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
185 }
186 
187 /*
188  * Find the first bit set in a u_daddr_t.
189  */
190 static inline int
191 generic_bitpos(u_daddr_t mask)
192 {
193 	int hi, lo, mid;
194 
195 	lo = 0;
196 	hi = BLIST_RADIX;
197 	while (lo + 1 < hi) {
198 		mid = (lo + hi) >> 1;
199 		if (mask & bitrange(0, mid))
200 			hi = mid;
201 		else
202 			lo = mid;
203 	}
204 	return (lo);
205 }
206 
207 static inline int
208 bitpos(u_daddr_t mask)
209 {
210 
211 	switch (sizeof(mask)) {
212 #ifdef HAVE_INLINE_FFSLL
213 	case sizeof(long long):
214 		return (ffsll(mask) - 1);
215 #endif
216 #ifdef HAVE_INLINE_FFS
217 	case sizeof(int):
218 		return (ffs(mask) - 1);
219 #endif
220 	default:
221 		return (generic_bitpos(mask));
222 	}
223 }
224 
225 /*
226  * blist_create() - create a blist capable of handling up to the specified
227  *		    number of blocks
228  *
229  *	blocks - must be greater than 0
230  * 	flags  - malloc flags
231  *
232  *	The smallest blist consists of a single leaf node capable of
233  *	managing BLIST_RADIX blocks.
234  */
235 blist_t
236 blist_create(daddr_t blocks, int flags)
237 {
238 	blist_t bl;
239 	u_daddr_t nodes, radix;
240 
241 	KASSERT(blocks > 0, ("invalid block count"));
242 
243 	/*
244 	 * Calculate the radix and node count used for scanning.
245 	 */
246 	nodes = 1;
247 	for (radix = 1; radix <= blocks / BLIST_RADIX; radix *= BLIST_RADIX)
248 		nodes += 1 + (blocks - 1) / radix / BLIST_RADIX;
249 
250 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
251 	    M_ZERO);
252 	if (bl == NULL)
253 		return (NULL);
254 
255 	bl->bl_blocks = blocks;
256 	bl->bl_radix = radix;
257 
258 #if defined(BLIST_DEBUG)
259 	printf(
260 		"BLIST representing %lld blocks (%lld MB of swap)"
261 		", requiring %lldK of ram\n",
262 		(long long)bl->bl_blocks,
263 		(long long)bl->bl_blocks * 4 / 1024,
264 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
265 	);
266 	printf("BLIST raw radix tree contains %lld records\n",
267 	    (long long)nodes);
268 #endif
269 
270 	return (bl);
271 }
272 
273 void
274 blist_destroy(blist_t bl)
275 {
276 
277 	free(bl, M_SWAP);
278 }
279 
280 /*
281  * blist_alloc() -   reserve space in the block bitmap.  Return the base
282  *		     of a contiguous region or SWAPBLK_NONE if space could
283  *		     not be allocated.
284  */
285 daddr_t
286 blist_alloc(blist_t bl, int *count, int maxcount)
287 {
288 	daddr_t blk, cursor;
289 
290 	KASSERT(*count <= maxcount,
291 	    ("invalid parameters %d > %d", *count, maxcount));
292 	KASSERT(*count <= BLIST_MAX_ALLOC,
293 	    ("minimum allocation too large: %d", *count));
294 
295 	/*
296 	 * This loop iterates at most twice.  An allocation failure in the
297 	 * first iteration leads to a second iteration only if the cursor was
298 	 * non-zero.  When the cursor is zero, an allocation failure will
299 	 * stop further iterations.
300 	 */
301 	for (cursor = bl->bl_cursor;; cursor = 0) {
302 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
303 		    bl->bl_radix);
304 		if (blk != SWAPBLK_NONE) {
305 			bl->bl_avail -= *count;
306 			bl->bl_cursor = blk + *count;
307 			if (bl->bl_cursor == bl->bl_blocks)
308 				bl->bl_cursor = 0;
309 			return (blk);
310 		}
311 		if (cursor == 0)
312 			return (SWAPBLK_NONE);
313 	}
314 }
315 
316 /*
317  * blist_avail() -	return the number of free blocks.
318  */
319 daddr_t
320 blist_avail(blist_t bl)
321 {
322 
323 	return (bl->bl_avail);
324 }
325 
326 /*
327  * blist_free() -	free up space in the block bitmap.  Return the base
328  *		     	of a contiguous region.
329  */
330 void
331 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
332 {
333 
334 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
335 	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
336 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
337 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
338 	bl->bl_avail += count;
339 }
340 
341 /*
342  * blist_fill() -	mark a region in the block bitmap as off-limits
343  *			to the allocator (i.e. allocate it), ignoring any
344  *			existing allocations.  Return the number of blocks
345  *			actually filled that were free before the call.
346  */
347 daddr_t
348 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
349 {
350 	daddr_t filled;
351 
352 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
353 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
354 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
355 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
356 	bl->bl_avail -= filled;
357 	return (filled);
358 }
359 
360 /*
361  * blist_resize() -	resize an existing radix tree to handle the
362  *			specified number of blocks.  This will reallocate
363  *			the tree and transfer the previous bitmap to the new
364  *			one.  When extending the tree you can specify whether
365  *			the new blocks are to left allocated or freed.
366  */
367 void
368 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
369 {
370     blist_t newbl = blist_create(count, flags);
371     blist_t save = *pbl;
372 
373     *pbl = newbl;
374     if (count > save->bl_blocks)
375 	    count = save->bl_blocks;
376     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
377 
378     /*
379      * If resizing upwards, should we free the new space or not?
380      */
381     if (freenew && count < newbl->bl_blocks) {
382 	    blist_free(newbl, count, newbl->bl_blocks - count);
383     }
384     blist_destroy(save);
385 }
386 
387 #ifdef BLIST_DEBUG
388 
389 /*
390  * blist_print()    - dump radix tree
391  */
392 void
393 blist_print(blist_t bl)
394 {
395 	printf("BLIST avail = %jd, cursor = %08jx {\n",
396 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
397 
398 	if (bl->bl_root->bm_bitmap != 0)
399 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
400 	printf("}\n");
401 }
402 
403 #endif
404 
405 static const u_daddr_t fib[] = {
406 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
407 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
408 	514229, 832040, 1346269, 2178309, 3524578,
409 };
410 
411 /*
412  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
413  */
414 struct gap_stats {
415 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
416 	daddr_t	num;		/* number of gaps observed */
417 	daddr_t	max;		/* largest gap size */
418 	daddr_t	avg;		/* average gap size */
419 	daddr_t	err;		/* sum - num * avg */
420 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
421 	int	max_bucket;	/* last histo elt with nonzero val */
422 };
423 
424 /*
425  * gap_stats_counting()    - is the state 'counting 1 bits'?
426  *                           or 'skipping 0 bits'?
427  */
428 static inline bool
429 gap_stats_counting(const struct gap_stats *stats)
430 {
431 
432 	return (stats->start != SWAPBLK_NONE);
433 }
434 
435 /*
436  * init_gap_stats()    - initialize stats on gap sizes
437  */
438 static inline void
439 init_gap_stats(struct gap_stats *stats)
440 {
441 
442 	bzero(stats, sizeof(*stats));
443 	stats->start = SWAPBLK_NONE;
444 }
445 
446 /*
447  * update_gap_stats()    - update stats on gap sizes
448  */
449 static void
450 update_gap_stats(struct gap_stats *stats, daddr_t posn)
451 {
452 	daddr_t size;
453 	int hi, lo, mid;
454 
455 	if (!gap_stats_counting(stats)) {
456 		stats->start = posn;
457 		return;
458 	}
459 	size = posn - stats->start;
460 	stats->start = SWAPBLK_NONE;
461 	if (size > stats->max)
462 		stats->max = size;
463 
464 	/*
465 	 * Find the fibonacci range that contains size,
466 	 * expecting to find it in an early range.
467 	 */
468 	lo = 0;
469 	hi = 1;
470 	while (hi < nitems(fib) && fib[hi] <= size) {
471 		lo = hi;
472 		hi *= 2;
473 	}
474 	if (hi >= nitems(fib))
475 		hi = nitems(fib);
476 	while (lo + 1 != hi) {
477 		mid = (lo + hi) >> 1;
478 		if (fib[mid] <= size)
479 			lo = mid;
480 		else
481 			hi = mid;
482 	}
483 	stats->histo[lo]++;
484 	if (lo > stats->max_bucket)
485 		stats->max_bucket = lo;
486 	stats->err += size - stats->avg;
487 	stats->num++;
488 	stats->avg += stats->err / stats->num;
489 	stats->err %= stats->num;
490 }
491 
492 /*
493  * dump_gap_stats()    - print stats on gap sizes
494  */
495 static inline void
496 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
497 {
498 	int i;
499 
500 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
501 	    (intmax_t)stats->num);
502 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
503 	sbuf_printf(s, "average maximal free range size: %jd\n",
504 	    (intmax_t)stats->avg);
505 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
506 	sbuf_printf(s, "               count  |  size range\n");
507 	sbuf_printf(s, "               -----  |  ----------\n");
508 	for (i = 0; i < stats->max_bucket; i++) {
509 		if (stats->histo[i] != 0) {
510 			sbuf_printf(s, "%20jd  |  ",
511 			    (intmax_t)stats->histo[i]);
512 			if (fib[i] != fib[i + 1] - 1)
513 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
514 				    (intmax_t)fib[i + 1] - 1);
515 			else
516 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
517 		}
518 	}
519 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
520 	if (stats->histo[i] > 1)
521 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
522 		    (intmax_t)stats->max);
523 	else
524 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
525 }
526 
527 /*
528  * blist_stats()    - dump radix tree stats
529  */
530 void
531 blist_stats(blist_t bl, struct sbuf *s)
532 {
533 	struct gap_stats gstats;
534 	struct gap_stats *stats = &gstats;
535 	daddr_t i, nodes, radix;
536 	u_daddr_t diff, mask;
537 	int digit;
538 
539 	init_gap_stats(stats);
540 	nodes = 0;
541 	radix = bl->bl_radix;
542 	for (i = 0; i < bl->bl_blocks; ) {
543 		/*
544 		 * Check for skippable subtrees starting at i.
545 		 */
546 		while (radix != 1) {
547 			if (bl->bl_root[nodes].bm_bitmap == 0) {
548 				if (gap_stats_counting(stats))
549 					update_gap_stats(stats, i);
550 				break;
551 			}
552 
553 			/*
554 			 * Skip subtree root.
555 			 */
556 			nodes++;
557 			radix /= BLIST_RADIX;
558 		}
559 		if (radix == 1) {
560 			/*
561 			 * Scan leaf.
562 			 */
563 			mask = bl->bl_root[nodes].bm_bitmap;
564 			diff = mask ^ (mask << 1);
565 			if (gap_stats_counting(stats))
566 				diff ^= 1;
567 			while (diff != 0) {
568 				digit = bitpos(diff);
569 				update_gap_stats(stats, i + digit);
570 				diff ^= bitrange(digit, 1);
571 			}
572 		}
573 		nodes += radix_to_skip(radix * BLIST_RADIX);
574 		i += radix * BLIST_RADIX;
575 
576 		/*
577 		 * Find max size subtree starting at i.
578 		 */
579 		for (radix = 1;
580 		    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
581 		    radix *= BLIST_RADIX)
582 			;
583 	}
584 	update_gap_stats(stats, i);
585 	dump_gap_stats(stats, s);
586 }
587 
588 /************************************************************************
589  *			  ALLOCATION SUPPORT FUNCTIONS			*
590  ************************************************************************
591  *
592  *	These support functions do all the actual work.  They may seem
593  *	rather longish, but that's because I've commented them up.  The
594  *	actual code is straight forward.
595  *
596  */
597 
598 /*
599  * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
600  *
601  *	'scan' is a leaf node, and its first block is at address 'start'.  The
602  *	next leaf node could be adjacent, or several nodes away if the least
603  *	common ancestor of 'scan' and its neighbor is several levels up.  Use
604  *	addresses to determine how many meta-nodes lie between the leaves.  If
605  *	sequence of leaves starting with the next one has enough initial bits
606  *	set, clear them and clear the bits in the meta nodes on the path up to
607  *	the least common ancestor to mark any subtrees made completely empty.
608  */
609 static int
610 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
611 {
612 	u_daddr_t radix;
613 	daddr_t blk;
614 	int avail, digit;
615 
616 	start += BLIST_RADIX;
617 	for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
618 		/* Skip meta-nodes, as long as they promise more free blocks. */
619 		radix = BLIST_RADIX;
620 		while (((++scan)->bm_bitmap & 1) == 1 &&
621 		    ((blk / radix) & BLIST_MASK) == 0)
622 			radix *= BLIST_RADIX;
623 		if (~scan->bm_bitmap != 0) {
624 			/*
625 			 * Either there is no next leaf with any free blocks,
626 			 * or we've reached the next leaf and found that some
627 			 * of its blocks are not free.  In the first case,
628 			 * bitpos() returns zero here.
629 			 */
630 			avail = blk - start + bitpos(~scan->bm_bitmap);
631 			if (avail < count || avail == 0) {
632 				/*
633 				 * There isn't a next leaf with enough free
634 				 * blocks at its beginning to bother
635 				 * allocating.
636 				 */
637 				return (avail);
638 			}
639 			maxcount = imin(avail, maxcount);
640 			if (maxcount % BLIST_RADIX == 0) {
641 				/*
642 				 * There was no next leaf.  Back scan up to
643 				 * last leaf.
644 				 */
645 				do {
646 					radix /= BLIST_RADIX;
647 					--scan;
648 				} while (radix != 1);
649 				blk -= BLIST_RADIX;
650 			}
651 		}
652 	}
653 
654 	/*
655 	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
656 	 * BLIST_RADIX bits to represent the allocation of those last blocks.
657 	 */
658 	if (maxcount % BLIST_RADIX != 0)
659 		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
660 	else
661 		scan->bm_bitmap = 0;
662 
663 	for (;;) {
664 		/* Back up over meta-nodes, clearing bits if necessary. */
665 		blk -= BLIST_RADIX;
666 		for (radix = BLIST_RADIX;
667 		    (digit = ((blk / radix) & BLIST_MASK)) == 0;
668 		    radix *= BLIST_RADIX) {
669 			if ((scan--)->bm_bitmap == 0)
670 				scan->bm_bitmap ^= 1;
671 		}
672 		if ((scan--)->bm_bitmap == 0)
673 			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
674 			    (u_daddr_t)1 << digit;
675 
676 		if (blk == start)
677 			break;
678 		/* Clear all the bits of this leaf. */
679 		scan->bm_bitmap = 0;
680 	}
681 	return (maxcount);
682 }
683 
684 /*
685  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
686  *
687  *	This function is the core of the allocator.  Its execution time is
688  *	proportional to log(count), plus height of the tree if the allocation
689  *	crosses a leaf boundary.
690  */
691 static daddr_t
692 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
693 {
694 	u_daddr_t mask;
695 	int bighint, count1, hi, lo, num_shifts;
696 
697 	count1 = *count - 1;
698 	num_shifts = fls(count1);
699 	mask = ~scan->bm_bitmap;
700 	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
701 		/*
702 		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
703 		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
704 		 * 0, while preserving this invariant.  The updates to mask
705 		 * leave fewer bits 0, but each bit that remains 0 represents a
706 		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
707 		 * more updates to mask cannot set more bits, because mask is
708 		 * partitioned with all 1 bits following all 0 bits, the loop
709 		 * terminates immediately.
710 		 */
711 		num_shifts--;
712 		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
713 	}
714 	bighint = count1 >> num_shifts;
715 	if (~mask == 0) {
716 		/*
717 		 * Update bighint.  There is no allocation bigger than
718 		 * count1 >> num_shifts starting in this leaf.
719 		 */
720 		scan->bm_bighint = bighint;
721 		return (SWAPBLK_NONE);
722 	}
723 
724 	/* Discard any candidates that appear before blk. */
725 	if ((blk & BLIST_MASK) != 0) {
726 		if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
727 			/* Grow bighint in case all discarded bits are set. */
728 			bighint += blk & BLIST_MASK;
729 			mask |= bitrange(0, blk & BLIST_MASK);
730 			if (~mask == 0) {
731 				scan->bm_bighint = bighint;
732 				return (SWAPBLK_NONE);
733 			}
734 		}
735 		blk -= blk & BLIST_MASK;
736 	}
737 
738 	/*
739 	 * The least significant set bit in mask marks the start of the first
740 	 * available range of sufficient size.  Find its position.
741 	 */
742 	lo = bitpos(~mask);
743 
744 	/*
745 	 * Find how much space is available starting at that position.
746 	 */
747 	if ((mask & (mask + 1)) != 0) {
748 		/* Count the 1 bits starting at position lo. */
749 		hi = bitpos(mask & (mask + 1)) + count1;
750 		if (maxcount < hi - lo)
751 			hi = lo + maxcount;
752 		*count = hi - lo;
753 		mask = ~bitrange(lo, *count);
754 	} else if (maxcount <= BLIST_RADIX - lo) {
755 		/* All the blocks we can use are available here. */
756 		hi = lo + maxcount;
757 		*count = maxcount;
758 		mask = ~bitrange(lo, *count);
759 		if (hi == BLIST_RADIX)
760 			scan->bm_bighint = bighint;
761 	} else {
762 		/* Check next leaf for some of the blocks we want or need. */
763 		count1 = *count - (BLIST_RADIX - lo);
764 		maxcount -= BLIST_RADIX - lo;
765 		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
766 		if (hi < count1)
767 			/*
768 			 * The next leaf cannot supply enough blocks to reach
769 			 * the minimum required allocation.  The hint cannot be
770 			 * updated, because the same allocation request could
771 			 * be satisfied later, by this leaf, if the state of
772 			 * the next leaf changes, and without any changes to
773 			 * this leaf.
774 			 */
775 			return (SWAPBLK_NONE);
776 		*count = BLIST_RADIX - lo + hi;
777 		scan->bm_bighint = bighint;
778 	}
779 
780 	/* Clear the allocated bits from this leaf. */
781 	scan->bm_bitmap &= mask;
782 	return (blk + lo);
783 }
784 
785 /*
786  * blist_meta_alloc() -	allocate at a meta in the radix tree.
787  *
788  *	Attempt to allocate at a meta node.  If we can't, we update
789  *	bighint and return a failure.  Updating bighint optimize future
790  *	calls that hit this node.  We have to check for our collapse cases
791  *	and we have a few optimizations strewn in as well.
792  */
793 static daddr_t
794 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
795     int maxcount, u_daddr_t radix)
796 {
797 	daddr_t blk, i, r, skip;
798 	u_daddr_t mask;
799 	bool scan_from_start;
800 	int digit;
801 
802 	if (radix == 1)
803 		return (blst_leaf_alloc(scan, cursor, count, maxcount));
804 	blk = cursor & -(radix * BLIST_RADIX);
805 	scan_from_start = (cursor == blk);
806 	skip = radix_to_skip(radix);
807 	mask = scan->bm_bitmap;
808 
809 	/* Discard any candidates that appear before cursor. */
810 	digit = (cursor / radix) & BLIST_MASK;
811 	mask &= (u_daddr_t)-1 << digit;
812 	if (mask == 0)
813 		return (SWAPBLK_NONE);
814 
815 	/*
816 	 * If the first try is for a block that includes the cursor, pre-undo
817 	 * the digit * radix offset in the first call; otherwise, ignore the
818 	 * cursor entirely.
819 	 */
820 	if (((mask >> digit) & 1) == 1)
821 		cursor -= digit * radix;
822 	else
823 		cursor = blk;
824 
825 	/*
826 	 * Examine the nonempty subtree associated with each bit set in mask.
827 	 */
828 	do {
829 		digit = bitpos(mask);
830 		i = 1 + digit * skip;
831 		if (*count <= scan[i].bm_bighint) {
832 			/*
833 			 * The allocation might fit beginning in the i'th subtree.
834 			 */
835 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
836 			    count, maxcount, radix / BLIST_RADIX);
837 			if (r != SWAPBLK_NONE) {
838 				if (scan[i].bm_bitmap == 0)
839 					scan->bm_bitmap ^= bitrange(digit, 1);
840 				return (r);
841 			}
842 		}
843 		cursor = blk;
844 	} while ((mask ^= bitrange(digit, 1)) != 0);
845 
846 	/*
847 	 * We couldn't allocate count in this subtree.  If the whole tree was
848 	 * scanned, and the last tree node is allocated, update bighint.
849 	 */
850 	if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
851 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
852 		scan->bm_bighint = *count - 1;
853 
854 	return (SWAPBLK_NONE);
855 }
856 
857 /*
858  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
859  *
860  */
861 static void
862 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
863 {
864 	u_daddr_t mask;
865 
866 	/*
867 	 * free some data in this bitmap
868 	 * mask=0000111111111110000
869 	 *          \_________/\__/
870 	 *		count   n
871 	 */
872 	mask = bitrange(blk & BLIST_MASK, count);
873 	KASSERT((scan->bm_bitmap & mask) == 0,
874 	    ("freeing free block: %jx, size %d, mask %jx",
875 	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
876 	scan->bm_bitmap |= mask;
877 }
878 
879 /*
880  * BLST_META_FREE() - free allocated blocks from radix tree meta info
881  *
882  *	This support routine frees a range of blocks from the bitmap.
883  *	The range must be entirely enclosed by this radix node.  If a
884  *	meta node, we break the range down recursively to free blocks
885  *	in subnodes (which means that this code can free an arbitrary
886  *	range whereas the allocation code cannot allocate an arbitrary
887  *	range).
888  */
889 static void
890 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
891 {
892 	daddr_t blk, endBlk, i, skip;
893 	int digit, endDigit;
894 
895 	/*
896 	 * We could probably do a better job here.  We are required to make
897 	 * bighint at least as large as the biggest allocable block of data.
898 	 * If we just shoehorn it, a little extra overhead will be incurred
899 	 * on the next allocation (but only that one typically).
900 	 */
901 	scan->bm_bighint = BLIST_MAX_ALLOC;
902 
903 	if (radix == 1)
904 		return (blst_leaf_free(scan, freeBlk, count));
905 
906 	endBlk = freeBlk + count;
907 	blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
908 	/*
909 	 * blk is first block past the end of the range of this meta node,
910 	 * or 0 in case of overflow.
911 	 */
912 	if (blk != 0)
913 		endBlk = ummin(endBlk, blk);
914 	skip = radix_to_skip(radix);
915 	blk = freeBlk & -radix;
916 	digit = (blk / radix) & BLIST_MASK;
917 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
918 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
919 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
920 		blk += radix;
921 		count = ummin(blk, endBlk) - freeBlk;
922 		blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
923 		freeBlk = blk;
924 	}
925 }
926 
927 /*
928  * BLST_COPY() - copy one radix tree to another
929  *
930  *	Locates free space in the source tree and frees it in the destination
931  *	tree.  The space may not already be free in the destination.
932  */
933 static void
934 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
935     daddr_t count)
936 {
937 	daddr_t endBlk, i, skip;
938 
939 	/*
940 	 * Leaf node
941 	 */
942 
943 	if (radix == 1) {
944 		u_daddr_t v = scan->bm_bitmap;
945 
946 		if (v == (u_daddr_t)-1) {
947 			blist_free(dest, blk, count);
948 		} else if (v != 0) {
949 			int i;
950 
951 			for (i = 0; i < count; ++i) {
952 				if (v & ((u_daddr_t)1 << i))
953 					blist_free(dest, blk + i, 1);
954 			}
955 		}
956 		return;
957 	}
958 
959 	/*
960 	 * Meta node
961 	 */
962 
963 	if (scan->bm_bitmap == 0) {
964 		/*
965 		 * Source all allocated, leave dest allocated
966 		 */
967 		return;
968 	}
969 
970 	endBlk = blk + count;
971 	skip = radix_to_skip(radix);
972 	for (i = 1; blk < endBlk; i += skip) {
973 		blk += radix;
974 		count = radix;
975 		if (blk >= endBlk)
976 			count -= blk - endBlk;
977 		blst_copy(&scan[i], blk - radix,
978 		    radix / BLIST_RADIX, dest, count);
979 	}
980 }
981 
982 /*
983  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
984  *
985  *	This routine allocates all blocks in the specified range
986  *	regardless of any existing allocations in that range.  Returns
987  *	the number of blocks allocated by the call.
988  */
989 static daddr_t
990 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
991 {
992 	daddr_t nblks;
993 	u_daddr_t mask;
994 
995 	mask = bitrange(blk & BLIST_MASK, count);
996 
997 	/* Count the number of blocks that we are allocating. */
998 	nblks = bitcount64(scan->bm_bitmap & mask);
999 
1000 	scan->bm_bitmap &= ~mask;
1001 	return (nblks);
1002 }
1003 
1004 /*
1005  * BLIST_META_FILL() -	allocate specific blocks at a meta node
1006  *
1007  *	This routine allocates the specified range of blocks,
1008  *	regardless of any existing allocations in the range.  The
1009  *	range must be within the extent of this node.  Returns the
1010  *	number of blocks allocated by the call.
1011  */
1012 static daddr_t
1013 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1014 {
1015 	daddr_t blk, endBlk, i, nblks, skip;
1016 	int digit;
1017 
1018 	if (radix == 1)
1019 		return (blst_leaf_fill(scan, allocBlk, count));
1020 
1021 	endBlk = allocBlk + count;
1022 	blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
1023 	/*
1024 	 * blk is first block past the end of the range of this meta node,
1025 	 * or 0 in case of overflow.
1026 	 */
1027 	if (blk != 0)
1028 		endBlk = ummin(endBlk, blk);
1029 	skip = radix_to_skip(radix);
1030 	blk = allocBlk & -radix;
1031 	nblks = 0;
1032 	while (blk < endBlk) {
1033 		digit = (blk / radix) & BLIST_MASK;
1034 		i = 1 + digit * skip;
1035 		blk += radix;
1036 		count = ummin(blk, endBlk) - allocBlk;
1037 		nblks += blst_meta_fill(&scan[i], allocBlk, count,
1038 		    radix / BLIST_RADIX);
1039 		if (scan[i].bm_bitmap == 0)
1040 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1041 		allocBlk = blk;
1042 	}
1043 	return (nblks);
1044 }
1045 
1046 #ifdef BLIST_DEBUG
1047 
1048 static void
1049 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1050 {
1051 	daddr_t skip;
1052 	u_daddr_t mask;
1053 	int digit;
1054 
1055 	if (radix == 1) {
1056 		printf(
1057 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1058 		    tab, tab, "",
1059 		    (long long)blk, (long long)BLIST_RADIX,
1060 		    (int)(1 + (BLIST_RADIX - 1) / 4),
1061 		    (long long)scan->bm_bitmap,
1062 		    (long long)scan->bm_bighint
1063 		);
1064 		return;
1065 	}
1066 
1067 	printf(
1068 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1069 	    tab, tab, "",
1070 	    (long long)blk, (long long)radix * BLIST_RADIX,
1071 	    (long long)radix * BLIST_RADIX,
1072 	    (int)(1 + (BLIST_RADIX - 1) / 4),
1073 	    (long long)scan->bm_bitmap,
1074 	    (long long)scan->bm_bighint
1075 	);
1076 
1077 	skip = radix_to_skip(radix);
1078 	tab += 4;
1079 
1080 	mask = scan->bm_bitmap;
1081 	/* Examine the nonempty subtree associated with each bit set in mask */
1082 	do {
1083 		digit = bitpos(mask);
1084 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1085 		    radix / BLIST_RADIX, tab);
1086 	} while ((mask ^= bitrange(digit, 1)) != 0);
1087 	tab -= 4;
1088 
1089 	printf(
1090 	    "%*.*s}\n",
1091 	    tab, tab, ""
1092 	);
1093 }
1094 
1095 #endif
1096 
1097 #ifdef BLIST_DEBUG
1098 
1099 int
1100 main(int ac, char **av)
1101 {
1102 	daddr_t size = BLIST_RADIX * BLIST_RADIX;
1103 	int i;
1104 	blist_t bl;
1105 	struct sbuf *s;
1106 
1107 	for (i = 1; i < ac; ++i) {
1108 		const char *ptr = av[i];
1109 		if (*ptr != '-') {
1110 			size = strtoll(ptr, NULL, 0);
1111 			continue;
1112 		}
1113 		ptr += 2;
1114 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1115 		exit(1);
1116 	}
1117 	bl = blist_create(size, M_WAITOK);
1118 	if (bl == NULL) {
1119 		fprintf(stderr, "blist_create failed\n");
1120 		exit(1);
1121 	}
1122 	blist_free(bl, 0, size);
1123 
1124 	for (;;) {
1125 		char buf[1024];
1126 		long long da = 0;
1127 		int count = 0, maxcount = 0;
1128 
1129 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1130 		    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
1131 		fflush(stdout);
1132 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1133 			break;
1134 		switch(buf[0]) {
1135 		case 'r':
1136 			if (sscanf(buf + 1, "%d", &count) == 1) {
1137 				blist_resize(&bl, count, 1, M_WAITOK);
1138 			} else {
1139 				printf("?\n");
1140 			}
1141 		case 'p':
1142 			blist_print(bl);
1143 			break;
1144 		case 's':
1145 			s = sbuf_new_auto();
1146 			blist_stats(bl, s);
1147 			sbuf_finish(s);
1148 			printf("%s", sbuf_data(s));
1149 			sbuf_delete(s);
1150 			break;
1151 		case 'a':
1152 			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1153 				daddr_t blk = blist_alloc(bl, &count, maxcount);
1154 				printf("    R=%08llx, c=%08d\n",
1155 				    (long long)blk, count);
1156 			} else {
1157 				printf("?\n");
1158 			}
1159 			break;
1160 		case 'f':
1161 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1162 				blist_free(bl, da, count);
1163 			} else {
1164 				printf("?\n");
1165 			}
1166 			break;
1167 		case 'l':
1168 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1169 				printf("    n=%jd\n",
1170 				    (intmax_t)blist_fill(bl, da, count));
1171 			} else {
1172 				printf("?\n");
1173 			}
1174 			break;
1175 		case '?':
1176 		case 'h':
1177 			puts(
1178 			    "p          -print\n"
1179 			    "s          -stats\n"
1180 			    "a %d %d    -allocate\n"
1181 			    "f %x %d    -free\n"
1182 			    "l %x %d    -fill\n"
1183 			    "r %d       -resize\n"
1184 			    "h/?        -help\n"
1185 			    "q          -quit"
1186 			);
1187 			break;
1188 		case 'q':
1189 			break;
1190 		default:
1191 			printf("?\n");
1192 			break;
1193 		}
1194 		if (buf[0] == 'q')
1195 			break;
1196 	}
1197 	return (0);
1198 }
1199 
1200 #endif
1201