xref: /freebsd/sys/kern/subr_blist.c (revision 28f4385e45a2681c14bd04b83fe1796eaefe8265)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  Two radix
40  *	constants are involved: one for the size of the bitmaps contained in the
41  *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
42  *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
43  *	associated with a range of blocks.  The root of any subtree stores a
44  *	hint field that defines an upper bound on the size of the largest
45  *	allocation that can begin in the associated block range.  A hint is an
46  *	upper bound on a potential allocation, but not necessarily a tight upper
47  *	bound.
48  *
49  *	The bitmap field in each node directs the search for available blocks.
50  *	For a leaf node, a bit is set if the corresponding block is free.  For a
51  *	meta node, a bit is set if the corresponding subtree contains a free
52  *	block somewhere within it.  The search at a meta node considers only
53  *	children of that node that represent a range that includes a free block.
54  *
55  * 	The hinting greatly increases code efficiency for allocations while
56  *	the general radix structure optimizes both allocations and frees.  The
57  *	radix tree should be able to operate well no matter how much
58  *	fragmentation there is and no matter how large a bitmap is used.
59  *
60  *	The blist code wires all necessary memory at creation time.  Neither
61  *	allocations nor frees require interaction with the memory subsystem.
62  *	The non-blocking nature of allocations and frees is required by swap
63  *	code (vm/swap_pager.c).
64  *
65  *	LAYOUT: The radix tree is laid out recursively using a linear array.
66  *	Each meta node is immediately followed (laid out sequentially in
67  *	memory) by BLIST_META_RADIX lower level nodes.  This is a recursive
68  *	structure but one that can be easily scanned through a very simple
69  *	'skip' calculation.  The memory allocation is only large enough to
70  *	cover the number of blocks requested at creation time.  Nodes that
71  *	represent blocks beyond that limit, nodes that would never be read
72  *	or written, are not allocated, so that the last of the
73  *	BLIST_META_RADIX lower level nodes of a some nodes may not be
74  *	allocated.
75  *
76  *	NOTE: the allocator cannot currently allocate more than
77  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
78  *	large' if you try.  This is an area that could use improvement.  The
79  *	radix is large enough that this restriction does not effect the swap
80  *	system, though.  Currently only the allocation code is affected by
81  *	this algorithmic unfeature.  The freeing code can handle arbitrary
82  *	ranges.
83  *
84  *	This code can be compiled stand-alone for debugging.
85  */
86 
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD$");
89 
90 #ifdef _KERNEL
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101 
102 #else
103 
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107 
108 #include <sys/errno.h>
109 #include <sys/types.h>
110 #include <sys/malloc.h>
111 #include <sys/sbuf.h>
112 #include <stdio.h>
113 #include <string.h>
114 #include <stddef.h>
115 #include <stdlib.h>
116 #include <stdarg.h>
117 #include <stdbool.h>
118 
119 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
120 #define malloc(a,b,c)	calloc(a, 1)
121 #define free(a,b)	free(a)
122 #define ummin(a,b)	((a) < (b) ? (a) : (b))
123 
124 #include <sys/blist.h>
125 
126 void panic(const char *ctl, ...);
127 
128 #endif
129 
130 /*
131  * static support functions
132  */
133 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
134 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
135 		    u_daddr_t radix);
136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
138 		    u_daddr_t radix);
139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
140 		    blist_t dest, daddr_t count);
141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
143 		    u_daddr_t radix);
144 #ifndef _KERNEL
145 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
146 		    int tab);
147 #endif
148 
149 #ifdef _KERNEL
150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
151 #endif
152 
153 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
154     "radix divisibility error");
155 #define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
156 #define	BLIST_META_MASK	(BLIST_META_RADIX - 1)
157 
158 /*
159  * For a subtree that can represent the state of up to 'radix' blocks, the
160  * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
161  * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
162  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
163  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
164  * in the 'meta' functions that process subtrees.  Since integer division
165  * discards remainders, we can express this computation as
166  * skip = (m * m**h) / (m - 1)
167  * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
168  * and since m divides BLIST_BMAP_RADIX, we can simplify further to
169  * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
170  * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
171  * so that simple integer division by a constant can safely be used for the
172  * calculation.
173  */
174 static inline daddr_t
175 radix_to_skip(daddr_t radix)
176 {
177 
178 	return (radix /
179 	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
180 }
181 
182 /*
183  * Provide a mask with count bits set, starting as position n.
184  */
185 static inline u_daddr_t
186 bitrange(int n, int count)
187 {
188 
189 	return (((u_daddr_t)-1 << n) &
190 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
191 }
192 
193 
194 /*
195  * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
196  * Assumes that the argument has only one bit set.
197  */
198 static inline int
199 bitpos(u_daddr_t mask)
200 {
201 	int hi, lo, mid;
202 
203 	switch (sizeof(mask)) {
204 #ifdef HAVE_INLINE_FFSLL
205 	case sizeof(long long):
206 		return (ffsll(mask) - 1);
207 #endif
208 	default:
209 		lo = 0;
210 		hi = BLIST_BMAP_RADIX;
211 		while (lo + 1 < hi) {
212 			mid = (lo + hi) >> 1;
213 			if ((mask >> mid) != 0)
214 				lo = mid;
215 			else
216 				hi = mid;
217 		}
218 		return (lo);
219 	}
220 }
221 
222 /*
223  * blist_create() - create a blist capable of handling up to the specified
224  *		    number of blocks
225  *
226  *	blocks - must be greater than 0
227  * 	flags  - malloc flags
228  *
229  *	The smallest blist consists of a single leaf node capable of
230  *	managing BLIST_BMAP_RADIX blocks.
231  */
232 blist_t
233 blist_create(daddr_t blocks, int flags)
234 {
235 	blist_t bl;
236 	u_daddr_t nodes, radix;
237 
238 	if (blocks == 0)
239 		panic("invalid block count");
240 
241 	/*
242 	 * Calculate the radix and node count used for scanning.
243 	 */
244 	nodes = 1;
245 	radix = BLIST_BMAP_RADIX;
246 	while (radix <= blocks) {
247 		nodes += 1 + (blocks - 1) / radix;
248 		radix *= BLIST_META_RADIX;
249 	}
250 
251 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
252 	    M_ZERO);
253 	if (bl == NULL)
254 		return (NULL);
255 
256 	bl->bl_blocks = blocks;
257 	bl->bl_radix = radix;
258 
259 #if defined(BLIST_DEBUG)
260 	printf(
261 		"BLIST representing %lld blocks (%lld MB of swap)"
262 		", requiring %lldK of ram\n",
263 		(long long)bl->bl_blocks,
264 		(long long)bl->bl_blocks * 4 / 1024,
265 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
266 	);
267 	printf("BLIST raw radix tree contains %lld records\n",
268 	    (long long)nodes);
269 #endif
270 
271 	return (bl);
272 }
273 
274 void
275 blist_destroy(blist_t bl)
276 {
277 
278 	free(bl, M_SWAP);
279 }
280 
281 /*
282  * blist_alloc() -   reserve space in the block bitmap.  Return the base
283  *		     of a contiguous region or SWAPBLK_NONE if space could
284  *		     not be allocated.
285  */
286 daddr_t
287 blist_alloc(blist_t bl, daddr_t count)
288 {
289 	daddr_t blk;
290 
291 	if (count > BLIST_MAX_ALLOC)
292 		panic("allocation too large");
293 
294 	/*
295 	 * This loop iterates at most twice.  An allocation failure in the
296 	 * first iteration leads to a second iteration only if the cursor was
297 	 * non-zero.  When the cursor is zero, an allocation failure will
298 	 * stop further iterations.
299 	 */
300 	for (;;) {
301 		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
302 		    bl->bl_radix);
303 		if (blk != SWAPBLK_NONE) {
304 			bl->bl_avail -= count;
305 			bl->bl_cursor = blk + count;
306 			if (bl->bl_cursor == bl->bl_blocks)
307 				bl->bl_cursor = 0;
308 			return (blk);
309 		} else if (bl->bl_cursor == 0)
310 			return (SWAPBLK_NONE);
311 		bl->bl_cursor = 0;
312 	}
313 }
314 
315 /*
316  * blist_avail() -	return the number of free blocks.
317  */
318 daddr_t
319 blist_avail(blist_t bl)
320 {
321 
322 	return (bl->bl_avail);
323 }
324 
325 /*
326  * blist_free() -	free up space in the block bitmap.  Return the base
327  *		     	of a contiguous region.  Panic if an inconsistancy is
328  *			found.
329  */
330 void
331 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
332 {
333 
334 	if (blkno < 0 || blkno + count > bl->bl_blocks)
335 		panic("freeing invalid range");
336 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
337 	bl->bl_avail += count;
338 }
339 
340 /*
341  * blist_fill() -	mark a region in the block bitmap as off-limits
342  *			to the allocator (i.e. allocate it), ignoring any
343  *			existing allocations.  Return the number of blocks
344  *			actually filled that were free before the call.
345  */
346 daddr_t
347 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
348 {
349 	daddr_t filled;
350 
351 	if (blkno < 0 || blkno + count > bl->bl_blocks)
352 		panic("filling invalid range");
353 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
354 	bl->bl_avail -= filled;
355 	return (filled);
356 }
357 
358 /*
359  * blist_resize() -	resize an existing radix tree to handle the
360  *			specified number of blocks.  This will reallocate
361  *			the tree and transfer the previous bitmap to the new
362  *			one.  When extending the tree you can specify whether
363  *			the new blocks are to left allocated or freed.
364  */
365 void
366 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
367 {
368     blist_t newbl = blist_create(count, flags);
369     blist_t save = *pbl;
370 
371     *pbl = newbl;
372     if (count > save->bl_blocks)
373 	    count = save->bl_blocks;
374     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
375 
376     /*
377      * If resizing upwards, should we free the new space or not?
378      */
379     if (freenew && count < newbl->bl_blocks) {
380 	    blist_free(newbl, count, newbl->bl_blocks - count);
381     }
382     blist_destroy(save);
383 }
384 
385 #ifdef BLIST_DEBUG
386 
387 /*
388  * blist_print()    - dump radix tree
389  */
390 void
391 blist_print(blist_t bl)
392 {
393 	printf("BLIST avail = %jd, cursor = %08jx {\n",
394 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
395 
396 	if (bl->bl_root->bm_bitmap != 0)
397 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
398 	printf("}\n");
399 }
400 
401 #endif
402 
403 static const u_daddr_t fib[] = {
404 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
405 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
406 	514229, 832040, 1346269, 2178309, 3524578,
407 };
408 
409 /*
410  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
411  */
412 struct gap_stats {
413 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
414 	daddr_t	num;		/* number of gaps observed */
415 	daddr_t	max;		/* largest gap size */
416 	daddr_t	avg;		/* average gap size */
417 	daddr_t	err;		/* sum - num * avg */
418 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
419 	int	max_bucket;	/* last histo elt with nonzero val */
420 };
421 
422 /*
423  * gap_stats_counting()    - is the state 'counting 1 bits'?
424  *                           or 'skipping 0 bits'?
425  */
426 static inline bool
427 gap_stats_counting(const struct gap_stats *stats)
428 {
429 
430 	return (stats->start != SWAPBLK_NONE);
431 }
432 
433 /*
434  * init_gap_stats()    - initialize stats on gap sizes
435  */
436 static inline void
437 init_gap_stats(struct gap_stats *stats)
438 {
439 
440 	bzero(stats, sizeof(*stats));
441 	stats->start = SWAPBLK_NONE;
442 }
443 
444 /*
445  * update_gap_stats()    - update stats on gap sizes
446  */
447 static void
448 update_gap_stats(struct gap_stats *stats, daddr_t posn)
449 {
450 	daddr_t size;
451 	int hi, lo, mid;
452 
453 	if (!gap_stats_counting(stats)) {
454 		stats->start = posn;
455 		return;
456 	}
457 	size = posn - stats->start;
458 	stats->start = SWAPBLK_NONE;
459 	if (size > stats->max)
460 		stats->max = size;
461 
462 	/*
463 	 * Find the fibonacci range that contains size,
464 	 * expecting to find it in an early range.
465 	 */
466 	lo = 0;
467 	hi = 1;
468 	while (hi < nitems(fib) && fib[hi] <= size) {
469 		lo = hi;
470 		hi *= 2;
471 	}
472 	if (hi >= nitems(fib))
473 		hi = nitems(fib);
474 	while (lo + 1 != hi) {
475 		mid = (lo + hi) >> 1;
476 		if (fib[mid] <= size)
477 			lo = mid;
478 		else
479 			hi = mid;
480 	}
481 	stats->histo[lo]++;
482 	if (lo > stats->max_bucket)
483 		stats->max_bucket = lo;
484 	stats->err += size - stats->avg;
485 	stats->num++;
486 	stats->avg += stats->err / stats->num;
487 	stats->err %= stats->num;
488 }
489 
490 /*
491  * dump_gap_stats()    - print stats on gap sizes
492  */
493 static inline void
494 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
495 {
496 	int i;
497 
498 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
499 	    (intmax_t)stats->num);
500 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
501 	sbuf_printf(s, "average maximal free range size: %jd\n",
502 	    (intmax_t)stats->avg);
503 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
504 	sbuf_printf(s, "               count  |  size range\n");
505 	sbuf_printf(s, "               -----  |  ----------\n");
506 	for (i = 0; i < stats->max_bucket; i++) {
507 		if (stats->histo[i] != 0) {
508 			sbuf_printf(s, "%20jd  |  ",
509 			    (intmax_t)stats->histo[i]);
510 			if (fib[i] != fib[i + 1] - 1)
511 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
512 				    (intmax_t)fib[i + 1] - 1);
513 			else
514 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
515 		}
516 	}
517 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
518 	if (stats->histo[i] > 1)
519 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
520 		    (intmax_t)stats->max);
521 	else
522 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
523 }
524 
525 /*
526  * blist_stats()    - dump radix tree stats
527  */
528 void
529 blist_stats(blist_t bl, struct sbuf *s)
530 {
531 	struct gap_stats gstats;
532 	struct gap_stats *stats = &gstats;
533 	daddr_t i, nodes, radix;
534 	u_daddr_t bit, diff, mask;
535 
536 	init_gap_stats(stats);
537 	nodes = 0;
538 	i = bl->bl_radix;
539 	while (i < bl->bl_radix + bl->bl_blocks) {
540 		/*
541 		 * Find max size subtree starting at i.
542 		 */
543 		radix = BLIST_BMAP_RADIX;
544 		while (((i / radix) & BLIST_META_MASK) == 0)
545 			radix *= BLIST_META_RADIX;
546 
547 		/*
548 		 * Check for skippable subtrees starting at i.
549 		 */
550 		while (radix > BLIST_BMAP_RADIX) {
551 			if (bl->bl_root[nodes].bm_bitmap == 0) {
552 				if (gap_stats_counting(stats))
553 					update_gap_stats(stats, i);
554 				break;
555 			}
556 
557 			/*
558 			 * Skip subtree root.
559 			 */
560 			nodes++;
561 			radix /= BLIST_META_RADIX;
562 		}
563 		if (radix == BLIST_BMAP_RADIX) {
564 			/*
565 			 * Scan leaf.
566 			 */
567 			mask = bl->bl_root[nodes].bm_bitmap;
568 			diff = mask ^ (mask << 1);
569 			if (gap_stats_counting(stats))
570 				diff ^= 1;
571 			while (diff != 0) {
572 				bit = diff & -diff;
573 				update_gap_stats(stats, i + bitpos(bit));
574 				diff ^= bit;
575 			}
576 		}
577 		nodes += radix_to_skip(radix);
578 		i += radix;
579 	}
580 	update_gap_stats(stats, i);
581 	dump_gap_stats(stats, s);
582 }
583 
584 /************************************************************************
585  *			  ALLOCATION SUPPORT FUNCTIONS			*
586  ************************************************************************
587  *
588  *	These support functions do all the actual work.  They may seem
589  *	rather longish, but that's because I've commented them up.  The
590  *	actual code is straight forward.
591  *
592  */
593 
594 /*
595  * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
596  *
597  *	'scan' is a leaf node, associated with a block containing 'blk'.
598  *	The next leaf node could be adjacent, or several nodes away if the
599  *	least common ancestor of 'scan' and its neighbor is several levels
600  *	up.  Use 'blk' to determine how many meta-nodes lie between the
601  *	leaves.  If the next leaf has enough initial bits set, clear them
602  *	and clear the bits in the meta nodes on the path up to the least
603  *	common ancestor to mark any subtrees made completely empty.
604  */
605 static int
606 blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
607 {
608 	blmeta_t *next;
609 	daddr_t skip;
610 	u_daddr_t radix;
611 	int digit;
612 
613 	next = scan + 1;
614 	blk += BLIST_BMAP_RADIX;
615 	radix = BLIST_BMAP_RADIX;
616 	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
617 	    (next->bm_bitmap & 1) == 1) {
618 		next++;
619 		radix *= BLIST_META_RADIX;
620 	}
621 	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
622 		/*
623 		 * The next leaf doesn't have enough free blocks at the
624 		 * beginning to complete the spanning allocation.
625 		 */
626 		return (ENOMEM);
627 	}
628 	/* Clear the first 'count' bits in the next leaf to allocate. */
629 	next->bm_bitmap &= (u_daddr_t)-1 << count;
630 
631 	/*
632 	 * Update bitmaps of next-ancestors, up to least common ancestor.
633 	 */
634 	skip = radix_to_skip(radix);
635 	while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) {
636 		(--next)->bm_bitmap ^= 1;
637 		radix /= BLIST_META_RADIX;
638 	}
639 	if (next->bm_bitmap == 0)
640 		scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit;
641 	return (0);
642 }
643 
644 /*
645  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
646  *
647  *	This function is the core of the allocator.  Its execution time is
648  *	proportional to log(count), plus height of the tree if the allocation
649  *	crosses a leaf boundary.
650  */
651 static daddr_t
652 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
653 {
654 	u_daddr_t cursor_mask, mask;
655 	int count1, hi, lo, num_shifts, range1, range_ext;
656 
657 	range1 = 0;
658 	count1 = count - 1;
659 	num_shifts = fls(count1);
660 	mask = scan->bm_bitmap;
661 	while ((-mask & ~mask) != 0 && num_shifts > 0) {
662 		/*
663 		 * If bit i is set in mask, then bits in [i, i+range1] are set
664 		 * in scan->bm_bitmap.  The value of range1 is equal to count1
665 		 * >> num_shifts.  Grow range1 and reduce num_shifts to 0,
666 		 * while preserving these invariants.  The updates to mask
667 		 * leave fewer bits set, but each bit that remains set
668 		 * represents a longer string of consecutive bits set in
669 		 * scan->bm_bitmap.  If more updates to mask cannot clear more
670 		 * bits, because mask is partitioned with all 0 bits preceding
671 		 * all 1 bits, the loop terminates immediately.
672 		 */
673 		num_shifts--;
674 		range_ext = range1 + ((count1 >> num_shifts) & 1);
675 		/*
676 		 * mask is a signed quantity for the shift because when it is
677 		 * shifted right, the sign bit should copied; when the last
678 		 * block of the leaf is free, pretend, for a while, that all the
679 		 * blocks that follow it are also free.
680 		 */
681 		mask &= (daddr_t)mask >> range_ext;
682 		range1 += range_ext;
683 	}
684 	if (mask == 0) {
685 		/*
686 		 * Update bighint.  There is no allocation bigger than range1
687 		 * starting in this leaf.
688 		 */
689 		scan->bm_bighint = range1;
690 		return (SWAPBLK_NONE);
691 	}
692 
693 	/* Discard any candidates that appear before blk. */
694 	if ((blk & BLIST_BMAP_MASK) != 0) {
695 		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
696 		if (cursor_mask != 0) {
697 			mask ^= cursor_mask;
698 			if (mask == 0)
699 				return (SWAPBLK_NONE);
700 
701 			/*
702 			 * Bighint change for last block allocation cannot
703 			 * assume that any other blocks are allocated, so the
704 			 * bighint cannot be reduced much.
705 			 */
706 			range1 = BLIST_MAX_ALLOC - 1;
707 		}
708 		blk &= ~BLIST_BMAP_MASK;
709 	}
710 
711 	/*
712 	 * The least significant set bit in mask marks the start of the first
713 	 * available range of sufficient size.  Clear all the bits but that one,
714 	 * and then find its position.
715 	 */
716 	mask &= -mask;
717 	lo = bitpos(mask);
718 
719 	hi = lo + count;
720 	if (hi > BLIST_BMAP_RADIX) {
721 		/*
722 		 * An allocation within this leaf is impossible, so a successful
723 		 * allocation depends on the next leaf providing some of the blocks.
724 		 */
725 		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
726 			/*
727 			 * The hint cannot be updated, because the same
728 			 * allocation request could be satisfied later, by this
729 			 * leaf, if the state of the next leaf changes, and
730 			 * without any changes to this leaf.
731 			 */
732 			return (SWAPBLK_NONE);
733 		hi = BLIST_BMAP_RADIX;
734 	}
735 
736 	/* Set the bits of mask at position 'lo' and higher. */
737 	mask = -mask;
738 	if (hi == BLIST_BMAP_RADIX) {
739 		/*
740 		 * Update bighint.  There is no allocation bigger than range1
741 		 * available in this leaf after this allocation completes.
742 		 */
743 		scan->bm_bighint = range1;
744 	} else {
745 		/* Clear the bits of mask at position 'hi' and higher. */
746 		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
747 	}
748 	/* Clear the allocated bits from this leaf. */
749 	scan->bm_bitmap &= ~mask;
750 	return (blk + lo);
751 }
752 
753 /*
754  * blist_meta_alloc() -	allocate at a meta in the radix tree.
755  *
756  *	Attempt to allocate at a meta node.  If we can't, we update
757  *	bighint and return a failure.  Updating bighint optimize future
758  *	calls that hit this node.  We have to check for our collapse cases
759  *	and we have a few optimizations strewn in as well.
760  */
761 static daddr_t
762 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
763 {
764 	daddr_t blk, i, r, skip;
765 	u_daddr_t bit, mask;
766 	bool scan_from_start;
767 	int digit;
768 
769 	if (radix == BLIST_BMAP_RADIX)
770 		return (blst_leaf_alloc(scan, cursor, count));
771 	blk = cursor & -radix;
772 	scan_from_start = (cursor == blk);
773 	radix /= BLIST_META_RADIX;
774 	skip = radix_to_skip(radix);
775 	mask = scan->bm_bitmap;
776 
777 	/* Discard any candidates that appear before cursor. */
778 	digit = (cursor / radix) & BLIST_META_MASK;
779 	mask &= (u_daddr_t)-1 << digit;
780 	if (mask == 0)
781 		return (SWAPBLK_NONE);
782 
783 	/*
784 	 * If the first try is for a block that includes the cursor, pre-undo
785 	 * the digit * radix offset in the first call; otherwise, ignore the
786 	 * cursor entirely.
787 	 */
788 	if (((mask >> digit) & 1) == 1)
789 		cursor -= digit * radix;
790 	else
791 		cursor = blk;
792 
793 	/*
794 	 * Examine the nonempty subtree associated with each bit set in mask.
795 	 */
796 	do {
797 		bit = mask & -mask;
798 		digit = bitpos(bit);
799 		i = 1 + digit * skip;
800 		if (count <= scan[i].bm_bighint) {
801 			/*
802 			 * The allocation might fit beginning in the i'th subtree.
803 			 */
804 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
805 			    count, radix);
806 			if (r != SWAPBLK_NONE) {
807 				if (scan[i].bm_bitmap == 0)
808 					scan->bm_bitmap ^= bit;
809 				return (r);
810 			}
811 		}
812 		cursor = blk;
813 	} while ((mask ^= bit) != 0);
814 
815 	/*
816 	 * We couldn't allocate count in this subtree.  If the whole tree was
817 	 * scanned, and the last tree node is allocated, update bighint.
818 	 */
819 	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
820 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
821 		scan->bm_bighint = count - 1;
822 
823 	return (SWAPBLK_NONE);
824 }
825 
826 /*
827  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
828  *
829  */
830 static void
831 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
832 {
833 	u_daddr_t mask;
834 
835 	/*
836 	 * free some data in this bitmap
837 	 * mask=0000111111111110000
838 	 *          \_________/\__/
839 	 *		count   n
840 	 */
841 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
842 	if (scan->bm_bitmap & mask)
843 		panic("freeing free block");
844 	scan->bm_bitmap |= mask;
845 }
846 
847 /*
848  * BLST_META_FREE() - free allocated blocks from radix tree meta info
849  *
850  *	This support routine frees a range of blocks from the bitmap.
851  *	The range must be entirely enclosed by this radix node.  If a
852  *	meta node, we break the range down recursively to free blocks
853  *	in subnodes (which means that this code can free an arbitrary
854  *	range whereas the allocation code cannot allocate an arbitrary
855  *	range).
856  */
857 static void
858 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
859 {
860 	daddr_t blk, endBlk, i, skip;
861 	int digit, endDigit;
862 
863 	/*
864 	 * We could probably do a better job here.  We are required to make
865 	 * bighint at least as large as the biggest allocable block of data.
866 	 * If we just shoehorn it, a little extra overhead will be incurred
867 	 * on the next allocation (but only that one typically).
868 	 */
869 	scan->bm_bighint = BLIST_MAX_ALLOC;
870 
871 	if (radix == BLIST_BMAP_RADIX)
872 		return (blst_leaf_free(scan, freeBlk, count));
873 
874 	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
875 	radix /= BLIST_META_RADIX;
876 	skip = radix_to_skip(radix);
877 	blk = freeBlk & -radix;
878 	digit = (blk / radix) & BLIST_META_MASK;
879 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
880 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
881 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
882 		blk += radix;
883 		count = ummin(blk, endBlk) - freeBlk;
884 		blst_meta_free(&scan[i], freeBlk, count, radix);
885 		freeBlk = blk;
886 	}
887 }
888 
889 /*
890  * BLST_COPY() - copy one radix tree to another
891  *
892  *	Locates free space in the source tree and frees it in the destination
893  *	tree.  The space may not already be free in the destination.
894  */
895 static void
896 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
897     daddr_t count)
898 {
899 	daddr_t endBlk, i, skip;
900 
901 	/*
902 	 * Leaf node
903 	 */
904 
905 	if (radix == BLIST_BMAP_RADIX) {
906 		u_daddr_t v = scan->bm_bitmap;
907 
908 		if (v == (u_daddr_t)-1) {
909 			blist_free(dest, blk, count);
910 		} else if (v != 0) {
911 			int i;
912 
913 			for (i = 0; i < count; ++i) {
914 				if (v & ((u_daddr_t)1 << i))
915 					blist_free(dest, blk + i, 1);
916 			}
917 		}
918 		return;
919 	}
920 
921 	/*
922 	 * Meta node
923 	 */
924 
925 	if (scan->bm_bitmap == 0) {
926 		/*
927 		 * Source all allocated, leave dest allocated
928 		 */
929 		return;
930 	}
931 
932 	endBlk = blk + count;
933 	radix /= BLIST_META_RADIX;
934 	skip = radix_to_skip(radix);
935 	for (i = 1; blk < endBlk; i += skip) {
936 		blk += radix;
937 		count = radix;
938 		if (blk >= endBlk)
939 			count -= blk - endBlk;
940 		blst_copy(&scan[i], blk - radix, radix, dest, count);
941 	}
942 }
943 
944 /*
945  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
946  *
947  *	This routine allocates all blocks in the specified range
948  *	regardless of any existing allocations in that range.  Returns
949  *	the number of blocks allocated by the call.
950  */
951 static daddr_t
952 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
953 {
954 	daddr_t nblks;
955 	u_daddr_t mask;
956 
957 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
958 
959 	/* Count the number of blocks that we are allocating. */
960 	nblks = bitcount64(scan->bm_bitmap & mask);
961 
962 	scan->bm_bitmap &= ~mask;
963 	return (nblks);
964 }
965 
966 /*
967  * BLIST_META_FILL() -	allocate specific blocks at a meta node
968  *
969  *	This routine allocates the specified range of blocks,
970  *	regardless of any existing allocations in the range.  The
971  *	range must be within the extent of this node.  Returns the
972  *	number of blocks allocated by the call.
973  */
974 static daddr_t
975 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
976 {
977 	daddr_t blk, endBlk, i, nblks, skip;
978 	int digit;
979 
980 	if (radix == BLIST_BMAP_RADIX)
981 		return (blst_leaf_fill(scan, allocBlk, count));
982 
983 	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
984 	radix /= BLIST_META_RADIX;
985 	skip = radix_to_skip(radix);
986 	blk = allocBlk & -radix;
987 	nblks = 0;
988 	while (blk < endBlk) {
989 		digit = (blk / radix) & BLIST_META_MASK;
990 		i = 1 + digit * skip;
991 		blk += radix;
992 		count = ummin(blk, endBlk) - allocBlk;
993 		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
994 		if (scan[i].bm_bitmap == 0)
995 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
996 		allocBlk = blk;
997 	}
998 	return (nblks);
999 }
1000 
1001 #ifdef BLIST_DEBUG
1002 
1003 static void
1004 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1005 {
1006 	daddr_t skip;
1007 	u_daddr_t bit, mask;
1008 	int digit;
1009 
1010 	if (radix == BLIST_BMAP_RADIX) {
1011 		printf(
1012 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1013 		    tab, tab, "",
1014 		    (long long)blk, (long long)radix,
1015 		    1 + (BLIST_BMAP_RADIX - 1) / 4,
1016 		    (long long)scan->bm_bitmap,
1017 		    (long long)scan->bm_bighint
1018 		);
1019 		return;
1020 	}
1021 
1022 	printf(
1023 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1024 	    tab, tab, "",
1025 	    (long long)blk, (long long)radix,
1026 	    (long long)radix,
1027 	    1 + (BLIST_META_RADIX - 1) / 4,
1028 	    (long long)scan->bm_bitmap,
1029 	    (long long)scan->bm_bighint
1030 	);
1031 
1032 	radix /= BLIST_META_RADIX;
1033 	skip = radix_to_skip(radix);
1034 	tab += 4;
1035 
1036 	mask = scan->bm_bitmap;
1037 	/* Examine the nonempty subtree associated with each bit set in mask */
1038 	do {
1039 		bit = mask & -mask;
1040 		digit = bitpos(bit);
1041 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1042 		    radix, tab);
1043 	} while ((mask ^= bit) != 0);
1044 	tab -= 4;
1045 
1046 	printf(
1047 	    "%*.*s}\n",
1048 	    tab, tab, ""
1049 	);
1050 }
1051 
1052 #endif
1053 
1054 #ifdef BLIST_DEBUG
1055 
1056 int
1057 main(int ac, char **av)
1058 {
1059 	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
1060 	int i;
1061 	blist_t bl;
1062 	struct sbuf *s;
1063 
1064 	for (i = 1; i < ac; ++i) {
1065 		const char *ptr = av[i];
1066 		if (*ptr != '-') {
1067 			size = strtol(ptr, NULL, 0);
1068 			continue;
1069 		}
1070 		ptr += 2;
1071 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1072 		exit(1);
1073 	}
1074 	bl = blist_create(size, M_WAITOK);
1075 	blist_free(bl, 0, size);
1076 
1077 	for (;;) {
1078 		char buf[1024];
1079 		long long da = 0;
1080 		long long count = 0;
1081 
1082 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1083 		    (long long)size, (long long)bl->bl_radix);
1084 		fflush(stdout);
1085 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1086 			break;
1087 		switch(buf[0]) {
1088 		case 'r':
1089 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1090 				blist_resize(&bl, count, 1, M_WAITOK);
1091 			} else {
1092 				printf("?\n");
1093 			}
1094 		case 'p':
1095 			blist_print(bl);
1096 			break;
1097 		case 's':
1098 			s = sbuf_new_auto();
1099 			blist_stats(bl, s);
1100 			sbuf_finish(s);
1101 			printf("%s", sbuf_data(s));
1102 			sbuf_delete(s);
1103 			break;
1104 		case 'a':
1105 			if (sscanf(buf + 1, "%lld", &count) == 1) {
1106 				daddr_t blk = blist_alloc(bl, count);
1107 				printf("    R=%08llx\n", (long long)blk);
1108 			} else {
1109 				printf("?\n");
1110 			}
1111 			break;
1112 		case 'f':
1113 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1114 				blist_free(bl, da, count);
1115 			} else {
1116 				printf("?\n");
1117 			}
1118 			break;
1119 		case 'l':
1120 			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1121 				printf("    n=%jd\n",
1122 				    (intmax_t)blist_fill(bl, da, count));
1123 			} else {
1124 				printf("?\n");
1125 			}
1126 			break;
1127 		case '?':
1128 		case 'h':
1129 			puts(
1130 			    "p          -print\n"
1131 			    "s          -stats\n"
1132 			    "a %d       -allocate\n"
1133 			    "f %x %d    -free\n"
1134 			    "l %x %d    -fill\n"
1135 			    "r %d       -resize\n"
1136 			    "h/?        -help"
1137 			);
1138 			break;
1139 		default:
1140 			printf("?\n");
1141 			break;
1142 		}
1143 	}
1144 	return(0);
1145 }
1146 
1147 void
1148 panic(const char *ctl, ...)
1149 {
1150 	va_list va;
1151 
1152 	va_start(va, ctl);
1153 	vfprintf(stderr, ctl, va);
1154 	fprintf(stderr, "\n");
1155 	va_end(va);
1156 	exit(1);
1157 }
1158 
1159 #endif
1160