xref: /freebsd/sys/kern/subr_blist.c (revision 2ef9ff7dd34a78a7890ba4d6de64da34d9c10942)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  Two radix
40  *	constants are involved: one for the size of the bitmaps contained in the
41  *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
42  *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
43  *	associated with a range of blocks.  The root of any subtree stores a
44  *	hint field that defines an upper bound on the size of the largest
45  *	allocation that can begin in the associated block range.  A hint is an
46  *	upper bound on a potential allocation, but not necessarily a tight upper
47  *	bound.
48  *
49  *	The bitmap field in each node directs the search for available blocks.
50  *	For a leaf node, a bit is set if the corresponding block is free.  For a
51  *	meta node, a bit is set if the corresponding subtree contains a free
52  *	block somewhere within it.  The search at a meta node considers only
53  *	children of that node that represent a range that includes a free block.
54  *
55  * 	The hinting greatly increases code efficiency for allocations while
56  *	the general radix structure optimizes both allocations and frees.  The
57  *	radix tree should be able to operate well no matter how much
58  *	fragmentation there is and no matter how large a bitmap is used.
59  *
60  *	The blist code wires all necessary memory at creation time.  Neither
61  *	allocations nor frees require interaction with the memory subsystem.
62  *	The non-blocking nature of allocations and frees is required by swap
63  *	code (vm/swap_pager.c).
64  *
65  *	LAYOUT: The radix tree is laid out recursively using a linear array.
66  *	Each meta node is immediately followed (laid out sequentially in
67  *	memory) by BLIST_META_RADIX lower level nodes.  This is a recursive
68  *	structure but one that can be easily scanned through a very simple
69  *	'skip' calculation.  The memory allocation is only large enough to
70  *	cover the number of blocks requested at creation time.  Nodes that
71  *	represent blocks beyond that limit, nodes that would never be read
72  *	or written, are not allocated, so that the last of the
73  *	BLIST_META_RADIX lower level nodes of a some nodes may not be
74  *	allocated.
75  *
76  *	NOTE: the allocator cannot currently allocate more than
77  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
78  *	large' if you try.  This is an area that could use improvement.  The
79  *	radix is large enough that this restriction does not effect the swap
80  *	system, though.  Currently only the allocation code is affected by
81  *	this algorithmic unfeature.  The freeing code can handle arbitrary
82  *	ranges.
83  *
84  *	This code can be compiled stand-alone for debugging.
85  */
86 
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD$");
89 
90 #ifdef _KERNEL
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101 
102 #else
103 
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107 
108 #include <sys/errno.h>
109 #include <sys/types.h>
110 #include <sys/malloc.h>
111 #include <sys/sbuf.h>
112 #include <assert.h>
113 #include <stdio.h>
114 #include <string.h>
115 #include <stddef.h>
116 #include <stdlib.h>
117 #include <stdarg.h>
118 #include <stdbool.h>
119 
120 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
121 #define malloc(a,b,c)	calloc(a, 1)
122 #define free(a,b)	free(a)
123 #define ummin(a,b)	((a) < (b) ? (a) : (b))
124 #define KASSERT(a,b)	assert(a)
125 
126 #include <sys/blist.h>
127 
128 #endif
129 
130 /*
131  * static support functions
132  */
133 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
134     int *count, int maxcount);
135 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
136     int maxcount, u_daddr_t radix);
137 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
138 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
139 		    u_daddr_t radix);
140 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
141 		    blist_t dest, daddr_t count);
142 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
143 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
144 		    u_daddr_t radix);
145 #ifndef _KERNEL
146 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
147 		    int tab);
148 #endif
149 
150 #ifdef _KERNEL
151 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
152 #endif
153 
154 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
155     "radix divisibility error");
156 #define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
157 #define	BLIST_META_MASK	(BLIST_META_RADIX - 1)
158 
159 /*
160  * For a subtree that can represent the state of up to 'radix' blocks, the
161  * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
162  * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
163  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
164  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
165  * in the 'meta' functions that process subtrees.  Since integer division
166  * discards remainders, we can express this computation as
167  * skip = (m * m**h) / (m - 1)
168  * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
169  * and since m divides BLIST_BMAP_RADIX, we can simplify further to
170  * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
171  * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
172  * so that simple integer division by a constant can safely be used for the
173  * calculation.
174  */
175 static inline daddr_t
176 radix_to_skip(daddr_t radix)
177 {
178 
179 	return (radix /
180 	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
181 }
182 
183 /*
184  * Provide a mask with count bits set, starting as position n.
185  */
186 static inline u_daddr_t
187 bitrange(int n, int count)
188 {
189 
190 	return (((u_daddr_t)-1 << n) &
191 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
192 }
193 
194 
195 /*
196  * Find the first bit set in a u_daddr_t.
197  */
198 static inline int
199 generic_bitpos(u_daddr_t mask)
200 {
201 	int hi, lo, mid;
202 
203 	lo = 0;
204 	hi = BLIST_BMAP_RADIX;
205 	while (lo + 1 < hi) {
206 		mid = (lo + hi) >> 1;
207 		if (mask & bitrange(0, mid))
208 			hi = mid;
209 		else
210 			lo = mid;
211 	}
212 	return (lo);
213 }
214 
215 static inline int
216 bitpos(u_daddr_t mask)
217 {
218 
219 	switch (sizeof(mask)) {
220 #ifdef HAVE_INLINE_FFSLL
221 	case sizeof(long long):
222 		return (ffsll(mask) - 1);
223 #endif
224 #ifdef HAVE_INLINE_FFS
225 	case sizeof(int):
226 		return (ffs(mask) - 1);
227 #endif
228 	default:
229 		return (generic_bitpos(mask));
230 	}
231 }
232 
233 /*
234  * blist_create() - create a blist capable of handling up to the specified
235  *		    number of blocks
236  *
237  *	blocks - must be greater than 0
238  * 	flags  - malloc flags
239  *
240  *	The smallest blist consists of a single leaf node capable of
241  *	managing BLIST_BMAP_RADIX blocks.
242  */
243 blist_t
244 blist_create(daddr_t blocks, int flags)
245 {
246 	blist_t bl;
247 	u_daddr_t nodes, radix;
248 
249 	KASSERT(blocks > 0, ("invalid block count"));
250 
251 	/*
252 	 * Calculate the radix and node count used for scanning.
253 	 */
254 	nodes = 1;
255 	radix = BLIST_BMAP_RADIX;
256 	while (radix <= blocks) {
257 		nodes += 1 + (blocks - 1) / radix;
258 		radix *= BLIST_META_RADIX;
259 	}
260 
261 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
262 	    M_ZERO);
263 	if (bl == NULL)
264 		return (NULL);
265 
266 	bl->bl_blocks = blocks;
267 	bl->bl_radix = radix;
268 
269 #if defined(BLIST_DEBUG)
270 	printf(
271 		"BLIST representing %lld blocks (%lld MB of swap)"
272 		", requiring %lldK of ram\n",
273 		(long long)bl->bl_blocks,
274 		(long long)bl->bl_blocks * 4 / 1024,
275 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
276 	);
277 	printf("BLIST raw radix tree contains %lld records\n",
278 	    (long long)nodes);
279 #endif
280 
281 	return (bl);
282 }
283 
284 void
285 blist_destroy(blist_t bl)
286 {
287 
288 	free(bl, M_SWAP);
289 }
290 
291 /*
292  * blist_alloc() -   reserve space in the block bitmap.  Return the base
293  *		     of a contiguous region or SWAPBLK_NONE if space could
294  *		     not be allocated.
295  */
296 daddr_t
297 blist_alloc(blist_t bl, int *count, int maxcount)
298 {
299 	daddr_t blk, cursor;
300 
301 	KASSERT(*count <= maxcount,
302 	    ("invalid parameters %d > %d", *count, maxcount));
303 	KASSERT(maxcount <= BLIST_MAX_ALLOC,
304 	    ("allocation too large: %d", maxcount));
305 
306 	/*
307 	 * This loop iterates at most twice.  An allocation failure in the
308 	 * first iteration leads to a second iteration only if the cursor was
309 	 * non-zero.  When the cursor is zero, an allocation failure will
310 	 * stop further iterations.
311 	 */
312 	for (cursor = bl->bl_cursor;; cursor = 0) {
313 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
314 		    bl->bl_radix);
315 		if (blk != SWAPBLK_NONE) {
316 			bl->bl_avail -= *count;
317 			bl->bl_cursor = blk + *count;
318 			if (bl->bl_cursor == bl->bl_blocks)
319 				bl->bl_cursor = 0;
320 			return (blk);
321 		}
322 		if (cursor == 0)
323 			return (SWAPBLK_NONE);
324 	}
325 }
326 
327 /*
328  * blist_avail() -	return the number of free blocks.
329  */
330 daddr_t
331 blist_avail(blist_t bl)
332 {
333 
334 	return (bl->bl_avail);
335 }
336 
337 /*
338  * blist_free() -	free up space in the block bitmap.  Return the base
339  *		     	of a contiguous region.
340  */
341 void
342 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
343 {
344 
345 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
346 	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
347 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
348 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
349 	bl->bl_avail += count;
350 }
351 
352 /*
353  * blist_fill() -	mark a region in the block bitmap as off-limits
354  *			to the allocator (i.e. allocate it), ignoring any
355  *			existing allocations.  Return the number of blocks
356  *			actually filled that were free before the call.
357  */
358 daddr_t
359 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
360 {
361 	daddr_t filled;
362 
363 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
364 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
365 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
366 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
367 	bl->bl_avail -= filled;
368 	return (filled);
369 }
370 
371 /*
372  * blist_resize() -	resize an existing radix tree to handle the
373  *			specified number of blocks.  This will reallocate
374  *			the tree and transfer the previous bitmap to the new
375  *			one.  When extending the tree you can specify whether
376  *			the new blocks are to left allocated or freed.
377  */
378 void
379 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
380 {
381     blist_t newbl = blist_create(count, flags);
382     blist_t save = *pbl;
383 
384     *pbl = newbl;
385     if (count > save->bl_blocks)
386 	    count = save->bl_blocks;
387     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
388 
389     /*
390      * If resizing upwards, should we free the new space or not?
391      */
392     if (freenew && count < newbl->bl_blocks) {
393 	    blist_free(newbl, count, newbl->bl_blocks - count);
394     }
395     blist_destroy(save);
396 }
397 
398 #ifdef BLIST_DEBUG
399 
400 /*
401  * blist_print()    - dump radix tree
402  */
403 void
404 blist_print(blist_t bl)
405 {
406 	printf("BLIST avail = %jd, cursor = %08jx {\n",
407 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
408 
409 	if (bl->bl_root->bm_bitmap != 0)
410 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
411 	printf("}\n");
412 }
413 
414 #endif
415 
416 static const u_daddr_t fib[] = {
417 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
418 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
419 	514229, 832040, 1346269, 2178309, 3524578,
420 };
421 
422 /*
423  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
424  */
425 struct gap_stats {
426 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
427 	daddr_t	num;		/* number of gaps observed */
428 	daddr_t	max;		/* largest gap size */
429 	daddr_t	avg;		/* average gap size */
430 	daddr_t	err;		/* sum - num * avg */
431 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
432 	int	max_bucket;	/* last histo elt with nonzero val */
433 };
434 
435 /*
436  * gap_stats_counting()    - is the state 'counting 1 bits'?
437  *                           or 'skipping 0 bits'?
438  */
439 static inline bool
440 gap_stats_counting(const struct gap_stats *stats)
441 {
442 
443 	return (stats->start != SWAPBLK_NONE);
444 }
445 
446 /*
447  * init_gap_stats()    - initialize stats on gap sizes
448  */
449 static inline void
450 init_gap_stats(struct gap_stats *stats)
451 {
452 
453 	bzero(stats, sizeof(*stats));
454 	stats->start = SWAPBLK_NONE;
455 }
456 
457 /*
458  * update_gap_stats()    - update stats on gap sizes
459  */
460 static void
461 update_gap_stats(struct gap_stats *stats, daddr_t posn)
462 {
463 	daddr_t size;
464 	int hi, lo, mid;
465 
466 	if (!gap_stats_counting(stats)) {
467 		stats->start = posn;
468 		return;
469 	}
470 	size = posn - stats->start;
471 	stats->start = SWAPBLK_NONE;
472 	if (size > stats->max)
473 		stats->max = size;
474 
475 	/*
476 	 * Find the fibonacci range that contains size,
477 	 * expecting to find it in an early range.
478 	 */
479 	lo = 0;
480 	hi = 1;
481 	while (hi < nitems(fib) && fib[hi] <= size) {
482 		lo = hi;
483 		hi *= 2;
484 	}
485 	if (hi >= nitems(fib))
486 		hi = nitems(fib);
487 	while (lo + 1 != hi) {
488 		mid = (lo + hi) >> 1;
489 		if (fib[mid] <= size)
490 			lo = mid;
491 		else
492 			hi = mid;
493 	}
494 	stats->histo[lo]++;
495 	if (lo > stats->max_bucket)
496 		stats->max_bucket = lo;
497 	stats->err += size - stats->avg;
498 	stats->num++;
499 	stats->avg += stats->err / stats->num;
500 	stats->err %= stats->num;
501 }
502 
503 /*
504  * dump_gap_stats()    - print stats on gap sizes
505  */
506 static inline void
507 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
508 {
509 	int i;
510 
511 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
512 	    (intmax_t)stats->num);
513 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
514 	sbuf_printf(s, "average maximal free range size: %jd\n",
515 	    (intmax_t)stats->avg);
516 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
517 	sbuf_printf(s, "               count  |  size range\n");
518 	sbuf_printf(s, "               -----  |  ----------\n");
519 	for (i = 0; i < stats->max_bucket; i++) {
520 		if (stats->histo[i] != 0) {
521 			sbuf_printf(s, "%20jd  |  ",
522 			    (intmax_t)stats->histo[i]);
523 			if (fib[i] != fib[i + 1] - 1)
524 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
525 				    (intmax_t)fib[i + 1] - 1);
526 			else
527 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
528 		}
529 	}
530 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
531 	if (stats->histo[i] > 1)
532 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
533 		    (intmax_t)stats->max);
534 	else
535 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
536 }
537 
538 /*
539  * blist_stats()    - dump radix tree stats
540  */
541 void
542 blist_stats(blist_t bl, struct sbuf *s)
543 {
544 	struct gap_stats gstats;
545 	struct gap_stats *stats = &gstats;
546 	daddr_t i, nodes, radix;
547 	u_daddr_t diff, mask;
548 	int digit;
549 
550 	init_gap_stats(stats);
551 	nodes = 0;
552 	i = bl->bl_radix;
553 	while (i < bl->bl_radix + bl->bl_blocks) {
554 		/*
555 		 * Find max size subtree starting at i.
556 		 */
557 		radix = BLIST_BMAP_RADIX;
558 		while (((i / radix) & BLIST_META_MASK) == 0)
559 			radix *= BLIST_META_RADIX;
560 
561 		/*
562 		 * Check for skippable subtrees starting at i.
563 		 */
564 		while (radix > BLIST_BMAP_RADIX) {
565 			if (bl->bl_root[nodes].bm_bitmap == 0) {
566 				if (gap_stats_counting(stats))
567 					update_gap_stats(stats, i);
568 				break;
569 			}
570 
571 			/*
572 			 * Skip subtree root.
573 			 */
574 			nodes++;
575 			radix /= BLIST_META_RADIX;
576 		}
577 		if (radix == BLIST_BMAP_RADIX) {
578 			/*
579 			 * Scan leaf.
580 			 */
581 			mask = bl->bl_root[nodes].bm_bitmap;
582 			diff = mask ^ (mask << 1);
583 			if (gap_stats_counting(stats))
584 				diff ^= 1;
585 			while (diff != 0) {
586 				digit = bitpos(diff);
587 				update_gap_stats(stats, i + digit);
588 				diff ^= bitrange(digit, 1);
589 			}
590 		}
591 		nodes += radix_to_skip(radix);
592 		i += radix;
593 	}
594 	update_gap_stats(stats, i);
595 	dump_gap_stats(stats, s);
596 }
597 
598 /************************************************************************
599  *			  ALLOCATION SUPPORT FUNCTIONS			*
600  ************************************************************************
601  *
602  *	These support functions do all the actual work.  They may seem
603  *	rather longish, but that's because I've commented them up.  The
604  *	actual code is straight forward.
605  *
606  */
607 
608 /*
609  * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
610  *
611  *	'scan' is a leaf node, associated with a block containing 'blk'.
612  *	The next leaf node could be adjacent, or several nodes away if the
613  *	least common ancestor of 'scan' and its neighbor is several levels
614  *	up.  Use 'blk' to determine how many meta-nodes lie between the
615  *	leaves.  If the next leaf has enough initial bits set, clear them
616  *	and clear the bits in the meta nodes on the path up to the least
617  *	common ancestor to mark any subtrees made completely empty.
618  */
619 static int
620 blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, int maxcount)
621 {
622 	blmeta_t *next;
623 	u_daddr_t radix;
624 	int avail, digit;
625 
626 	next = scan + 1;
627 	blk += BLIST_BMAP_RADIX;
628 	radix = BLIST_BMAP_RADIX;
629 	while ((next->bm_bitmap & 1) == 1 &&
630 	    (digit = ((blk / radix) & BLIST_META_MASK)) == 0) {
631 		next++;
632 		radix *= BLIST_META_RADIX;
633 	}
634 	if ((next->bm_bitmap & 1) != 1)
635 		return (0);
636 	avail = (~next->bm_bitmap != 0) ?
637 	    bitpos(~next->bm_bitmap) : BLIST_BMAP_RADIX;
638 	if (avail < count) {
639  		/*
640  		 * The next leaf doesn't have enough free blocks at the
641  		 * beginning to complete the spanning allocation.
642  		 */
643 		return (0);
644 	}
645 	count = imin(avail, maxcount);
646 	/* Clear the first 'count' bits in the next leaf to allocate. */
647 	next->bm_bitmap &= ~bitrange(0, count);
648 
649 	/*
650 	 * Update bitmaps of next-ancestors, up to least common ancestor.
651 	 */
652 	while (next->bm_bitmap == 0) {
653 		if (--next == scan) {
654 			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
655 			    (u_daddr_t)1 << digit;
656 			break;
657 		}
658 		next->bm_bitmap ^= 1;
659  	}
660 	return (count);
661 }
662 
663 /*
664  * Given a bitmask, flip all the bits from the least-significant 1-bit to the
665  * most significant bit.  If the result is non-zero, then the least-significant
666  * 1-bit of the result is in the same position as the least-signification 0-bit
667  * in mask that is followed by a 1-bit.
668  */
669 static inline u_daddr_t
670 flip_hibits(u_daddr_t mask)
671 {
672 
673 	return (-mask & ~mask);
674 }
675 
676 /*
677  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
678  *
679  *	This function is the core of the allocator.  Its execution time is
680  *	proportional to log(count), plus height of the tree if the allocation
681  *	crosses a leaf boundary.
682  */
683 static daddr_t
684 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
685 {
686 	u_daddr_t cursor_mask, mask;
687 	int count1, hi, lo, num_shifts, range1, range_ext;
688 
689 	range1 = 0;
690 	count1 = *count - 1;
691 	num_shifts = fls(count1);
692 	mask = scan->bm_bitmap;
693 	while (flip_hibits(mask) != 0 && num_shifts > 0) {
694 		/*
695 		 * If bit i is set in mask, then bits in [i, i+range1] are set
696 		 * in scan->bm_bitmap.  The value of range1 is equal to count1
697 		 * >> num_shifts.  Grow range1 and reduce num_shifts to 0,
698 		 * while preserving these invariants.  The updates to mask
699 		 * leave fewer bits set, but each bit that remains set
700 		 * represents a longer string of consecutive bits set in
701 		 * scan->bm_bitmap.  If more updates to mask cannot clear more
702 		 * bits, because mask is partitioned with all 0 bits preceding
703 		 * all 1 bits, the loop terminates immediately.
704 		 */
705 		num_shifts--;
706 		range_ext = range1 + ((count1 >> num_shifts) & 1);
707 		/*
708 		 * mask is a signed quantity for the shift because when it is
709 		 * shifted right, the sign bit should copied; when the last
710 		 * block of the leaf is free, pretend, for a while, that all the
711 		 * blocks that follow it are also free.
712 		 */
713 		mask &= (daddr_t)mask >> range_ext;
714 		range1 += range_ext;
715 	}
716 	if (mask == 0) {
717 		/*
718 		 * Update bighint.  There is no allocation bigger than range1
719 		 * starting in this leaf.
720 		 */
721 		scan->bm_bighint = range1;
722 		return (SWAPBLK_NONE);
723 	}
724 
725 	/* Discard any candidates that appear before blk. */
726 	if ((blk & BLIST_BMAP_MASK) != 0) {
727 		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
728 		if (cursor_mask != 0) {
729 			mask ^= cursor_mask;
730 			if (mask == 0)
731 				return (SWAPBLK_NONE);
732 
733 			/*
734 			 * Bighint change for last block allocation cannot
735 			 * assume that any other blocks are allocated, so the
736 			 * bighint cannot be reduced much.
737 			 */
738 			range1 = BLIST_MAX_ALLOC - 1;
739 		}
740 		blk &= ~BLIST_BMAP_MASK;
741 	}
742 
743 	/*
744 	 * The least significant set bit in mask marks the start of the first
745 	 * available range of sufficient size.  Find its position.
746 	 */
747 	lo = bitpos(mask);
748 
749 	/*
750 	 * Find how much space is available starting at that position.
751 	 */
752 	if (flip_hibits(mask) != 0) {
753 		/* Count the 1 bits starting at position lo. */
754 		hi = bitpos(flip_hibits(mask)) + count1;
755 		if (maxcount < hi - lo)
756 			hi = lo + maxcount;
757 		*count = hi - lo;
758 		mask = bitrange(lo, *count);
759 	} else if (maxcount <= BLIST_BMAP_RADIX - lo) {
760 		/* All the blocks we can use are available here. */
761 		hi = lo + maxcount;
762 		*count = maxcount;
763 		mask = bitrange(lo, *count);
764 	} else {
765 		/* Check next leaf for some of the blocks we want or need. */
766 		count1 = *count - (BLIST_BMAP_RADIX - lo);
767 		maxcount -= BLIST_BMAP_RADIX - lo;
768 		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
769 		if (hi < count1)
770 			/*
771 			 * The next leaf cannot supply enough blocks to reach
772 			 * the minimum required allocation.  The hint cannot be
773 			 * updated, because the same allocation request could
774 			 * be satisfied later, by this leaf, if the state of
775 			 * the next leaf changes, and without any changes to
776 			 * this leaf.
777 			 */
778 			return (SWAPBLK_NONE);
779 		*count = BLIST_BMAP_RADIX - lo + hi;
780 		hi = BLIST_BMAP_RADIX;
781 	}
782 
783 	if (hi == BLIST_BMAP_RADIX) {
784 		/*
785 		 * Update bighint.  There is no allocation bigger than range1
786 		 * available in this leaf after this allocation completes.
787 		 */
788 		scan->bm_bighint = range1;
789 	}
790 	/* Clear the allocated bits from this leaf. */
791 	scan->bm_bitmap &= ~mask;
792 	return (blk + lo);
793 }
794 
795 /*
796  * blist_meta_alloc() -	allocate at a meta in the radix tree.
797  *
798  *	Attempt to allocate at a meta node.  If we can't, we update
799  *	bighint and return a failure.  Updating bighint optimize future
800  *	calls that hit this node.  We have to check for our collapse cases
801  *	and we have a few optimizations strewn in as well.
802  */
803 static daddr_t
804 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
805     int maxcount, u_daddr_t radix)
806 {
807 	daddr_t blk, i, r, skip;
808 	u_daddr_t mask;
809 	bool scan_from_start;
810 	int digit;
811 
812 	if (radix == BLIST_BMAP_RADIX)
813 		return (blst_leaf_alloc(scan, cursor, count, maxcount));
814 	blk = cursor & -radix;
815 	scan_from_start = (cursor == blk);
816 	radix /= BLIST_META_RADIX;
817 	skip = radix_to_skip(radix);
818 	mask = scan->bm_bitmap;
819 
820 	/* Discard any candidates that appear before cursor. */
821 	digit = (cursor / radix) & BLIST_META_MASK;
822 	mask &= (u_daddr_t)-1 << digit;
823 	if (mask == 0)
824 		return (SWAPBLK_NONE);
825 
826 	/*
827 	 * If the first try is for a block that includes the cursor, pre-undo
828 	 * the digit * radix offset in the first call; otherwise, ignore the
829 	 * cursor entirely.
830 	 */
831 	if (((mask >> digit) & 1) == 1)
832 		cursor -= digit * radix;
833 	else
834 		cursor = blk;
835 
836 	/*
837 	 * Examine the nonempty subtree associated with each bit set in mask.
838 	 */
839 	do {
840 		digit = bitpos(mask);
841 		i = 1 + digit * skip;
842 		if (*count <= scan[i].bm_bighint) {
843 			/*
844 			 * The allocation might fit beginning in the i'th subtree.
845 			 */
846 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
847 			    count, maxcount, radix);
848 			if (r != SWAPBLK_NONE) {
849 				if (scan[i].bm_bitmap == 0)
850 					scan->bm_bitmap ^= bitrange(digit, 1);
851 				return (r);
852 			}
853 		}
854 		cursor = blk;
855 	} while ((mask ^= bitrange(digit, 1)) != 0);
856 
857 	/*
858 	 * We couldn't allocate count in this subtree.  If the whole tree was
859 	 * scanned, and the last tree node is allocated, update bighint.
860 	 */
861 	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
862 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
863 		scan->bm_bighint = *count - 1;
864 
865 	return (SWAPBLK_NONE);
866 }
867 
868 /*
869  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
870  *
871  */
872 static void
873 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
874 {
875 	u_daddr_t mask;
876 
877 	/*
878 	 * free some data in this bitmap
879 	 * mask=0000111111111110000
880 	 *          \_________/\__/
881 	 *		count   n
882 	 */
883 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
884 	KASSERT((scan->bm_bitmap & mask) == 0,
885 	    ("freeing free block: %jx, size %d, mask %jx",
886 	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
887 	scan->bm_bitmap |= mask;
888 }
889 
890 /*
891  * BLST_META_FREE() - free allocated blocks from radix tree meta info
892  *
893  *	This support routine frees a range of blocks from the bitmap.
894  *	The range must be entirely enclosed by this radix node.  If a
895  *	meta node, we break the range down recursively to free blocks
896  *	in subnodes (which means that this code can free an arbitrary
897  *	range whereas the allocation code cannot allocate an arbitrary
898  *	range).
899  */
900 static void
901 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
902 {
903 	daddr_t blk, endBlk, i, skip;
904 	int digit, endDigit;
905 
906 	/*
907 	 * We could probably do a better job here.  We are required to make
908 	 * bighint at least as large as the biggest allocable block of data.
909 	 * If we just shoehorn it, a little extra overhead will be incurred
910 	 * on the next allocation (but only that one typically).
911 	 */
912 	scan->bm_bighint = BLIST_MAX_ALLOC;
913 
914 	if (radix == BLIST_BMAP_RADIX)
915 		return (blst_leaf_free(scan, freeBlk, count));
916 
917 	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
918 	radix /= BLIST_META_RADIX;
919 	skip = radix_to_skip(radix);
920 	blk = freeBlk & -radix;
921 	digit = (blk / radix) & BLIST_META_MASK;
922 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
923 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
924 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
925 		blk += radix;
926 		count = ummin(blk, endBlk) - freeBlk;
927 		blst_meta_free(&scan[i], freeBlk, count, radix);
928 		freeBlk = blk;
929 	}
930 }
931 
932 /*
933  * BLST_COPY() - copy one radix tree to another
934  *
935  *	Locates free space in the source tree and frees it in the destination
936  *	tree.  The space may not already be free in the destination.
937  */
938 static void
939 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
940     daddr_t count)
941 {
942 	daddr_t endBlk, i, skip;
943 
944 	/*
945 	 * Leaf node
946 	 */
947 
948 	if (radix == BLIST_BMAP_RADIX) {
949 		u_daddr_t v = scan->bm_bitmap;
950 
951 		if (v == (u_daddr_t)-1) {
952 			blist_free(dest, blk, count);
953 		} else if (v != 0) {
954 			int i;
955 
956 			for (i = 0; i < count; ++i) {
957 				if (v & ((u_daddr_t)1 << i))
958 					blist_free(dest, blk + i, 1);
959 			}
960 		}
961 		return;
962 	}
963 
964 	/*
965 	 * Meta node
966 	 */
967 
968 	if (scan->bm_bitmap == 0) {
969 		/*
970 		 * Source all allocated, leave dest allocated
971 		 */
972 		return;
973 	}
974 
975 	endBlk = blk + count;
976 	radix /= BLIST_META_RADIX;
977 	skip = radix_to_skip(radix);
978 	for (i = 1; blk < endBlk; i += skip) {
979 		blk += radix;
980 		count = radix;
981 		if (blk >= endBlk)
982 			count -= blk - endBlk;
983 		blst_copy(&scan[i], blk - radix, radix, dest, count);
984 	}
985 }
986 
987 /*
988  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
989  *
990  *	This routine allocates all blocks in the specified range
991  *	regardless of any existing allocations in that range.  Returns
992  *	the number of blocks allocated by the call.
993  */
994 static daddr_t
995 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
996 {
997 	daddr_t nblks;
998 	u_daddr_t mask;
999 
1000 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
1001 
1002 	/* Count the number of blocks that we are allocating. */
1003 	nblks = bitcount64(scan->bm_bitmap & mask);
1004 
1005 	scan->bm_bitmap &= ~mask;
1006 	return (nblks);
1007 }
1008 
1009 /*
1010  * BLIST_META_FILL() -	allocate specific blocks at a meta node
1011  *
1012  *	This routine allocates the specified range of blocks,
1013  *	regardless of any existing allocations in the range.  The
1014  *	range must be within the extent of this node.  Returns the
1015  *	number of blocks allocated by the call.
1016  */
1017 static daddr_t
1018 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1019 {
1020 	daddr_t blk, endBlk, i, nblks, skip;
1021 	int digit;
1022 
1023 	if (radix == BLIST_BMAP_RADIX)
1024 		return (blst_leaf_fill(scan, allocBlk, count));
1025 
1026 	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
1027 	radix /= BLIST_META_RADIX;
1028 	skip = radix_to_skip(radix);
1029 	blk = allocBlk & -radix;
1030 	nblks = 0;
1031 	while (blk < endBlk) {
1032 		digit = (blk / radix) & BLIST_META_MASK;
1033 		i = 1 + digit * skip;
1034 		blk += radix;
1035 		count = ummin(blk, endBlk) - allocBlk;
1036 		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
1037 		if (scan[i].bm_bitmap == 0)
1038 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1039 		allocBlk = blk;
1040 	}
1041 	return (nblks);
1042 }
1043 
1044 #ifdef BLIST_DEBUG
1045 
1046 static void
1047 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1048 {
1049 	daddr_t skip;
1050 	u_daddr_t mask;
1051 	int digit;
1052 
1053 	if (radix == BLIST_BMAP_RADIX) {
1054 		printf(
1055 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1056 		    tab, tab, "",
1057 		    (long long)blk, (long long)radix,
1058 		    1 + (BLIST_BMAP_RADIX - 1) / 4,
1059 		    (long long)scan->bm_bitmap,
1060 		    (long long)scan->bm_bighint
1061 		);
1062 		return;
1063 	}
1064 
1065 	printf(
1066 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1067 	    tab, tab, "",
1068 	    (long long)blk, (long long)radix,
1069 	    (long long)radix,
1070 	    1 + (BLIST_META_RADIX - 1) / 4,
1071 	    (long long)scan->bm_bitmap,
1072 	    (long long)scan->bm_bighint
1073 	);
1074 
1075 	radix /= BLIST_META_RADIX;
1076 	skip = radix_to_skip(radix);
1077 	tab += 4;
1078 
1079 	mask = scan->bm_bitmap;
1080 	/* Examine the nonempty subtree associated with each bit set in mask */
1081 	do {
1082 		digit = bitpos(mask);
1083 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1084 		    radix, tab);
1085 	} while ((mask ^= bitrange(digit, 1)) != 0);
1086 	tab -= 4;
1087 
1088 	printf(
1089 	    "%*.*s}\n",
1090 	    tab, tab, ""
1091 	);
1092 }
1093 
1094 #endif
1095 
1096 #ifdef BLIST_DEBUG
1097 
1098 int
1099 main(int ac, char **av)
1100 {
1101 	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
1102 	int i;
1103 	blist_t bl;
1104 	struct sbuf *s;
1105 
1106 	for (i = 1; i < ac; ++i) {
1107 		const char *ptr = av[i];
1108 		if (*ptr != '-') {
1109 			size = strtol(ptr, NULL, 0);
1110 			continue;
1111 		}
1112 		ptr += 2;
1113 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1114 		exit(1);
1115 	}
1116 	bl = blist_create(size, M_WAITOK);
1117 	blist_free(bl, 0, size);
1118 
1119 	for (;;) {
1120 		char buf[1024];
1121 		long long da = 0;
1122 		int count = 0, maxcount = 0;
1123 
1124 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1125 		    (long long)size, (long long)bl->bl_radix);
1126 		fflush(stdout);
1127 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1128 			break;
1129 		switch(buf[0]) {
1130 		case 'r':
1131 			if (sscanf(buf + 1, "%d", &count) == 1) {
1132 				blist_resize(&bl, count, 1, M_WAITOK);
1133 			} else {
1134 				printf("?\n");
1135 			}
1136 		case 'p':
1137 			blist_print(bl);
1138 			break;
1139 		case 's':
1140 			s = sbuf_new_auto();
1141 			blist_stats(bl, s);
1142 			sbuf_finish(s);
1143 			printf("%s", sbuf_data(s));
1144 			sbuf_delete(s);
1145 			break;
1146 		case 'a':
1147 			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1148 				daddr_t blk = blist_alloc(bl, &count, maxcount);
1149 				printf("    R=%08llx, c=%08d\n",
1150 				    (long long)blk, count);
1151 			} else {
1152 				printf("?\n");
1153 			}
1154 			break;
1155 		case 'f':
1156 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1157 				blist_free(bl, da, count);
1158 			} else {
1159 				printf("?\n");
1160 			}
1161 			break;
1162 		case 'l':
1163 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1164 				printf("    n=%jd\n",
1165 				    (intmax_t)blist_fill(bl, da, count));
1166 			} else {
1167 				printf("?\n");
1168 			}
1169 			break;
1170 		case '?':
1171 		case 'h':
1172 			puts(
1173 			    "p          -print\n"
1174 			    "s          -stats\n"
1175 			    "a %d %d    -allocate\n"
1176 			    "f %x %d    -free\n"
1177 			    "l %x %d    -fill\n"
1178 			    "r %d       -resize\n"
1179 			    "h/?        -help\n"
1180 			    "q          -quit"
1181 			);
1182 			break;
1183 		case 'q':
1184 			break;
1185 		default:
1186 			printf("?\n");
1187 			break;
1188 		}
1189 		if (buf[0] == 'q')
1190 			break;
1191 	}
1192 	return (0);
1193 }
1194 
1195 #endif
1196