xref: /freebsd/sys/kern/subr_blist.c (revision 2c9a9dfc187d171de6b92654d71b977f067ed641)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 /*
30  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
31  *
32  *	This module implements a general bitmap allocator/deallocator.  The
33  *	allocator eats around 2 bits per 'block'.  The module does not
34  *	try to interpret the meaning of a 'block' other than to return
35  *	SWAPBLK_NONE on an allocation failure.
36  *
37  *	A radix tree controls access to pieces of the bitmap, and includes
38  *	auxiliary information at each interior node about the availabilty of
39  *	contiguous free blocks in the subtree rooted at that node.  Two radix
40  *	constants are involved: one for the size of the bitmaps contained in the
41  *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
42  *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
43  *	associated with a range of blocks.  The root of any subtree stores a
44  *	hint field that defines an upper bound on the size of the largest
45  *	allocation that can begin in the associated block range.  A hint is an
46  *	upper bound on a potential allocation, but not necessarily a tight upper
47  *	bound.
48  *
49  *	The bitmap field in each node directs the search for available blocks.
50  *	For a leaf node, a bit is set if the corresponding block is free.  For a
51  *	meta node, a bit is set if the corresponding subtree contains a free
52  *	block somewhere within it.  The search at a meta node considers only
53  *	children of that node that represent a range that includes a free block.
54  *
55  * 	The hinting greatly increases code efficiency for allocations while
56  *	the general radix structure optimizes both allocations and frees.  The
57  *	radix tree should be able to operate well no matter how much
58  *	fragmentation there is and no matter how large a bitmap is used.
59  *
60  *	The blist code wires all necessary memory at creation time.  Neither
61  *	allocations nor frees require interaction with the memory subsystem.
62  *	The non-blocking nature of allocations and frees is required by swap
63  *	code (vm/swap_pager.c).
64  *
65  *	LAYOUT: The radix tree is laid out recursively using a linear array.
66  *	Each meta node is immediately followed (laid out sequentially in
67  *	memory) by BLIST_META_RADIX lower level nodes.  This is a recursive
68  *	structure but one that can be easily scanned through a very simple
69  *	'skip' calculation.  The memory allocation is only large enough to
70  *	cover the number of blocks requested at creation time.  Nodes that
71  *	represent blocks beyond that limit, nodes that would never be read
72  *	or written, are not allocated, so that the last of the
73  *	BLIST_META_RADIX lower level nodes of a some nodes may not be
74  *	allocated.
75  *
76  *	NOTE: the allocator cannot currently allocate more than
77  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
78  *	large' if you try.  This is an area that could use improvement.  The
79  *	radix is large enough that this restriction does not effect the swap
80  *	system, though.  Currently only the allocation code is affected by
81  *	this algorithmic unfeature.  The freeing code can handle arbitrary
82  *	ranges.
83  *
84  *	This code can be compiled stand-alone for debugging.
85  */
86 
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD$");
89 
90 #ifdef _KERNEL
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101 
102 #else
103 
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107 
108 #include <sys/errno.h>
109 #include <sys/types.h>
110 #include <sys/malloc.h>
111 #include <sys/sbuf.h>
112 #include <assert.h>
113 #include <stdio.h>
114 #include <string.h>
115 #include <stddef.h>
116 #include <stdlib.h>
117 #include <stdarg.h>
118 #include <stdbool.h>
119 
120 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
121 #define malloc(a,b,c)	calloc(a, 1)
122 #define free(a,b)	free(a)
123 #define ummin(a,b)	((a) < (b) ? (a) : (b))
124 #define imin(a,b)	((a) < (b) ? (a) : (b))
125 #define KASSERT(a,b)	assert(a)
126 
127 #include <sys/blist.h>
128 
129 #endif
130 
131 /*
132  * static support functions
133  */
134 static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
135     int *count, int maxcount);
136 static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
137     int maxcount, u_daddr_t radix);
138 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
139 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
140 		    u_daddr_t radix);
141 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
142 		    blist_t dest, daddr_t count);
143 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
144 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
145 		    u_daddr_t radix);
146 #ifndef _KERNEL
147 static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
148 		    int tab);
149 #endif
150 
151 #ifdef _KERNEL
152 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
153 #endif
154 
155 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
156     "radix divisibility error");
157 #define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
158 #define	BLIST_META_MASK	(BLIST_META_RADIX - 1)
159 
160 /*
161  * For a subtree that can represent the state of up to 'radix' blocks, the
162  * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
163  * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
164  * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
165  * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
166  * in the 'meta' functions that process subtrees.  Since integer division
167  * discards remainders, we can express this computation as
168  * skip = (m * m**h) / (m - 1)
169  * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
170  * and since m divides BLIST_BMAP_RADIX, we can simplify further to
171  * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
172  * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
173  * so that simple integer division by a constant can safely be used for the
174  * calculation.
175  */
176 static inline daddr_t
177 radix_to_skip(daddr_t radix)
178 {
179 
180 	return (radix /
181 	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
182 }
183 
184 /*
185  * Provide a mask with count bits set, starting as position n.
186  */
187 static inline u_daddr_t
188 bitrange(int n, int count)
189 {
190 
191 	return (((u_daddr_t)-1 << n) &
192 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
193 }
194 
195 
196 /*
197  * Find the first bit set in a u_daddr_t.
198  */
199 static inline int
200 generic_bitpos(u_daddr_t mask)
201 {
202 	int hi, lo, mid;
203 
204 	lo = 0;
205 	hi = BLIST_BMAP_RADIX;
206 	while (lo + 1 < hi) {
207 		mid = (lo + hi) >> 1;
208 		if (mask & bitrange(0, mid))
209 			hi = mid;
210 		else
211 			lo = mid;
212 	}
213 	return (lo);
214 }
215 
216 static inline int
217 bitpos(u_daddr_t mask)
218 {
219 
220 	switch (sizeof(mask)) {
221 #ifdef HAVE_INLINE_FFSLL
222 	case sizeof(long long):
223 		return (ffsll(mask) - 1);
224 #endif
225 #ifdef HAVE_INLINE_FFS
226 	case sizeof(int):
227 		return (ffs(mask) - 1);
228 #endif
229 	default:
230 		return (generic_bitpos(mask));
231 	}
232 }
233 
234 /*
235  * blist_create() - create a blist capable of handling up to the specified
236  *		    number of blocks
237  *
238  *	blocks - must be greater than 0
239  * 	flags  - malloc flags
240  *
241  *	The smallest blist consists of a single leaf node capable of
242  *	managing BLIST_BMAP_RADIX blocks.
243  */
244 blist_t
245 blist_create(daddr_t blocks, int flags)
246 {
247 	blist_t bl;
248 	u_daddr_t nodes, radix;
249 
250 	KASSERT(blocks > 0, ("invalid block count"));
251 
252 	/*
253 	 * Calculate the radix and node count used for scanning.
254 	 */
255 	nodes = 1;
256 	radix = BLIST_BMAP_RADIX;
257 	while (radix <= blocks) {
258 		nodes += 1 + (blocks - 1) / radix;
259 		radix *= BLIST_META_RADIX;
260 	}
261 
262 	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
263 	    M_ZERO);
264 	if (bl == NULL)
265 		return (NULL);
266 
267 	bl->bl_blocks = blocks;
268 	bl->bl_radix = radix;
269 
270 #if defined(BLIST_DEBUG)
271 	printf(
272 		"BLIST representing %lld blocks (%lld MB of swap)"
273 		", requiring %lldK of ram\n",
274 		(long long)bl->bl_blocks,
275 		(long long)bl->bl_blocks * 4 / 1024,
276 		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
277 	);
278 	printf("BLIST raw radix tree contains %lld records\n",
279 	    (long long)nodes);
280 #endif
281 
282 	return (bl);
283 }
284 
285 void
286 blist_destroy(blist_t bl)
287 {
288 
289 	free(bl, M_SWAP);
290 }
291 
292 /*
293  * blist_alloc() -   reserve space in the block bitmap.  Return the base
294  *		     of a contiguous region or SWAPBLK_NONE if space could
295  *		     not be allocated.
296  */
297 daddr_t
298 blist_alloc(blist_t bl, int *count, int maxcount)
299 {
300 	daddr_t blk, cursor;
301 
302 	KASSERT(*count <= maxcount,
303 	    ("invalid parameters %d > %d", *count, maxcount));
304 	KASSERT(*count <= BLIST_MAX_ALLOC,
305 	    ("minimum allocation too large: %d", *count));
306 
307 	/*
308 	 * This loop iterates at most twice.  An allocation failure in the
309 	 * first iteration leads to a second iteration only if the cursor was
310 	 * non-zero.  When the cursor is zero, an allocation failure will
311 	 * stop further iterations.
312 	 */
313 	for (cursor = bl->bl_cursor;; cursor = 0) {
314 		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
315 		    bl->bl_radix);
316 		if (blk != SWAPBLK_NONE) {
317 			bl->bl_avail -= *count;
318 			bl->bl_cursor = blk + *count;
319 			if (bl->bl_cursor == bl->bl_blocks)
320 				bl->bl_cursor = 0;
321 			return (blk);
322 		}
323 		if (cursor == 0)
324 			return (SWAPBLK_NONE);
325 	}
326 }
327 
328 /*
329  * blist_avail() -	return the number of free blocks.
330  */
331 daddr_t
332 blist_avail(blist_t bl)
333 {
334 
335 	return (bl->bl_avail);
336 }
337 
338 /*
339  * blist_free() -	free up space in the block bitmap.  Return the base
340  *		     	of a contiguous region.
341  */
342 void
343 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
344 {
345 
346 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
347 	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
348 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
349 	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
350 	bl->bl_avail += count;
351 }
352 
353 /*
354  * blist_fill() -	mark a region in the block bitmap as off-limits
355  *			to the allocator (i.e. allocate it), ignoring any
356  *			existing allocations.  Return the number of blocks
357  *			actually filled that were free before the call.
358  */
359 daddr_t
360 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
361 {
362 	daddr_t filled;
363 
364 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
365 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
366 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
367 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
368 	bl->bl_avail -= filled;
369 	return (filled);
370 }
371 
372 /*
373  * blist_resize() -	resize an existing radix tree to handle the
374  *			specified number of blocks.  This will reallocate
375  *			the tree and transfer the previous bitmap to the new
376  *			one.  When extending the tree you can specify whether
377  *			the new blocks are to left allocated or freed.
378  */
379 void
380 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
381 {
382     blist_t newbl = blist_create(count, flags);
383     blist_t save = *pbl;
384 
385     *pbl = newbl;
386     if (count > save->bl_blocks)
387 	    count = save->bl_blocks;
388     blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
389 
390     /*
391      * If resizing upwards, should we free the new space or not?
392      */
393     if (freenew && count < newbl->bl_blocks) {
394 	    blist_free(newbl, count, newbl->bl_blocks - count);
395     }
396     blist_destroy(save);
397 }
398 
399 #ifdef BLIST_DEBUG
400 
401 /*
402  * blist_print()    - dump radix tree
403  */
404 void
405 blist_print(blist_t bl)
406 {
407 	printf("BLIST avail = %jd, cursor = %08jx {\n",
408 	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
409 
410 	if (bl->bl_root->bm_bitmap != 0)
411 		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
412 	printf("}\n");
413 }
414 
415 #endif
416 
417 static const u_daddr_t fib[] = {
418 	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
419 	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
420 	514229, 832040, 1346269, 2178309, 3524578,
421 };
422 
423 /*
424  * Use 'gap' to describe a maximal range of unallocated blocks/bits.
425  */
426 struct gap_stats {
427 	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
428 	daddr_t	num;		/* number of gaps observed */
429 	daddr_t	max;		/* largest gap size */
430 	daddr_t	avg;		/* average gap size */
431 	daddr_t	err;		/* sum - num * avg */
432 	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
433 	int	max_bucket;	/* last histo elt with nonzero val */
434 };
435 
436 /*
437  * gap_stats_counting()    - is the state 'counting 1 bits'?
438  *                           or 'skipping 0 bits'?
439  */
440 static inline bool
441 gap_stats_counting(const struct gap_stats *stats)
442 {
443 
444 	return (stats->start != SWAPBLK_NONE);
445 }
446 
447 /*
448  * init_gap_stats()    - initialize stats on gap sizes
449  */
450 static inline void
451 init_gap_stats(struct gap_stats *stats)
452 {
453 
454 	bzero(stats, sizeof(*stats));
455 	stats->start = SWAPBLK_NONE;
456 }
457 
458 /*
459  * update_gap_stats()    - update stats on gap sizes
460  */
461 static void
462 update_gap_stats(struct gap_stats *stats, daddr_t posn)
463 {
464 	daddr_t size;
465 	int hi, lo, mid;
466 
467 	if (!gap_stats_counting(stats)) {
468 		stats->start = posn;
469 		return;
470 	}
471 	size = posn - stats->start;
472 	stats->start = SWAPBLK_NONE;
473 	if (size > stats->max)
474 		stats->max = size;
475 
476 	/*
477 	 * Find the fibonacci range that contains size,
478 	 * expecting to find it in an early range.
479 	 */
480 	lo = 0;
481 	hi = 1;
482 	while (hi < nitems(fib) && fib[hi] <= size) {
483 		lo = hi;
484 		hi *= 2;
485 	}
486 	if (hi >= nitems(fib))
487 		hi = nitems(fib);
488 	while (lo + 1 != hi) {
489 		mid = (lo + hi) >> 1;
490 		if (fib[mid] <= size)
491 			lo = mid;
492 		else
493 			hi = mid;
494 	}
495 	stats->histo[lo]++;
496 	if (lo > stats->max_bucket)
497 		stats->max_bucket = lo;
498 	stats->err += size - stats->avg;
499 	stats->num++;
500 	stats->avg += stats->err / stats->num;
501 	stats->err %= stats->num;
502 }
503 
504 /*
505  * dump_gap_stats()    - print stats on gap sizes
506  */
507 static inline void
508 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
509 {
510 	int i;
511 
512 	sbuf_printf(s, "number of maximal free ranges: %jd\n",
513 	    (intmax_t)stats->num);
514 	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
515 	sbuf_printf(s, "average maximal free range size: %jd\n",
516 	    (intmax_t)stats->avg);
517 	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
518 	sbuf_printf(s, "               count  |  size range\n");
519 	sbuf_printf(s, "               -----  |  ----------\n");
520 	for (i = 0; i < stats->max_bucket; i++) {
521 		if (stats->histo[i] != 0) {
522 			sbuf_printf(s, "%20jd  |  ",
523 			    (intmax_t)stats->histo[i]);
524 			if (fib[i] != fib[i + 1] - 1)
525 				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
526 				    (intmax_t)fib[i + 1] - 1);
527 			else
528 				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
529 		}
530 	}
531 	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
532 	if (stats->histo[i] > 1)
533 		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
534 		    (intmax_t)stats->max);
535 	else
536 		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
537 }
538 
539 /*
540  * blist_stats()    - dump radix tree stats
541  */
542 void
543 blist_stats(blist_t bl, struct sbuf *s)
544 {
545 	struct gap_stats gstats;
546 	struct gap_stats *stats = &gstats;
547 	daddr_t i, nodes, radix;
548 	u_daddr_t diff, mask;
549 	int digit;
550 
551 	init_gap_stats(stats);
552 	nodes = 0;
553 	i = bl->bl_radix;
554 	while (i < bl->bl_radix + bl->bl_blocks) {
555 		/*
556 		 * Find max size subtree starting at i.
557 		 */
558 		radix = BLIST_BMAP_RADIX;
559 		while (((i / radix) & BLIST_META_MASK) == 0)
560 			radix *= BLIST_META_RADIX;
561 
562 		/*
563 		 * Check for skippable subtrees starting at i.
564 		 */
565 		while (radix > BLIST_BMAP_RADIX) {
566 			if (bl->bl_root[nodes].bm_bitmap == 0) {
567 				if (gap_stats_counting(stats))
568 					update_gap_stats(stats, i);
569 				break;
570 			}
571 
572 			/*
573 			 * Skip subtree root.
574 			 */
575 			nodes++;
576 			radix /= BLIST_META_RADIX;
577 		}
578 		if (radix == BLIST_BMAP_RADIX) {
579 			/*
580 			 * Scan leaf.
581 			 */
582 			mask = bl->bl_root[nodes].bm_bitmap;
583 			diff = mask ^ (mask << 1);
584 			if (gap_stats_counting(stats))
585 				diff ^= 1;
586 			while (diff != 0) {
587 				digit = bitpos(diff);
588 				update_gap_stats(stats, i + digit);
589 				diff ^= bitrange(digit, 1);
590 			}
591 		}
592 		nodes += radix_to_skip(radix);
593 		i += radix;
594 	}
595 	update_gap_stats(stats, i);
596 	dump_gap_stats(stats, s);
597 }
598 
599 /************************************************************************
600  *			  ALLOCATION SUPPORT FUNCTIONS			*
601  ************************************************************************
602  *
603  *	These support functions do all the actual work.  They may seem
604  *	rather longish, but that's because I've commented them up.  The
605  *	actual code is straight forward.
606  *
607  */
608 
609 /*
610  * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
611  *
612  *	'scan' is a leaf node, and its first block is at address 'start'.  The
613  *	next leaf node could be adjacent, or several nodes away if the least
614  *	common ancestor of 'scan' and its neighbor is several levels up.  Use
615  *	addresses to determine how many meta-nodes lie between the leaves.  If
616  *	sequence of leaves starting with the next one has enough initial bits
617  *	set, clear them and clear the bits in the meta nodes on the path up to
618  *	the least common ancestor to mark any subtrees made completely empty.
619  */
620 static int
621 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
622 {
623 	u_daddr_t radix;
624 	daddr_t blk;
625 	int avail, digit;
626 
627 	start += BLIST_BMAP_RADIX;
628 	for (blk = start; blk - start < maxcount; blk += BLIST_BMAP_RADIX) {
629 		/* Skip meta-nodes, as long as they promise more free blocks. */
630 		radix = BLIST_BMAP_RADIX;
631 		while (((++scan)->bm_bitmap & 1) == 1 &&
632 		    ((blk / radix) & BLIST_META_MASK) == 0)
633 			radix *= BLIST_META_RADIX;
634 		if (~scan->bm_bitmap != 0) {
635 			/*
636 			 * Either there is no next leaf with any free blocks,
637 			 * or we've reached the next leaf and found that some
638 			 * of its blocks are not free.  In the first case,
639 			 * bitpos() returns zero here.
640 			 */
641 			avail = blk - start + bitpos(~scan->bm_bitmap);
642 			if (avail < count || avail == 0) {
643 				/*
644 				 * There isn't a next leaf with enough free
645 				 * blocks at its beginning to bother
646 				 * allocating.
647 				 */
648 				return (avail);
649 			}
650 			maxcount = imin(avail, maxcount);
651 			if (maxcount % BLIST_BMAP_RADIX == 0) {
652 				/*
653 				 * There was no next leaf.  Back scan up to
654 				 * last leaf.
655 				 */
656 				--scan;
657 				while (radix != BLIST_BMAP_RADIX) {
658 					radix /= BLIST_META_RADIX;
659 					--scan;
660 				}
661 				blk -= BLIST_BMAP_RADIX;
662 			}
663 		}
664 	}
665 
666 	/*
667 	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
668 	 * BLIST_BMAP_RADIX bits to represent the allocation of those last
669 	 * blocks.
670 	 */
671 	if (maxcount % BLIST_BMAP_RADIX != 0)
672 		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_BMAP_RADIX);
673 	else
674 		scan->bm_bitmap = 0;
675 
676 	for (;;) {
677 		/* Back up over meta-nodes, clearing bits if necessary. */
678 		blk -= BLIST_BMAP_RADIX;
679 		radix = BLIST_BMAP_RADIX;
680 		while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0) {
681 			if ((scan--)->bm_bitmap == 0)
682 				scan->bm_bitmap ^= 1;
683 			radix *= BLIST_META_RADIX;
684 		}
685 		if ((scan--)->bm_bitmap == 0)
686 			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
687 			    (u_daddr_t)1 << digit;
688 
689 		if (blk == start)
690 			break;
691 		/* Clear all the bits of this leaf. */
692 		scan->bm_bitmap = 0;
693 	}
694 	return (maxcount);
695 }
696 
697 /*
698  * Given a bitmask, flip all the bits from the least-significant 1-bit to the
699  * most significant bit.  If the result is non-zero, then the least-significant
700  * 1-bit of the result is in the same position as the least-signification 0-bit
701  * in mask that is followed by a 1-bit.
702  */
703 static inline u_daddr_t
704 flip_hibits(u_daddr_t mask)
705 {
706 
707 	return (-mask & ~mask);
708 }
709 
710 /*
711  * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
712  *
713  *	This function is the core of the allocator.  Its execution time is
714  *	proportional to log(count), plus height of the tree if the allocation
715  *	crosses a leaf boundary.
716  */
717 static daddr_t
718 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
719 {
720 	u_daddr_t cursor_mask, mask;
721 	int count1, hi, lo, num_shifts, range1, range_ext;
722 
723 	range1 = 0;
724 	count1 = *count - 1;
725 	num_shifts = fls(count1);
726 	mask = scan->bm_bitmap;
727 	while (flip_hibits(mask) != 0 && num_shifts > 0) {
728 		/*
729 		 * If bit i is set in mask, then bits in [i, i+range1] are set
730 		 * in scan->bm_bitmap.  The value of range1 is equal to count1
731 		 * >> num_shifts.  Grow range1 and reduce num_shifts to 0,
732 		 * while preserving these invariants.  The updates to mask
733 		 * leave fewer bits set, but each bit that remains set
734 		 * represents a longer string of consecutive bits set in
735 		 * scan->bm_bitmap.  If more updates to mask cannot clear more
736 		 * bits, because mask is partitioned with all 0 bits preceding
737 		 * all 1 bits, the loop terminates immediately.
738 		 */
739 		num_shifts--;
740 		range_ext = range1 + ((count1 >> num_shifts) & 1);
741 		/*
742 		 * mask is a signed quantity for the shift because when it is
743 		 * shifted right, the sign bit should copied; when the last
744 		 * block of the leaf is free, pretend, for a while, that all the
745 		 * blocks that follow it are also free.
746 		 */
747 		mask &= (daddr_t)mask >> range_ext;
748 		range1 += range_ext;
749 	}
750 	if (mask == 0) {
751 		/*
752 		 * Update bighint.  There is no allocation bigger than range1
753 		 * starting in this leaf.
754 		 */
755 		scan->bm_bighint = range1;
756 		return (SWAPBLK_NONE);
757 	}
758 
759 	/* Discard any candidates that appear before blk. */
760 	if ((blk & BLIST_BMAP_MASK) != 0) {
761 		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
762 		if (cursor_mask != 0) {
763 			mask ^= cursor_mask;
764 			if (mask == 0)
765 				return (SWAPBLK_NONE);
766 
767 			/*
768 			 * Bighint change for last block allocation cannot
769 			 * assume that any other blocks are allocated, so the
770 			 * bighint cannot be reduced much.
771 			 */
772 			range1 = BLIST_MAX_ALLOC - 1;
773 		}
774 		blk &= ~BLIST_BMAP_MASK;
775 	}
776 
777 	/*
778 	 * The least significant set bit in mask marks the start of the first
779 	 * available range of sufficient size.  Find its position.
780 	 */
781 	lo = bitpos(mask);
782 
783 	/*
784 	 * Find how much space is available starting at that position.
785 	 */
786 	if (flip_hibits(mask) != 0) {
787 		/* Count the 1 bits starting at position lo. */
788 		hi = bitpos(flip_hibits(mask)) + count1;
789 		if (maxcount < hi - lo)
790 			hi = lo + maxcount;
791 		*count = hi - lo;
792 		mask = bitrange(lo, *count);
793 	} else if (maxcount <= BLIST_BMAP_RADIX - lo) {
794 		/* All the blocks we can use are available here. */
795 		hi = lo + maxcount;
796 		*count = maxcount;
797 		mask = bitrange(lo, *count);
798 	} else {
799 		/* Check next leaf for some of the blocks we want or need. */
800 		count1 = *count - (BLIST_BMAP_RADIX - lo);
801 		maxcount -= BLIST_BMAP_RADIX - lo;
802 		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
803 		if (hi < count1)
804 			/*
805 			 * The next leaf cannot supply enough blocks to reach
806 			 * the minimum required allocation.  The hint cannot be
807 			 * updated, because the same allocation request could
808 			 * be satisfied later, by this leaf, if the state of
809 			 * the next leaf changes, and without any changes to
810 			 * this leaf.
811 			 */
812 			return (SWAPBLK_NONE);
813 		*count = BLIST_BMAP_RADIX - lo + hi;
814 		hi = BLIST_BMAP_RADIX;
815 	}
816 
817 	if (hi == BLIST_BMAP_RADIX) {
818 		/*
819 		 * Update bighint.  There is no allocation bigger than range1
820 		 * available in this leaf after this allocation completes.
821 		 */
822 		scan->bm_bighint = range1;
823 	}
824 	/* Clear the allocated bits from this leaf. */
825 	scan->bm_bitmap &= ~mask;
826 	return (blk + lo);
827 }
828 
829 /*
830  * blist_meta_alloc() -	allocate at a meta in the radix tree.
831  *
832  *	Attempt to allocate at a meta node.  If we can't, we update
833  *	bighint and return a failure.  Updating bighint optimize future
834  *	calls that hit this node.  We have to check for our collapse cases
835  *	and we have a few optimizations strewn in as well.
836  */
837 static daddr_t
838 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
839     int maxcount, u_daddr_t radix)
840 {
841 	daddr_t blk, i, r, skip;
842 	u_daddr_t mask;
843 	bool scan_from_start;
844 	int digit;
845 
846 	if (radix == BLIST_BMAP_RADIX)
847 		return (blst_leaf_alloc(scan, cursor, count, maxcount));
848 	blk = cursor & -radix;
849 	scan_from_start = (cursor == blk);
850 	radix /= BLIST_META_RADIX;
851 	skip = radix_to_skip(radix);
852 	mask = scan->bm_bitmap;
853 
854 	/* Discard any candidates that appear before cursor. */
855 	digit = (cursor / radix) & BLIST_META_MASK;
856 	mask &= (u_daddr_t)-1 << digit;
857 	if (mask == 0)
858 		return (SWAPBLK_NONE);
859 
860 	/*
861 	 * If the first try is for a block that includes the cursor, pre-undo
862 	 * the digit * radix offset in the first call; otherwise, ignore the
863 	 * cursor entirely.
864 	 */
865 	if (((mask >> digit) & 1) == 1)
866 		cursor -= digit * radix;
867 	else
868 		cursor = blk;
869 
870 	/*
871 	 * Examine the nonempty subtree associated with each bit set in mask.
872 	 */
873 	do {
874 		digit = bitpos(mask);
875 		i = 1 + digit * skip;
876 		if (*count <= scan[i].bm_bighint) {
877 			/*
878 			 * The allocation might fit beginning in the i'th subtree.
879 			 */
880 			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
881 			    count, maxcount, radix);
882 			if (r != SWAPBLK_NONE) {
883 				if (scan[i].bm_bitmap == 0)
884 					scan->bm_bitmap ^= bitrange(digit, 1);
885 				return (r);
886 			}
887 		}
888 		cursor = blk;
889 	} while ((mask ^= bitrange(digit, 1)) != 0);
890 
891 	/*
892 	 * We couldn't allocate count in this subtree.  If the whole tree was
893 	 * scanned, and the last tree node is allocated, update bighint.
894 	 */
895 	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
896 	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
897 		scan->bm_bighint = *count - 1;
898 
899 	return (SWAPBLK_NONE);
900 }
901 
902 /*
903  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
904  *
905  */
906 static void
907 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
908 {
909 	u_daddr_t mask;
910 
911 	/*
912 	 * free some data in this bitmap
913 	 * mask=0000111111111110000
914 	 *          \_________/\__/
915 	 *		count   n
916 	 */
917 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
918 	KASSERT((scan->bm_bitmap & mask) == 0,
919 	    ("freeing free block: %jx, size %d, mask %jx",
920 	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
921 	scan->bm_bitmap |= mask;
922 }
923 
924 /*
925  * BLST_META_FREE() - free allocated blocks from radix tree meta info
926  *
927  *	This support routine frees a range of blocks from the bitmap.
928  *	The range must be entirely enclosed by this radix node.  If a
929  *	meta node, we break the range down recursively to free blocks
930  *	in subnodes (which means that this code can free an arbitrary
931  *	range whereas the allocation code cannot allocate an arbitrary
932  *	range).
933  */
934 static void
935 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
936 {
937 	daddr_t blk, endBlk, i, skip;
938 	int digit, endDigit;
939 
940 	/*
941 	 * We could probably do a better job here.  We are required to make
942 	 * bighint at least as large as the biggest allocable block of data.
943 	 * If we just shoehorn it, a little extra overhead will be incurred
944 	 * on the next allocation (but only that one typically).
945 	 */
946 	scan->bm_bighint = BLIST_MAX_ALLOC;
947 
948 	if (radix == BLIST_BMAP_RADIX)
949 		return (blst_leaf_free(scan, freeBlk, count));
950 
951 	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
952 	radix /= BLIST_META_RADIX;
953 	skip = radix_to_skip(radix);
954 	blk = freeBlk & -radix;
955 	digit = (blk / radix) & BLIST_META_MASK;
956 	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
957 	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
958 	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
959 		blk += radix;
960 		count = ummin(blk, endBlk) - freeBlk;
961 		blst_meta_free(&scan[i], freeBlk, count, radix);
962 		freeBlk = blk;
963 	}
964 }
965 
966 /*
967  * BLST_COPY() - copy one radix tree to another
968  *
969  *	Locates free space in the source tree and frees it in the destination
970  *	tree.  The space may not already be free in the destination.
971  */
972 static void
973 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
974     daddr_t count)
975 {
976 	daddr_t endBlk, i, skip;
977 
978 	/*
979 	 * Leaf node
980 	 */
981 
982 	if (radix == BLIST_BMAP_RADIX) {
983 		u_daddr_t v = scan->bm_bitmap;
984 
985 		if (v == (u_daddr_t)-1) {
986 			blist_free(dest, blk, count);
987 		} else if (v != 0) {
988 			int i;
989 
990 			for (i = 0; i < count; ++i) {
991 				if (v & ((u_daddr_t)1 << i))
992 					blist_free(dest, blk + i, 1);
993 			}
994 		}
995 		return;
996 	}
997 
998 	/*
999 	 * Meta node
1000 	 */
1001 
1002 	if (scan->bm_bitmap == 0) {
1003 		/*
1004 		 * Source all allocated, leave dest allocated
1005 		 */
1006 		return;
1007 	}
1008 
1009 	endBlk = blk + count;
1010 	radix /= BLIST_META_RADIX;
1011 	skip = radix_to_skip(radix);
1012 	for (i = 1; blk < endBlk; i += skip) {
1013 		blk += radix;
1014 		count = radix;
1015 		if (blk >= endBlk)
1016 			count -= blk - endBlk;
1017 		blst_copy(&scan[i], blk - radix, radix, dest, count);
1018 	}
1019 }
1020 
1021 /*
1022  * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
1023  *
1024  *	This routine allocates all blocks in the specified range
1025  *	regardless of any existing allocations in that range.  Returns
1026  *	the number of blocks allocated by the call.
1027  */
1028 static daddr_t
1029 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
1030 {
1031 	daddr_t nblks;
1032 	u_daddr_t mask;
1033 
1034 	mask = bitrange(blk & BLIST_BMAP_MASK, count);
1035 
1036 	/* Count the number of blocks that we are allocating. */
1037 	nblks = bitcount64(scan->bm_bitmap & mask);
1038 
1039 	scan->bm_bitmap &= ~mask;
1040 	return (nblks);
1041 }
1042 
1043 /*
1044  * BLIST_META_FILL() -	allocate specific blocks at a meta node
1045  *
1046  *	This routine allocates the specified range of blocks,
1047  *	regardless of any existing allocations in the range.  The
1048  *	range must be within the extent of this node.  Returns the
1049  *	number of blocks allocated by the call.
1050  */
1051 static daddr_t
1052 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1053 {
1054 	daddr_t blk, endBlk, i, nblks, skip;
1055 	int digit;
1056 
1057 	if (radix == BLIST_BMAP_RADIX)
1058 		return (blst_leaf_fill(scan, allocBlk, count));
1059 
1060 	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
1061 	radix /= BLIST_META_RADIX;
1062 	skip = radix_to_skip(radix);
1063 	blk = allocBlk & -radix;
1064 	nblks = 0;
1065 	while (blk < endBlk) {
1066 		digit = (blk / radix) & BLIST_META_MASK;
1067 		i = 1 + digit * skip;
1068 		blk += radix;
1069 		count = ummin(blk, endBlk) - allocBlk;
1070 		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
1071 		if (scan[i].bm_bitmap == 0)
1072 			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1073 		allocBlk = blk;
1074 	}
1075 	return (nblks);
1076 }
1077 
1078 #ifdef BLIST_DEBUG
1079 
1080 static void
1081 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1082 {
1083 	daddr_t skip;
1084 	u_daddr_t mask;
1085 	int digit;
1086 
1087 	if (radix == BLIST_BMAP_RADIX) {
1088 		printf(
1089 		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1090 		    tab, tab, "",
1091 		    (long long)blk, (long long)radix,
1092 		    1 + (BLIST_BMAP_RADIX - 1) / 4,
1093 		    (long long)scan->bm_bitmap,
1094 		    (long long)scan->bm_bighint
1095 		);
1096 		return;
1097 	}
1098 
1099 	printf(
1100 	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1101 	    tab, tab, "",
1102 	    (long long)blk, (long long)radix,
1103 	    (long long)radix,
1104 	    1 + (BLIST_META_RADIX - 1) / 4,
1105 	    (long long)scan->bm_bitmap,
1106 	    (long long)scan->bm_bighint
1107 	);
1108 
1109 	radix /= BLIST_META_RADIX;
1110 	skip = radix_to_skip(radix);
1111 	tab += 4;
1112 
1113 	mask = scan->bm_bitmap;
1114 	/* Examine the nonempty subtree associated with each bit set in mask */
1115 	do {
1116 		digit = bitpos(mask);
1117 		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1118 		    radix, tab);
1119 	} while ((mask ^= bitrange(digit, 1)) != 0);
1120 	tab -= 4;
1121 
1122 	printf(
1123 	    "%*.*s}\n",
1124 	    tab, tab, ""
1125 	);
1126 }
1127 
1128 #endif
1129 
1130 #ifdef BLIST_DEBUG
1131 
1132 int
1133 main(int ac, char **av)
1134 {
1135 	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
1136 	int i;
1137 	blist_t bl;
1138 	struct sbuf *s;
1139 
1140 	for (i = 1; i < ac; ++i) {
1141 		const char *ptr = av[i];
1142 		if (*ptr != '-') {
1143 			size = strtol(ptr, NULL, 0);
1144 			continue;
1145 		}
1146 		ptr += 2;
1147 		fprintf(stderr, "Bad option: %s\n", ptr - 2);
1148 		exit(1);
1149 	}
1150 	bl = blist_create(size, M_WAITOK);
1151 	blist_free(bl, 0, size);
1152 
1153 	for (;;) {
1154 		char buf[1024];
1155 		long long da = 0;
1156 		int count = 0, maxcount = 0;
1157 
1158 		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1159 		    (long long)size, (long long)bl->bl_radix);
1160 		fflush(stdout);
1161 		if (fgets(buf, sizeof(buf), stdin) == NULL)
1162 			break;
1163 		switch(buf[0]) {
1164 		case 'r':
1165 			if (sscanf(buf + 1, "%d", &count) == 1) {
1166 				blist_resize(&bl, count, 1, M_WAITOK);
1167 			} else {
1168 				printf("?\n");
1169 			}
1170 		case 'p':
1171 			blist_print(bl);
1172 			break;
1173 		case 's':
1174 			s = sbuf_new_auto();
1175 			blist_stats(bl, s);
1176 			sbuf_finish(s);
1177 			printf("%s", sbuf_data(s));
1178 			sbuf_delete(s);
1179 			break;
1180 		case 'a':
1181 			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1182 				daddr_t blk = blist_alloc(bl, &count, maxcount);
1183 				printf("    R=%08llx, c=%08d\n",
1184 				    (long long)blk, count);
1185 			} else {
1186 				printf("?\n");
1187 			}
1188 			break;
1189 		case 'f':
1190 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1191 				blist_free(bl, da, count);
1192 			} else {
1193 				printf("?\n");
1194 			}
1195 			break;
1196 		case 'l':
1197 			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1198 				printf("    n=%jd\n",
1199 				    (intmax_t)blist_fill(bl, da, count));
1200 			} else {
1201 				printf("?\n");
1202 			}
1203 			break;
1204 		case '?':
1205 		case 'h':
1206 			puts(
1207 			    "p          -print\n"
1208 			    "s          -stats\n"
1209 			    "a %d %d    -allocate\n"
1210 			    "f %x %d    -free\n"
1211 			    "l %x %d    -fill\n"
1212 			    "r %d       -resize\n"
1213 			    "h/?        -help\n"
1214 			    "q          -quit"
1215 			);
1216 			break;
1217 		case 'q':
1218 			break;
1219 		default:
1220 			printf("?\n");
1221 			break;
1222 		}
1223 		if (buf[0] == 'q')
1224 			break;
1225 	}
1226 	return (0);
1227 }
1228 
1229 #endif
1230