/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The bitmap field in each node directs the search for available blocks.
 *	For a leaf node, a bit is set if the corresponding block is free.  For a
 *	meta node, a bit is set if the corresponding subtree contains a free
 *	block somewhere within it.  The search at a meta node considers only
 *	children of that node that represent a range that includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 *	structure but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_META_RADIX lower-level nodes of some meta nodes may not be
 *	allocated.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
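
/*
 * Layout example, a sketch assuming the usual constants (a 64-bit
 * u_daddr_t, so BLIST_BMAP_RADIX = 64, and BLIST_META_RADIX = 16): a
 * tree with radix 1024 occupies 17 array slots, the root meta node
 * followed by its 16 leaves:
 *
 *	[0] meta (blocks 0..1023)
 *	[1] leaf (blocks 0..63)
 *	[2] leaf (blocks 64..127)
 *	...
 *	[16] leaf (blocks 960..1023)
 *
 * One level up (radix 16384), each child subtree occupies 17 slots of
 * its own, so the root's children sit at indices 1, 18, 35, ...,
 * 1 + digit * 17; that stride is the 'skip' value computed below.
 */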

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
#define ummin(a,b)	((a) < (b) ? (a) : (b))
#define KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
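/*
 * For example, with BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16, a
 * radix-1024 subtree is 1 meta node plus 16 leaves, and
 * 1024 / ((64 / 16) * 15) = 17.  One level up, 16384 / 60 = 273 =
 * 1 + 16 + 256, again matching 1 + m + ... + m**h.
 */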
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}
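
/*
 * For example, with a 64-bit u_daddr_t, bitrange(4, 8) is
 * 0x0000000000000ff0, i.e., bits 4 through 11 set.
 */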
/*
 * Find the position of the first bit set in a u_daddr_t.
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	lo = 0;
	hi = BLIST_BMAP_RADIX;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if (mask & bitrange(0, mid))
			hi = mid;
		else
			lo = mid;
	}
	return (lo);
}
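
/*
 * For example, for mask 0x50 the binary search narrows [lo, hi) from
 * [0, 64) down to [4, 5) and returns 4, the position of the
 * least-significant set bit.
 */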
static inline int
bitpos(u_daddr_t mask)
{

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
	case sizeof(int):
		return (ffs(mask) - 1);
#endif
	default:
		return (generic_bitpos(mask));
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
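
/*
 * Sizing example: for blocks = 1000 the loop above runs once, since
 * only radix 64 is <= 1000, adding 1 + 999 / 64 = 16 nodes to the
 * initial 1; the result is a 17-node tree (one meta node and 16
 * leaves) with bl_radix = 1024.
 */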

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk, cursor;

	KASSERT(count <= BLIST_MAX_ALLOC,
	    ("allocation too large: %d", (int)count));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	cursor = bl->bl_cursor;
	for (;;) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= count;
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (cursor == 0)
			return (SWAPBLK_NONE);
		cursor = 0;
	}
}
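
/*
 * Usage sketch: a freshly created blist has every block allocated
 * (the zeroed bitmaps mean 'in use'), so a typical consumer frees the
 * whole range before the first allocation:
 *
 *	bl = blist_create(2048, M_WAITOK);
 *	blist_free(bl, 0, 2048);	(all 2048 blocks now free)
 *	blk = blist_alloc(bl, 16);	(returns 0, the lowest free run)
 */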

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  The range must
 *			lie entirely within the bitmap.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}
349 
350 /*
351  * blist_fill() -	mark a region in the block bitmap as off-limits
352  *			to the allocator (i.e. allocate it), ignoring any
353  *			existing allocations.  Return the number of blocks
354  *			actually filled that were free before the call.
355  */
356 daddr_t
357 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
358 {
359 	daddr_t filled;
360 
361 	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
362 	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
363 	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
364 	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
365 	bl->bl_avail -= filled;
366 	return (filled);
367 }

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits',
 *                           as opposed to 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
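	/*
	 * Update the running average incrementally, keeping it exact in
	 * integer arithmetic: the invariant num * avg + err == sum of
	 * all gap sizes holds, with 0 <= err < num.  For example, after
	 * gaps of 3 and 6: num = 2, avg = 4, err = 1.
	 */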
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
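			/*
			 * mask ^ (mask << 1) has a bit set at each
			 * boundary between free and allocated.  For the
			 * 8-bit pattern 00111000 the XOR yields
			 * 01001000: bit 3, where the gap opens, and
			 * bit 6, just past where it closes.
			 */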
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
 *
 *	'scan' is a leaf node, associated with a block containing 'blk'.
 *	The next leaf node could be adjacent, or several nodes away if the
 *	least common ancestor of 'scan' and its neighbor is several levels
 *	up.  Use 'blk' to determine how many meta-nodes lie between the
 *	leaves.  If the next leaf has enough initial bits set, clear them
 *	and clear the bits in the meta nodes on the path up to the least
 *	common ancestor to mark any subtrees made completely empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	blmeta_t *next;
	u_daddr_t radix;
	int digit;

	next = scan + 1;
	blk += BLIST_BMAP_RADIX;
	radix = BLIST_BMAP_RADIX;
	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
	    (next->bm_bitmap & 1) == 1) {
		next++;
		radix *= BLIST_META_RADIX;
	}
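	/*
	 * The next test exploits carry propagation: for count = 4, a
	 * next-leaf bitmap ending in 0111 becomes ...1000 after adding 1,
	 * so masking with 0b1111 leaves a nonzero value (too few free
	 * blocks); only a bitmap ending in 1111 passes.
	 */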
	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
		/*
		 * The next leaf doesn't have enough free blocks at the
		 * beginning to complete the spanning allocation.
		 */
		return (ENOMEM);
	}
	/* Clear the first 'count' bits in the next leaf to allocate. */
	next->bm_bitmap &= (u_daddr_t)-1 << count;

	/*
	 * Update bitmaps of next-ancestors, up to least common ancestor.
	 */
	while (next->bm_bitmap == 0) {
		if (--next == scan) {
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;
			break;
		}
		next->bm_bitmap ^= 1;
	}
	return (0);
}

/*
 * Given a bitmask, flip all the bits from the least-significant 1-bit to the
 * most significant bit.  If the result is non-zero, then the least-significant
 * 1-bit of the result is in the same position as the least-significant 0-bit
 * in mask that is followed by a 1-bit.
 */
static inline u_daddr_t
flip_hibits(u_daddr_t mask)
{

	return (-mask & ~mask);
}
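
/*
 * For example, flip_hibits(0b0110) = ...111000; the least-significant
 * set bit of the result, bit 3, is the lowest 0 in the mask that sits
 * just above a 1.
 */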

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t cursor_mask, mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->bm_bitmap;
	while (flip_hibits(mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->bm_bitmap.  The value of range1 is equal to count1
		 * >> num_shifts.  Grow range1 and reduce num_shifts to 0,
		 * while preserving these invariants.  The updates to mask
		 * leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->bm_bitmap.  If more updates to mask cannot clear more
		 * bits, because mask is partitioned with all 0 bits preceding
		 * all 1 bits, the loop terminates immediately.
		 */
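		/*
		 * For example, for count = 5 (count1 = 4, num_shifts = 3)
		 * the loop shifts mask by 1, 1 and 2, so a bit that
		 * survives all three rounds marks a run of 1 + 1 + 1 + 2
		 * = 5 consecutive set bits in scan->bm_bitmap.
		 */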
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all the
		 * blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_BMAP_MASK) != 0) {
		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
		if (cursor_mask != 0) {
			mask ^= cursor_mask;
			if (mask == 0)
				return (SWAPBLK_NONE);

			/*
			 * Bighint change for last block allocation cannot
			 * assume that any other blocks are allocated, so the
			 * bighint cannot be reduced much.
			 */
			range1 = BLIST_MAX_ALLOC - 1;
		}
		blk &= ~BLIST_BMAP_MASK;
	}

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a
		 * successful allocation depends on the next leaf providing
		 * some of the blocks.
		 */
		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
			/*
			 * The hint cannot be updated, because the same
			 * allocation request could be satisfied later, by this
			 * leaf, if the state of the next leaf changes, and
			 * without any changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
	}
	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= ~mask;
	return (blk + lo);
}

/*
 * BLST_META_ALLOC() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated blocks from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}
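
/*
 * Worked example, with BLIST_BMAP_RADIX = 64 at radix 1024: freeing
 * blocks 100..199 clips endBlk to 200, sets bits 1..3 of the meta
 * bitmap (digit = 1, endDigit = 4), and recurses into the children
 * for digits 1, 2 and 3 to free blocks 100..127, 128..191 and
 * 192..199 respectively.
 */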

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		fprintf(stderr, "Bad option: %s\n", ptr);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help\n"
			    "q          -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif
1177