/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The bitmap field in each node directs the search for available blocks.
 *	For a leaf node, a bit is set if the corresponding block is free.  For a
 *	meta node, a bit is set if the corresponding subtree contains a free
 *	block somewhere within it.  The search at a meta node considers only
 *	children of that node that represent a range that includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 *	structure but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_META_RADIX lower-level nodes of some meta nodes may not be
 *	allocated.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
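
/*
 * Layout illustration (added commentary, not part of the original source):
 * with a hypothetical BLIST_META_RADIX of 4, a two-level tree occupies the
 * node array as
 *
 *	index:	0     1      2..5         6      7..10        ...
 *	node:	root  meta0  leaves 0-3   meta1  leaves 4-7   ...
 *
 * so sibling meta nodes sit a fixed 'skip' apart (here 5 = 1 + 4), and
 * child d of a node at index i is found at i + 1 + d * skip(child radix).
 */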

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
#define ummin(a,b)	((a) < (b) ? (a) : (b))
#define imin(a,b)	((a) < (b) ? (a) : (b))
#define KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
    int *count, int maxcount);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
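
/*
 * Worked example (added commentary, not part of the original source): with
 * hypothetical constants BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16, a
 * subtree of radix 16 * 64 = 1024 has 16 leaves under one meta node, hence
 * 17 nodes in all.  The formula agrees, up to the discarded remainder:
 *
 *	radix_to_skip(1024) = 1024 / ((64 / 16) * 15) = 1024 / 60 = 17
 */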

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}
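
/*
 * Example (added commentary, not part of the original source): for a 64-bit
 * u_daddr_t, bitrange(2, 3) computes
 *
 *	(~0 << 2) & (~0 >> (64 - 5)) = ...11111100 & 00...011111 = 0x1c
 *
 * i.e., bits 2, 3 and 4 set.  Note that n + count must not exceed
 * BLIST_BMAP_RADIX, or the right shift is undefined.
 */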

/*
 * Find the first bit set in a u_daddr_t.
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	lo = 0;
	hi = BLIST_BMAP_RADIX;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if (mask & bitrange(0, mid))
			hi = mid;
		else
			lo = mid;
	}
	return (lo);
}
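
/*
 * Illustration (added commentary, not part of the original source): the
 * binary search maintains the invariant that some bit below position 'hi'
 * is set while no bit below position 'lo' is set.  For mask = 0x28 (bits 3
 * and 5 set) and a 64-bit u_daddr_t, the (lo, hi) pairs visited are
 * (0, 64), (0, 32), (0, 16), (0, 8), (0, 4), (2, 4), (3, 4), returning 3,
 * the position of the lowest set bit.
 */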

static inline int
bitpos(u_daddr_t mask)
{

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
	case sizeof(int):
		return (ffs(mask) - 1);
#endif
	default:
		return (generic_bitpos(mask));
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
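
/*
 * Worked example (added commentary, not part of the original source): with
 * hypothetical BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16, a request
 * for blocks = 1000 runs one loop iteration:
 *
 *	radix = 64:   nodes = 1 + (1 + 999 / 64) = 17
 *	radix = 1024: 1024 > 1000, so the loop stops
 *
 * leaving a root meta node plus 16 leaves, and bl_radix = 1024.
 */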

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, int *count, int maxcount)
{
	daddr_t blk, cursor;

	KASSERT(*count <= maxcount,
	    ("invalid parameters %d > %d", *count, maxcount));
	KASSERT(*count <= BLIST_MAX_ALLOC,
	    ("minimum allocation too large: %d", *count));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	for (cursor = bl->bl_cursor;; cursor = 0) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= *count;
			bl->bl_cursor = blk + *count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		if (cursor == 0)
			return (SWAPBLK_NONE);
	}
}
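
/*
 * Usage sketch (added commentary, not part of the original source): a
 * caller asks for at least *count and at most maxcount contiguous blocks,
 * and learns the actual size through *count:
 *
 *	int count = 1;
 *	daddr_t blk = blist_alloc(bl, &count, 4);
 *	if (blk != SWAPBLK_NONE) {
 *		... 'count' (between 1 and 4) blocks starting at 'blk'
 *		    are now allocated ...
 *	}
 */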

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  The range must
 *			lie entirely within the bitmap.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'
 *                           or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
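
/*
 * Note (added commentary, not part of the original source): the last four
 * statements maintain a running integer average with the invariant
 *
 *	sum of all gap sizes = num * avg + err,	where |err| < num
 *
 * For example, after sizes 5 and 2: the first update leaves num = 1,
 * avg = 5, err = 0; the second leaves num = 2, avg = 4, err = -1, and
 * indeed 2 * 4 + (-1) = 7 = 5 + 2.
 */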

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}
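
/*
 * Illustration (added commentary, not part of the original source): in the
 * leaf scan above, diff = mask ^ (mask << 1) marks every position whose bit
 * differs from the bit below it, i.e., every free/allocated boundary.  For
 * an 8-bit mask 00110010:
 *
 *	mask       = 00110010
 *	mask << 1  = 01100100
 *	diff       = 01010110
 *
 * so gaps start at positions 1 and 4 and end at positions 2 and 6 (with
 * bit 0 pre-toggled when a gap is already open from the previous leaf).
 */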

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
 *
 *	'scan' is a leaf node, and its first block is at address 'start'.  The
 *	next leaf node could be adjacent, or several nodes away if the least
 *	common ancestor of 'scan' and its neighbor is several levels up.  Use
 *	addresses to determine how many meta-nodes lie between the leaves.  If
 *	the sequence of leaves starting with the next one has enough initial
 *	bits set, clear them and clear the bits in the meta nodes on the path
 *	up to the least common ancestor to mark any subtrees made completely
 *	empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
{
	u_daddr_t radix;
	daddr_t blk;
	int avail, digit;

	start += BLIST_BMAP_RADIX;
	for (blk = start; blk - start < maxcount; blk += BLIST_BMAP_RADIX) {
		/* Skip meta-nodes, as long as they promise more free blocks. */
		radix = BLIST_BMAP_RADIX;
		while (((++scan)->bm_bitmap & 1) == 1 &&
		    ((blk / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;
		if (~scan->bm_bitmap != 0) {
			/*
			 * Either there is no next leaf with any free blocks,
			 * or we've reached the next leaf and found that some
			 * of its blocks are not free.  In the first case,
			 * bitpos() returns zero here.
			 */
			avail = blk - start + bitpos(~scan->bm_bitmap);
			if (avail < count || avail == 0) {
				/*
				 * There isn't a next leaf with enough free
				 * blocks at its beginning to bother
				 * allocating.
				 */
				return (avail);
			}
			maxcount = imin(avail, maxcount);
			if (maxcount % BLIST_BMAP_RADIX == 0) {
				/*
				 * There was no next leaf.  Back scan up to
				 * last leaf.
				 */
				--scan;
				while (radix != BLIST_BMAP_RADIX) {
					radix /= BLIST_META_RADIX;
					--scan;
				}
				blk -= BLIST_BMAP_RADIX;
			}
		}
	}

	/*
	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
	 * BLIST_BMAP_RADIX bits to represent the allocation of those last
	 * blocks.
	 */
	if (maxcount % BLIST_BMAP_RADIX != 0)
		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_BMAP_RADIX);
	else
		scan->bm_bitmap = 0;

	for (;;) {
		/* Back up over meta-nodes, clearing bits if necessary. */
		blk -= BLIST_BMAP_RADIX;
		radix = BLIST_BMAP_RADIX;
		while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0) {
			if ((scan--)->bm_bitmap == 0)
				scan->bm_bitmap ^= 1;
			radix *= BLIST_META_RADIX;
		}
		if ((scan--)->bm_bitmap == 0)
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;

		if (blk == start)
			break;
		/* Clear all the bits of this leaf. */
		scan->bm_bitmap = 0;
	}
	return (maxcount);
}

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
{
	u_daddr_t mask;
	int bighint, count1, hi, lo, num_shifts;

	count1 = *count - 1;
	num_shifts = fls(count1);
	mask = ~scan->bm_bitmap;
	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
		/*
		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
		 * 0, while preserving this invariant.  The updates to mask
		 * leave fewer bits 0, but each bit that remains 0 represents a
		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
		 * more updates to mask cannot set more bits, because mask is
		 * partitioned with all 1 bits following all 0 bits, the loop
		 * terminates immediately.
		 */
		num_shifts--;
		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
	}
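
	/*
	 * Walk-through (added commentary, not part of the original source):
	 * for *count = 5, count1 = 4 and num_shifts = fls(4) = 3, the loop
	 * ORs mask with copies of itself shifted right by 1, 1 and 2 bits.
	 * The shifts total count1, so a bit that is still 0 in mask marks a
	 * run of 5 consecutive 1-bits (free blocks) in scan->bm_bitmap.
	 */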
	bighint = count1 >> num_shifts;
	if (~mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than
		 * count1 >> num_shifts starting in this leaf.
		 */
		scan->bm_bighint = bighint;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_BMAP_MASK) != 0) {
		if ((~mask & bitrange(0, blk & BLIST_BMAP_MASK)) != 0) {
			/* Grow bighint in case all discarded bits are set. */
			bighint += blk & BLIST_BMAP_MASK;
			mask |= bitrange(0, blk & BLIST_BMAP_MASK);
			if (~mask == 0) {
				scan->bm_bighint = bighint;
				return (SWAPBLK_NONE);
			}
		}
		blk -= blk & BLIST_BMAP_MASK;
	}

	/*
	 * The least significant clear bit in mask marks the start of the
	 * first available range of sufficient size.  Find its position.
	 */
	lo = bitpos(~mask);

	/*
	 * Find how much space is available starting at that position.
	 */
	if ((mask & (mask + 1)) != 0) {
		/* Count the 1 bits starting at position lo. */
		hi = bitpos(mask & (mask + 1)) + count1;
		if (maxcount < hi - lo)
			hi = lo + maxcount;
		*count = hi - lo;
		mask = ~bitrange(lo, *count);
	} else if (maxcount <= BLIST_BMAP_RADIX - lo) {
		/* All the blocks we can use are available here. */
		hi = lo + maxcount;
		*count = maxcount;
		mask = ~bitrange(lo, *count);
		if (hi == BLIST_BMAP_RADIX)
			scan->bm_bighint = bighint;
	} else {
		/* Check next leaf for some of the blocks we want or need. */
		count1 = *count - (BLIST_BMAP_RADIX - lo);
		maxcount -= BLIST_BMAP_RADIX - lo;
		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
		if (hi < count1)
			/*
			 * The next leaf cannot supply enough blocks to reach
			 * the minimum required allocation.  The hint cannot be
			 * updated, because the same allocation request could
			 * be satisfied later, by this leaf, if the state of
			 * the next leaf changes, and without any changes to
			 * this leaf.
			 */
			return (SWAPBLK_NONE);
		*count = BLIST_BMAP_RADIX - lo + hi;
		scan->bm_bighint = bighint;
	}

	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= mask;
	return (blk + lo);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count, maxcount));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (*count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, maxcount, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = *count - 1;

	return (SWAPBLK_NONE);
}
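
/*
 * Indexing note (added commentary, not part of the original source): in the
 * loop above, the child for digit d lives at scan[1 + d * skip], because a
 * meta node is followed in the array by its first child's whole subtree
 * (skip nodes), then its second child's subtree, and so on.  With the
 * hypothetical skip = 17 from the radix_to_skip() example, the children of
 * scan sit at scan[1], scan[18], scan[35], ...
 */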

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}
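
/*
 * Worked example (added commentary, not part of the original source): with
 * hypothetical BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16, freeing
 * blocks [100, 200) in a subtree of radix 1024 gives child radix 64,
 * endBlk = 200, digit = 1 and endDigit = 4, so bits 1..3 are set in the
 * parent bitmap and the loop recurses on the children covering [100, 128),
 * [128, 192) and [192, 200).
 */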

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space must not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLIST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		int count = 0, maxcount = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%d", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
				daddr_t blk = blist_alloc(bl, &count, maxcount);
				printf("    R=%08llx, c=%08d\n",
				    (long long)blk, count);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d %d    -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help\n"
			    "q          -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif