1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 /*
30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
31 *
32 * This module implements a general bitmap allocator/deallocator. The
33 * allocator eats around 2 bits per 'block'. The module does not
34 * try to interpret the meaning of a 'block' other than to return
35 * SWAPBLK_NONE on an allocation failure.
36 *
37 * A radix tree controls access to pieces of the bitmap, and includes
38 * auxiliary information at each interior node about the availabilty of
39 * contiguous free blocks in the subtree rooted at that node. A radix
40 * constant defines the size of the bitmaps contained in a leaf node
41 * and the number of descendents of each of the meta (interior) nodes.
42 * Each subtree is associated with a range of blocks. The root of any
43 * subtree stores a hint field that defines an upper bound on the size
44 * of the largest allocation that can begin in the associated block
45 * range. A hint is an upper bound on a potential allocation, but not
46 * necessarily a tight upper bound.
47 *
48 * The bitmap field in each node directs the search for available blocks.
49 * For a leaf node, a bit is set if the corresponding block is free. For a
50 * meta node, a bit is set if the corresponding subtree contains a free
51 * block somewhere within it. The search at a meta node considers only
52 * children of that node that represent a range that includes a free block.
53 *
54 * The hinting greatly increases code efficiency for allocations while
55 * the general radix structure optimizes both allocations and frees. The
56 * radix tree should be able to operate well no matter how much
57 * fragmentation there is and no matter how large a bitmap is used.
58 *
59 * The blist code wires all necessary memory at creation time. Neither
60 * allocations nor frees require interaction with the memory subsystem.
61 * The non-blocking nature of allocations and frees is required by swap
62 * code (vm/swap_pager.c).
63 *
64 * LAYOUT: The radix tree is laid out recursively using a linear array.
65 * Each meta node is immediately followed (laid out sequentially in
66 * memory) by BLIST_RADIX lower-level nodes. This is a recursive
67 * structure but one that can be easily scanned through a very simple
68 * 'skip' calculation. The memory allocation is only large enough to
69 * cover the number of blocks requested at creation time. Nodes that
70 * represent blocks beyond that limit, nodes that would never be read
71 * or written, are not allocated, so that the last of the
72 * BLIST_RADIX lower-level nodes of a some nodes may not be allocated.
73 *
74 * NOTE: the allocator cannot currently allocate more than
75 * BLIST_RADIX blocks per call. It will panic with 'allocation too
76 * large' if you try. This is an area that could use improvement. The
77 * radix is large enough that this restriction does not effect the swap
78 * system, though. Currently only the allocation code is affected by
79 * this algorithmic unfeature. The freeing code can handle arbitrary
80 * ranges.
81 *
82 * This code can be compiled stand-alone for debugging.
83 */
84
85 #include <sys/cdefs.h>
86 #ifdef _KERNEL
87
88 #include <sys/param.h>
89 #include <sys/systm.h>
90 #include <sys/lock.h>
91 #include <sys/kernel.h>
92 #include <sys/blist.h>
93 #include <sys/malloc.h>
94 #include <sys/sbuf.h>
95 #include <sys/proc.h>
96 #include <sys/mutex.h>
97
98 #else
99
100 #ifndef BLIST_NO_DEBUG
101 #define BLIST_DEBUG
102 #endif
103
104 #include <sys/errno.h>
105 #include <sys/types.h>
106 #include <sys/malloc.h>
107 #include <sys/sbuf.h>
108 #include <assert.h>
109 #include <stdio.h>
110 #include <string.h>
111 #include <stddef.h>
112 #include <stdlib.h>
113 #include <stdarg.h>
114 #include <stdbool.h>
115
116 #define bitcount64(x) __bitcount64((uint64_t)(x))
117 #define malloc(a,b,c) calloc(a, 1)
118 #define free(a,b) free(a)
119 #define ummin(a,b) ((a) < (b) ? (a) : (b))
120 #define imin(a,b) ((a) < (b) ? (a) : (b))
121 #define KASSERT(a,b) assert(a)
122
123 #include <sys/blist.h>
124
125 #endif
126
127 /*
128 * static support functions
129 */
130 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
131 int *count, int maxcount);
132 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
133 int maxcount, u_daddr_t radix);
134 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
135 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
136 u_daddr_t radix);
137 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
138 blist_t dest, daddr_t count);
139 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
140 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
141 u_daddr_t radix);
142 #ifndef _KERNEL
143 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
144 int tab);
145 #endif
146
147 #ifdef _KERNEL
148 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
149 #endif
150
151 #define BLIST_MASK (BLIST_RADIX - 1)
152
153 /*
154 * For a subtree that can represent the state of up to 'radix' blocks, the
155 * number of leaf nodes of the subtree is L=radix/BLIST_RADIX. If 'm'
156 * is short for BLIST_RADIX, then for a tree of height h with L=m**h
157 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
158 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
159 * in the 'meta' functions that process subtrees. Since integer division
160 * discards remainders, we can express this computation as
161 * skip = (m * m**h) / (m - 1)
162 * skip = (m * (radix / m)) / (m - 1)
163 * skip = radix / (m - 1)
164 * so that simple integer division by a constant can safely be used for the
165 * calculation.
166 */
167 static inline daddr_t
radix_to_skip(daddr_t radix)168 radix_to_skip(daddr_t radix)
169 {
170
171 return (radix / BLIST_MASK);
172 }
173
174 /*
175 * Provide a mask with count bits set, starting as position n.
176 */
177 static inline u_daddr_t
bitrange(int n,int count)178 bitrange(int n, int count)
179 {
180
181 return (((u_daddr_t)-1 << n) &
182 ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
183 }
184
185 static inline int
bitpos(u_daddr_t mask)186 bitpos(u_daddr_t mask)
187 {
188
189 _Static_assert(sizeof(long long) >= sizeof(mask),
190 "mask too big for ffsll()");
191 return (ffsll(mask) - 1);
192 }
193
194 /*
195 * blist_create() - create a blist capable of handling up to the specified
196 * number of blocks
197 *
198 * blocks - must be greater than 0
199 * flags - malloc flags
200 *
201 * The smallest blist consists of a single leaf node capable of
202 * managing BLIST_RADIX blocks.
203 */
204 blist_t
blist_create(daddr_t blocks,int flags)205 blist_create(daddr_t blocks, int flags)
206 {
207 blist_t bl;
208 u_daddr_t nodes, radix;
209
210 KASSERT(blocks > 0, ("invalid block count"));
211
212 /*
213 * Calculate the radix and node count used for scanning.
214 */
215 nodes = 1;
216 for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0;
217 radix *= BLIST_RADIX)
218 nodes += 1 + (blocks - 1) / BLIST_RADIX / radix;
219
220 /*
221 * Include a sentinel node to ensure that cross-leaf scans stay within
222 * the bounds of the allocation.
223 */
224 if (blocks % BLIST_RADIX == 0)
225 nodes++;
226
227 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
228 M_ZERO);
229 if (bl == NULL)
230 return (NULL);
231
232 bl->bl_blocks = blocks;
233 bl->bl_radix = radix;
234
235 #if defined(BLIST_DEBUG)
236 printf(
237 "BLIST representing %lld blocks (%lld MB of swap)"
238 ", requiring %lldK of ram\n",
239 (long long)bl->bl_blocks,
240 (long long)bl->bl_blocks * 4 / 1024,
241 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
242 );
243 printf("BLIST raw radix tree contains %lld records\n",
244 (long long)nodes);
245 #endif
246
247 return (bl);
248 }
249
250 void
blist_destroy(blist_t bl)251 blist_destroy(blist_t bl)
252 {
253
254 free(bl, M_SWAP);
255 }
256
257 /*
258 * blist_alloc() - reserve space in the block bitmap. Return the base
259 * of a contiguous region or SWAPBLK_NONE if space could
260 * not be allocated.
261 */
262 daddr_t
blist_alloc(blist_t bl,int * count,int maxcount)263 blist_alloc(blist_t bl, int *count, int maxcount)
264 {
265 daddr_t blk, cursor;
266
267 KASSERT(*count <= maxcount,
268 ("invalid parameters %d > %d", *count, maxcount));
269 KASSERT(*count <= BLIST_MAX_ALLOC,
270 ("minimum allocation too large: %d", *count));
271
272 /*
273 * This loop iterates at most twice. An allocation failure in the
274 * first iteration leads to a second iteration only if the cursor was
275 * non-zero. When the cursor is zero, an allocation failure will
276 * stop further iterations.
277 */
278 for (cursor = bl->bl_cursor;; cursor = 0) {
279 blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
280 bl->bl_radix);
281 if (blk != SWAPBLK_NONE) {
282 bl->bl_avail -= *count;
283 bl->bl_cursor = blk + *count;
284 if (bl->bl_cursor == bl->bl_blocks)
285 bl->bl_cursor = 0;
286 return (blk);
287 }
288 if (cursor == 0)
289 return (SWAPBLK_NONE);
290 }
291 }
292
293 /*
294 * blist_avail() - return the number of free blocks.
295 */
296 daddr_t
blist_avail(blist_t bl)297 blist_avail(blist_t bl)
298 {
299
300 return (bl->bl_avail);
301 }
302
303 /*
304 * blist_free() - free up space in the block bitmap. Return the base
305 * of a contiguous region.
306 */
307 void
blist_free(blist_t bl,daddr_t blkno,daddr_t count)308 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
309 {
310
311 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
312 ("freeing invalid range: blkno %jx, count %d, blocks %jd",
313 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
314 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
315 bl->bl_avail += count;
316 }
317
318 /*
319 * blist_fill() - mark a region in the block bitmap as off-limits
320 * to the allocator (i.e. allocate it), ignoring any
321 * existing allocations. Return the number of blocks
322 * actually filled that were free before the call.
323 */
324 daddr_t
blist_fill(blist_t bl,daddr_t blkno,daddr_t count)325 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
326 {
327 daddr_t filled;
328
329 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
330 ("filling invalid range: blkno %jx, count %d, blocks %jd",
331 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
332 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
333 bl->bl_avail -= filled;
334 return (filled);
335 }
336
337 /*
338 * blist_resize() - resize an existing radix tree to handle the
339 * specified number of blocks. This will reallocate
340 * the tree and transfer the previous bitmap to the new
341 * one. When extending the tree you can specify whether
342 * the new blocks are to left allocated or freed.
343 */
344 void
blist_resize(blist_t * pbl,daddr_t count,int freenew,int flags)345 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
346 {
347 blist_t newbl = blist_create(count, flags);
348 blist_t save = *pbl;
349
350 *pbl = newbl;
351 if (count > save->bl_blocks)
352 count = save->bl_blocks;
353 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
354
355 /*
356 * If resizing upwards, should we free the new space or not?
357 */
358 if (freenew && count < newbl->bl_blocks) {
359 blist_free(newbl, count, newbl->bl_blocks - count);
360 }
361 blist_destroy(save);
362 }
363
364 #ifdef BLIST_DEBUG
365
366 /*
367 * blist_print() - dump radix tree
368 */
369 void
blist_print(blist_t bl)370 blist_print(blist_t bl)
371 {
372 printf("BLIST avail = %jd, cursor = %08jx {\n",
373 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
374
375 if (bl->bl_root->bm_bitmap != 0)
376 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
377 printf("}\n");
378 }
379
380 #endif
381
382 static const u_daddr_t fib[] = {
383 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
384 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
385 514229, 832040, 1346269, 2178309, 3524578,
386 };
387
388 /*
389 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
390 */
391 struct gap_stats {
392 daddr_t start; /* current gap start, or SWAPBLK_NONE */
393 daddr_t num; /* number of gaps observed */
394 daddr_t max; /* largest gap size */
395 daddr_t avg; /* average gap size */
396 daddr_t err; /* sum - num * avg */
397 daddr_t histo[nitems(fib)]; /* # gaps in each size range */
398 int max_bucket; /* last histo elt with nonzero val */
399 };
400
401 /*
402 * gap_stats_counting() - is the state 'counting 1 bits'?
403 * or 'skipping 0 bits'?
404 */
405 static inline bool
gap_stats_counting(const struct gap_stats * stats)406 gap_stats_counting(const struct gap_stats *stats)
407 {
408
409 return (stats->start != SWAPBLK_NONE);
410 }
411
412 /*
413 * init_gap_stats() - initialize stats on gap sizes
414 */
415 static inline void
init_gap_stats(struct gap_stats * stats)416 init_gap_stats(struct gap_stats *stats)
417 {
418
419 bzero(stats, sizeof(*stats));
420 stats->start = SWAPBLK_NONE;
421 }
422
423 /*
424 * update_gap_stats() - update stats on gap sizes
425 */
426 static void
update_gap_stats(struct gap_stats * stats,daddr_t posn)427 update_gap_stats(struct gap_stats *stats, daddr_t posn)
428 {
429 daddr_t size;
430 int hi, lo, mid;
431
432 if (!gap_stats_counting(stats)) {
433 stats->start = posn;
434 return;
435 }
436 size = posn - stats->start;
437 stats->start = SWAPBLK_NONE;
438 if (size > stats->max)
439 stats->max = size;
440
441 /*
442 * Find the fibonacci range that contains size,
443 * expecting to find it in an early range.
444 */
445 lo = 0;
446 hi = 1;
447 while (hi < nitems(fib) && fib[hi] <= size) {
448 lo = hi;
449 hi *= 2;
450 }
451 if (hi >= nitems(fib))
452 hi = nitems(fib);
453 while (lo + 1 != hi) {
454 mid = (lo + hi) >> 1;
455 if (fib[mid] <= size)
456 lo = mid;
457 else
458 hi = mid;
459 }
460 stats->histo[lo]++;
461 if (lo > stats->max_bucket)
462 stats->max_bucket = lo;
463 stats->err += size - stats->avg;
464 stats->num++;
465 stats->avg += stats->err / stats->num;
466 stats->err %= stats->num;
467 }
468
469 /*
470 * dump_gap_stats() - print stats on gap sizes
471 */
472 static inline void
dump_gap_stats(const struct gap_stats * stats,struct sbuf * s)473 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
474 {
475 int i;
476
477 sbuf_printf(s, "number of maximal free ranges: %jd\n",
478 (intmax_t)stats->num);
479 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
480 sbuf_printf(s, "average maximal free range size: %jd\n",
481 (intmax_t)stats->avg);
482 sbuf_cat(s, "number of maximal free ranges of different sizes:\n");
483 sbuf_cat(s, " count | size range\n");
484 sbuf_cat(s, " ----- | ----------\n");
485 for (i = 0; i < stats->max_bucket; i++) {
486 if (stats->histo[i] != 0) {
487 sbuf_printf(s, "%20jd | ",
488 (intmax_t)stats->histo[i]);
489 if (fib[i] != fib[i + 1] - 1)
490 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
491 (intmax_t)fib[i + 1] - 1);
492 else
493 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
494 }
495 }
496 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
497 if (stats->histo[i] > 1)
498 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
499 (intmax_t)stats->max);
500 else
501 sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
502 }
503
504 /*
505 * blist_stats() - dump radix tree stats
506 */
507 void
blist_stats(blist_t bl,struct sbuf * s)508 blist_stats(blist_t bl, struct sbuf *s)
509 {
510 struct gap_stats gstats;
511 struct gap_stats *stats = &gstats;
512 daddr_t i, nodes, radix;
513 u_daddr_t diff, mask;
514 int digit;
515
516 init_gap_stats(stats);
517 nodes = 0;
518 radix = bl->bl_radix;
519 for (i = 0; i < bl->bl_blocks; ) {
520 /*
521 * Check for skippable subtrees starting at i.
522 */
523 while (radix != 1) {
524 if (bl->bl_root[nodes].bm_bitmap == 0) {
525 if (gap_stats_counting(stats))
526 update_gap_stats(stats, i);
527 break;
528 }
529
530 /*
531 * Skip subtree root.
532 */
533 nodes++;
534 radix /= BLIST_RADIX;
535 }
536 if (radix == 1) {
537 /*
538 * Scan leaf.
539 */
540 mask = bl->bl_root[nodes].bm_bitmap;
541 diff = mask ^ (mask << 1);
542 if (gap_stats_counting(stats))
543 diff ^= 1;
544 while (diff != 0) {
545 digit = bitpos(diff);
546 update_gap_stats(stats, i + digit);
547 diff ^= bitrange(digit, 1);
548 }
549 }
550 nodes += radix_to_skip(radix * BLIST_RADIX);
551 i += radix * BLIST_RADIX;
552
553 /*
554 * Find max size subtree starting at i.
555 */
556 for (radix = 1;
557 ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
558 radix *= BLIST_RADIX)
559 ;
560 }
561 update_gap_stats(stats, i);
562 dump_gap_stats(stats, s);
563 }
564
565 /************************************************************************
566 * ALLOCATION SUPPORT FUNCTIONS *
567 ************************************************************************
568 *
569 * These support functions do all the actual work. They may seem
570 * rather longish, but that's because I've commented them up. The
571 * actual code is straight forward.
572 *
573 */
574
575 /*
576 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
577 *
578 * 'scan' is a leaf node, and its first block is at address 'start'. The
579 * next leaf node could be adjacent, or several nodes away if the least
580 * common ancestor of 'scan' and its neighbor is several levels up. Use
581 * addresses to determine how many meta-nodes lie between the leaves. If
582 * sequence of leaves starting with the next one has enough initial bits
583 * set, clear them and clear the bits in the meta nodes on the path up to
584 * the least common ancestor to mark any subtrees made completely empty.
585 */
586 static int
blst_next_leaf_alloc(blmeta_t * scan,daddr_t start,int count,int maxcount)587 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
588 {
589 u_daddr_t radix;
590 daddr_t blk;
591 int avail, digit;
592
593 start += BLIST_RADIX;
594 for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
595 /* Skip meta-nodes, as long as they promise more free blocks. */
596 radix = BLIST_RADIX;
597 while (((++scan)->bm_bitmap & 1) == 1 &&
598 ((blk / radix) & BLIST_MASK) == 0)
599 radix *= BLIST_RADIX;
600 if (~scan->bm_bitmap != 0) {
601 /*
602 * Either there is no next leaf with any free blocks,
603 * or we've reached the next leaf and found that some
604 * of its blocks are not free. In the first case,
605 * bitpos() returns zero here.
606 */
607 avail = blk - start + bitpos(~scan->bm_bitmap);
608 if (avail < count || avail == 0) {
609 /*
610 * There isn't a next leaf with enough free
611 * blocks at its beginning to bother
612 * allocating.
613 */
614 return (avail);
615 }
616 maxcount = imin(avail, maxcount);
617 if (maxcount % BLIST_RADIX == 0) {
618 /*
619 * There was no next leaf. Back scan up to
620 * last leaf.
621 */
622 do {
623 radix /= BLIST_RADIX;
624 --scan;
625 } while (radix != 1);
626 blk -= BLIST_RADIX;
627 }
628 }
629 }
630
631 /*
632 * 'scan' is the last leaf that provides blocks. Clear from 1 to
633 * BLIST_RADIX bits to represent the allocation of those last blocks.
634 */
635 if (maxcount % BLIST_RADIX != 0)
636 scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
637 else
638 scan->bm_bitmap = 0;
639
640 for (;;) {
641 /* Back up over meta-nodes, clearing bits if necessary. */
642 blk -= BLIST_RADIX;
643 for (radix = BLIST_RADIX;
644 (digit = ((blk / radix) & BLIST_MASK)) == 0;
645 radix *= BLIST_RADIX) {
646 if ((scan--)->bm_bitmap == 0)
647 scan->bm_bitmap ^= 1;
648 }
649 if ((scan--)->bm_bitmap == 0)
650 scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
651 (u_daddr_t)1 << digit;
652
653 if (blk == start)
654 break;
655 /* Clear all the bits of this leaf. */
656 scan->bm_bitmap = 0;
657 }
658 return (maxcount);
659 }
660
661 /*
662 * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap).
663 *
664 * This function is the core of the allocator. Its execution time is
665 * proportional to log(count), plus height of the tree if the allocation
666 * crosses a leaf boundary.
667 */
668 static daddr_t
blst_leaf_alloc(blmeta_t * scan,daddr_t blk,int * count,int maxcount)669 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
670 {
671 u_daddr_t mask;
672 int bighint, count1, hi, lo, num_shifts;
673
674 count1 = *count - 1;
675 num_shifts = fls(count1);
676 mask = ~scan->bm_bitmap;
677 while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
678 /*
679 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
680 * num_shifts)] are 1 in scan->bm_bitmap. Reduce num_shifts to
681 * 0, while preserving this invariant. The updates to mask
682 * leave fewer bits 0, but each bit that remains 0 represents a
683 * longer string of consecutive 1-bits in scan->bm_bitmap. If
684 * more updates to mask cannot set more bits, because mask is
685 * partitioned with all 1 bits following all 0 bits, the loop
686 * terminates immediately.
687 */
688 num_shifts--;
689 mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
690 }
691 bighint = count1 >> num_shifts;
692 if (~mask == 0) {
693 /*
694 * Update bighint. There is no allocation bigger than
695 * count1 >> num_shifts starting in this leaf.
696 */
697 scan->bm_bighint = bighint;
698 return (SWAPBLK_NONE);
699 }
700
701 /* Discard any candidates that appear before blk. */
702 if ((blk & BLIST_MASK) != 0) {
703 if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
704 /* Grow bighint in case all discarded bits are set. */
705 bighint += blk & BLIST_MASK;
706 mask |= bitrange(0, blk & BLIST_MASK);
707 if (~mask == 0) {
708 scan->bm_bighint = bighint;
709 return (SWAPBLK_NONE);
710 }
711 }
712 blk -= blk & BLIST_MASK;
713 }
714
715 /*
716 * The least significant set bit in mask marks the start of the first
717 * available range of sufficient size. Find its position.
718 */
719 lo = bitpos(~mask);
720
721 /*
722 * Find how much space is available starting at that position.
723 */
724 if ((mask & (mask + 1)) != 0) {
725 /* Count the 1 bits starting at position lo. */
726 hi = bitpos(mask & (mask + 1)) + count1;
727 if (maxcount < hi - lo)
728 hi = lo + maxcount;
729 *count = hi - lo;
730 mask = ~bitrange(lo, *count);
731 } else if (maxcount <= BLIST_RADIX - lo) {
732 /* All the blocks we can use are available here. */
733 hi = lo + maxcount;
734 *count = maxcount;
735 mask = ~bitrange(lo, *count);
736 if (hi == BLIST_RADIX)
737 scan->bm_bighint = bighint;
738 } else {
739 /* Check next leaf for some of the blocks we want or need. */
740 count1 = *count - (BLIST_RADIX - lo);
741 maxcount -= BLIST_RADIX - lo;
742 hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
743 if (hi < count1)
744 /*
745 * The next leaf cannot supply enough blocks to reach
746 * the minimum required allocation. The hint cannot be
747 * updated, because the same allocation request could
748 * be satisfied later, by this leaf, if the state of
749 * the next leaf changes, and without any changes to
750 * this leaf.
751 */
752 return (SWAPBLK_NONE);
753 *count = BLIST_RADIX - lo + hi;
754 scan->bm_bighint = bighint;
755 }
756
757 /* Clear the allocated bits from this leaf. */
758 scan->bm_bitmap &= mask;
759 return (blk + lo);
760 }
761
762 /*
763 * blist_meta_alloc() - allocate at a meta in the radix tree.
764 *
765 * Attempt to allocate at a meta node. If we can't, we update
766 * bighint and return a failure. Updating bighint optimize future
767 * calls that hit this node. We have to check for our collapse cases
768 * and we have a few optimizations strewn in as well.
769 */
770 static daddr_t
blst_meta_alloc(blmeta_t * scan,daddr_t cursor,int * count,int maxcount,u_daddr_t radix)771 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
772 int maxcount, u_daddr_t radix)
773 {
774 daddr_t blk, i, r, skip;
775 u_daddr_t mask;
776 bool scan_from_start;
777 int digit;
778
779 if (radix == 1)
780 return (blst_leaf_alloc(scan, cursor, count, maxcount));
781 blk = cursor & -(radix * BLIST_RADIX);
782 scan_from_start = (cursor == blk);
783 skip = radix_to_skip(radix);
784 mask = scan->bm_bitmap;
785
786 /* Discard any candidates that appear before cursor. */
787 digit = (cursor / radix) & BLIST_MASK;
788 mask &= (u_daddr_t)-1 << digit;
789 if (mask == 0)
790 return (SWAPBLK_NONE);
791
792 /*
793 * If the first try is for a block that includes the cursor, pre-undo
794 * the digit * radix offset in the first call; otherwise, ignore the
795 * cursor entirely.
796 */
797 if (((mask >> digit) & 1) == 1)
798 cursor -= digit * radix;
799 else
800 cursor = blk;
801
802 /*
803 * Examine the nonempty subtree associated with each bit set in mask.
804 */
805 do {
806 digit = bitpos(mask);
807 i = 1 + digit * skip;
808 if (*count <= scan[i].bm_bighint) {
809 /*
810 * The allocation might fit beginning in the i'th subtree.
811 */
812 r = blst_meta_alloc(&scan[i], cursor + digit * radix,
813 count, maxcount, radix / BLIST_RADIX);
814 if (r != SWAPBLK_NONE) {
815 if (scan[i].bm_bitmap == 0)
816 scan->bm_bitmap ^= bitrange(digit, 1);
817 return (r);
818 }
819 }
820 cursor = blk;
821 } while ((mask ^= bitrange(digit, 1)) != 0);
822
823 /*
824 * We couldn't allocate count in this subtree. If the whole tree was
825 * scanned, and the last tree node is allocated, update bighint.
826 */
827 if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
828 scan[i].bm_bighint == BLIST_MAX_ALLOC))
829 scan->bm_bighint = *count - 1;
830
831 return (SWAPBLK_NONE);
832 }
833
834 /*
835 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
836 *
837 */
838 static void
blst_leaf_free(blmeta_t * scan,daddr_t blk,int count)839 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
840 {
841 u_daddr_t mask;
842
843 /*
844 * free some data in this bitmap
845 * mask=0000111111111110000
846 * \_________/\__/
847 * count n
848 */
849 mask = bitrange(blk & BLIST_MASK, count);
850 KASSERT((scan->bm_bitmap & mask) == 0,
851 ("freeing free block: %jx, size %d, mask %jx",
852 (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
853 scan->bm_bitmap |= mask;
854 }
855
856 /*
857 * BLST_META_FREE() - free allocated blocks from radix tree meta info
858 *
859 * This support routine frees a range of blocks from the bitmap.
860 * The range must be entirely enclosed by this radix node. If a
861 * meta node, we break the range down recursively to free blocks
862 * in subnodes (which means that this code can free an arbitrary
863 * range whereas the allocation code cannot allocate an arbitrary
864 * range).
865 */
866 static void
blst_meta_free(blmeta_t * scan,daddr_t freeBlk,daddr_t count,u_daddr_t radix)867 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
868 {
869 daddr_t blk, endBlk, i, skip;
870 int digit, endDigit;
871
872 /*
873 * We could probably do a better job here. We are required to make
874 * bighint at least as large as the biggest allocable block of data.
875 * If we just shoehorn it, a little extra overhead will be incurred
876 * on the next allocation (but only that one typically).
877 */
878 scan->bm_bighint = BLIST_MAX_ALLOC;
879
880 if (radix == 1)
881 return (blst_leaf_free(scan, freeBlk, count));
882
883 endBlk = freeBlk + count;
884 blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
885 /*
886 * blk is first block past the end of the range of this meta node,
887 * or 0 in case of overflow.
888 */
889 if (blk != 0)
890 endBlk = ummin(endBlk, blk);
891 skip = radix_to_skip(radix);
892 blk = freeBlk & -radix;
893 digit = (blk / radix) & BLIST_MASK;
894 endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
895 scan->bm_bitmap |= bitrange(digit, endDigit - digit);
896 for (i = 1 + digit * skip; blk < endBlk; i += skip) {
897 blk += radix;
898 count = ummin(blk, endBlk) - freeBlk;
899 blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
900 freeBlk = blk;
901 }
902 }
903
904 /*
905 * BLST_COPY() - copy one radix tree to another
906 *
907 * Locates free space in the source tree and frees it in the destination
908 * tree. The space may not already be free in the destination.
909 */
910 static void
blst_copy(blmeta_t * scan,daddr_t blk,daddr_t radix,blist_t dest,daddr_t count)911 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
912 daddr_t count)
913 {
914 daddr_t endBlk, i, skip;
915
916 /*
917 * Leaf node
918 */
919
920 if (radix == 1) {
921 u_daddr_t v = scan->bm_bitmap;
922
923 if (v == (u_daddr_t)-1) {
924 blist_free(dest, blk, count);
925 } else if (v != 0) {
926 int i;
927
928 for (i = 0; i < count; ++i) {
929 if (v & ((u_daddr_t)1 << i))
930 blist_free(dest, blk + i, 1);
931 }
932 }
933 return;
934 }
935
936 /*
937 * Meta node
938 */
939
940 if (scan->bm_bitmap == 0) {
941 /*
942 * Source all allocated, leave dest allocated
943 */
944 return;
945 }
946
947 endBlk = blk + count;
948 skip = radix_to_skip(radix);
949 for (i = 1; blk < endBlk; i += skip) {
950 blk += radix;
951 count = radix;
952 if (blk >= endBlk)
953 count -= blk - endBlk;
954 blst_copy(&scan[i], blk - radix,
955 radix / BLIST_RADIX, dest, count);
956 }
957 }
958
959 /*
960 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
961 *
962 * This routine allocates all blocks in the specified range
963 * regardless of any existing allocations in that range. Returns
964 * the number of blocks allocated by the call.
965 */
966 static daddr_t
blst_leaf_fill(blmeta_t * scan,daddr_t blk,int count)967 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
968 {
969 daddr_t nblks;
970 u_daddr_t mask;
971
972 mask = bitrange(blk & BLIST_MASK, count);
973
974 /* Count the number of blocks that we are allocating. */
975 nblks = bitcount64(scan->bm_bitmap & mask);
976
977 scan->bm_bitmap &= ~mask;
978 return (nblks);
979 }
980
981 /*
982 * BLIST_META_FILL() - allocate specific blocks at a meta node
983 *
984 * This routine allocates the specified range of blocks,
985 * regardless of any existing allocations in the range. The
986 * range must be within the extent of this node. Returns the
987 * number of blocks allocated by the call.
988 */
989 static daddr_t
blst_meta_fill(blmeta_t * scan,daddr_t allocBlk,daddr_t count,u_daddr_t radix)990 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
991 {
992 daddr_t blk, endBlk, i, nblks, skip;
993 int digit;
994
995 if (radix == 1)
996 return (blst_leaf_fill(scan, allocBlk, count));
997
998 endBlk = allocBlk + count;
999 blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
1000 /*
1001 * blk is first block past the end of the range of this meta node,
1002 * or 0 in case of overflow.
1003 */
1004 if (blk != 0)
1005 endBlk = ummin(endBlk, blk);
1006 skip = radix_to_skip(radix);
1007 blk = allocBlk & -radix;
1008 nblks = 0;
1009 while (blk < endBlk) {
1010 digit = (blk / radix) & BLIST_MASK;
1011 i = 1 + digit * skip;
1012 blk += radix;
1013 count = ummin(blk, endBlk) - allocBlk;
1014 nblks += blst_meta_fill(&scan[i], allocBlk, count,
1015 radix / BLIST_RADIX);
1016 if (scan[i].bm_bitmap == 0)
1017 scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
1018 allocBlk = blk;
1019 }
1020 return (nblks);
1021 }
1022
1023 #ifdef BLIST_DEBUG
1024
1025 static void
blst_radix_print(blmeta_t * scan,daddr_t blk,daddr_t radix,int tab)1026 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1027 {
1028 daddr_t skip;
1029 u_daddr_t mask;
1030 int digit;
1031
1032 if (radix == 1) {
1033 printf(
1034 "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1035 tab, tab, "",
1036 (long long)blk, (long long)BLIST_RADIX,
1037 (int)(1 + (BLIST_RADIX - 1) / 4),
1038 (long long)scan->bm_bitmap,
1039 (long long)scan->bm_bighint
1040 );
1041 return;
1042 }
1043
1044 printf(
1045 "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1046 tab, tab, "",
1047 (long long)blk, (long long)radix * BLIST_RADIX,
1048 (long long)radix * BLIST_RADIX,
1049 (int)(1 + (BLIST_RADIX - 1) / 4),
1050 (long long)scan->bm_bitmap,
1051 (long long)scan->bm_bighint
1052 );
1053
1054 skip = radix_to_skip(radix);
1055 tab += 4;
1056
1057 mask = scan->bm_bitmap;
1058 /* Examine the nonempty subtree associated with each bit set in mask */
1059 do {
1060 digit = bitpos(mask);
1061 blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1062 radix / BLIST_RADIX, tab);
1063 } while ((mask ^= bitrange(digit, 1)) != 0);
1064 tab -= 4;
1065
1066 printf(
1067 "%*.*s}\n",
1068 tab, tab, ""
1069 );
1070 }
1071
1072 #endif
1073
1074 #ifdef BLIST_DEBUG
1075
1076 int
main(int ac,char ** av)1077 main(int ac, char **av)
1078 {
1079 daddr_t size = BLIST_RADIX * BLIST_RADIX;
1080 int i;
1081 blist_t bl;
1082 struct sbuf *s;
1083
1084 for (i = 1; i < ac; ++i) {
1085 const char *ptr = av[i];
1086 if (*ptr != '-') {
1087 size = strtoll(ptr, NULL, 0);
1088 continue;
1089 }
1090 ptr += 2;
1091 fprintf(stderr, "Bad option: %s\n", ptr - 2);
1092 exit(1);
1093 }
1094 bl = blist_create(size, M_WAITOK);
1095 if (bl == NULL) {
1096 fprintf(stderr, "blist_create failed\n");
1097 exit(1);
1098 }
1099 blist_free(bl, 0, size);
1100
1101 for (;;) {
1102 char buf[1024];
1103 long long da = 0;
1104 int count = 0, maxcount = 0;
1105
1106 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1107 (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
1108 fflush(stdout);
1109 if (fgets(buf, sizeof(buf), stdin) == NULL)
1110 break;
1111 switch(buf[0]) {
1112 case 'r':
1113 if (sscanf(buf + 1, "%d", &count) == 1) {
1114 blist_resize(&bl, count, 1, M_WAITOK);
1115 } else {
1116 printf("?\n");
1117 }
1118 case 'p':
1119 blist_print(bl);
1120 break;
1121 case 's':
1122 s = sbuf_new_auto();
1123 blist_stats(bl, s);
1124 sbuf_finish(s);
1125 printf("%s", sbuf_data(s));
1126 sbuf_delete(s);
1127 break;
1128 case 'a':
1129 if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
1130 daddr_t blk = blist_alloc(bl, &count, maxcount);
1131 printf(" R=%08llx, c=%08d\n",
1132 (long long)blk, count);
1133 } else {
1134 printf("?\n");
1135 }
1136 break;
1137 case 'f':
1138 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1139 blist_free(bl, da, count);
1140 } else {
1141 printf("?\n");
1142 }
1143 break;
1144 case 'l':
1145 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
1146 printf(" n=%jd\n",
1147 (intmax_t)blist_fill(bl, da, count));
1148 } else {
1149 printf("?\n");
1150 }
1151 break;
1152 case '?':
1153 case 'h':
1154 puts(
1155 "p -print\n"
1156 "s -stats\n"
1157 "a %d %d -allocate\n"
1158 "f %x %d -free\n"
1159 "l %x %d -fill\n"
1160 "r %d -resize\n"
1161 "h/? -help\n"
1162 "q -quit"
1163 );
1164 break;
1165 case 'q':
1166 break;
1167 default:
1168 printf("?\n");
1169 break;
1170 }
1171 if (buf[0] == 'q')
1172 break;
1173 }
1174 return (0);
1175 }
1176
1177 #endif
1178