1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 /* 30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 31 * 32 * This module implements a general bitmap allocator/deallocator. The 33 * allocator eats around 2 bits per 'block'. The module does not 34 * try to interpret the meaning of a 'block' other than to return 35 * SWAPBLK_NONE on an allocation failure. 36 * 37 * A radix tree controls access to pieces of the bitmap, and includes 38 * auxiliary information at each interior node about the availabilty of 39 * contiguous free blocks in the subtree rooted at that node. A radix 40 * constant defines the size of the bitmaps contained in a leaf node 41 * and the number of descendents of each of the meta (interior) nodes. 42 * Each subtree is associated with a range of blocks. The root of any 43 * subtree stores a hint field that defines an upper bound on the size 44 * of the largest allocation that can begin in the associated block 45 * range. A hint is an upper bound on a potential allocation, but not 46 * necessarily a tight upper bound. 47 * 48 * The bitmap field in each node directs the search for available blocks. 49 * For a leaf node, a bit is set if the corresponding block is free. For a 50 * meta node, a bit is set if the corresponding subtree contains a free 51 * block somewhere within it. The search at a meta node considers only 52 * children of that node that represent a range that includes a free block. 53 * 54 * The hinting greatly increases code efficiency for allocations while 55 * the general radix structure optimizes both allocations and frees. The 56 * radix tree should be able to operate well no matter how much 57 * fragmentation there is and no matter how large a bitmap is used. 58 * 59 * The blist code wires all necessary memory at creation time. Neither 60 * allocations nor frees require interaction with the memory subsystem. 61 * The non-blocking nature of allocations and frees is required by swap 62 * code (vm/swap_pager.c). 63 * 64 * LAYOUT: The radix tree is laid out recursively using a linear array. 65 * Each meta node is immediately followed (laid out sequentially in 66 * memory) by BLIST_RADIX lower-level nodes. This is a recursive 67 * structure but one that can be easily scanned through a very simple 68 * 'skip' calculation. The memory allocation is only large enough to 69 * cover the number of blocks requested at creation time. Nodes that 70 * represent blocks beyond that limit, nodes that would never be read 71 * or written, are not allocated, so that the last of the 72 * BLIST_RADIX lower-level nodes of a some nodes may not be allocated. 73 * 74 * NOTE: the allocator cannot currently allocate more than 75 * BLIST_RADIX blocks per call. It will panic with 'allocation too 76 * large' if you try. This is an area that could use improvement. The 77 * radix is large enough that this restriction does not effect the swap 78 * system, though. Currently only the allocation code is affected by 79 * this algorithmic unfeature. The freeing code can handle arbitrary 80 * ranges. 81 * 82 * This code can be compiled stand-alone for debugging. 83 */ 84 85 #include <sys/cdefs.h> 86 __FBSDID("$FreeBSD$"); 87 88 #ifdef _KERNEL 89 90 #include <sys/param.h> 91 #include <sys/systm.h> 92 #include <sys/lock.h> 93 #include <sys/kernel.h> 94 #include <sys/blist.h> 95 #include <sys/malloc.h> 96 #include <sys/sbuf.h> 97 #include <sys/proc.h> 98 #include <sys/mutex.h> 99 100 #else 101 102 #ifndef BLIST_NO_DEBUG 103 #define BLIST_DEBUG 104 #endif 105 106 #include <sys/errno.h> 107 #include <sys/types.h> 108 #include <sys/malloc.h> 109 #include <sys/sbuf.h> 110 #include <assert.h> 111 #include <stdio.h> 112 #include <string.h> 113 #include <stddef.h> 114 #include <stdlib.h> 115 #include <stdarg.h> 116 #include <stdbool.h> 117 118 #define bitcount64(x) __bitcount64((uint64_t)(x)) 119 #define malloc(a,b,c) calloc(a, 1) 120 #define free(a,b) free(a) 121 #define ummin(a,b) ((a) < (b) ? (a) : (b)) 122 #define imin(a,b) ((a) < (b) ? (a) : (b)) 123 #define KASSERT(a,b) assert(a) 124 125 #include <sys/blist.h> 126 127 #endif 128 129 /* 130 * static support functions 131 */ 132 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, 133 int *count, int maxcount); 134 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, 135 int maxcount, u_daddr_t radix); 136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); 137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 138 u_daddr_t radix); 139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 140 blist_t dest, daddr_t count); 141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); 142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, 143 u_daddr_t radix); 144 #ifndef _KERNEL 145 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, 146 int tab); 147 #endif 148 149 #ifdef _KERNEL 150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); 151 #endif 152 153 #define BLIST_MASK (BLIST_RADIX - 1) 154 155 /* 156 * For a subtree that can represent the state of up to 'radix' blocks, the 157 * number of leaf nodes of the subtree is L=radix/BLIST_RADIX. If 'm' 158 * is short for BLIST_RADIX, then for a tree of height h with L=m**h 159 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, 160 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' 161 * in the 'meta' functions that process subtrees. Since integer division 162 * discards remainders, we can express this computation as 163 * skip = (m * m**h) / (m - 1) 164 * skip = (m * (radix / m)) / (m - 1) 165 * skip = radix / (m - 1) 166 * so that simple integer division by a constant can safely be used for the 167 * calculation. 168 */ 169 static inline daddr_t 170 radix_to_skip(daddr_t radix) 171 { 172 173 return (radix / BLIST_MASK); 174 } 175 176 /* 177 * Provide a mask with count bits set, starting as position n. 178 */ 179 static inline u_daddr_t 180 bitrange(int n, int count) 181 { 182 183 return (((u_daddr_t)-1 << n) & 184 ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count)))); 185 } 186 187 /* 188 * Find the first bit set in a u_daddr_t. 189 */ 190 static inline int 191 generic_bitpos(u_daddr_t mask) 192 { 193 int hi, lo, mid; 194 195 lo = 0; 196 hi = BLIST_RADIX; 197 while (lo + 1 < hi) { 198 mid = (lo + hi) >> 1; 199 if (mask & bitrange(0, mid)) 200 hi = mid; 201 else 202 lo = mid; 203 } 204 return (lo); 205 } 206 207 static inline int 208 bitpos(u_daddr_t mask) 209 { 210 211 switch (sizeof(mask)) { 212 #ifdef HAVE_INLINE_FFSLL 213 case sizeof(long long): 214 return (ffsll(mask) - 1); 215 #endif 216 #ifdef HAVE_INLINE_FFS 217 case sizeof(int): 218 return (ffs(mask) - 1); 219 #endif 220 default: 221 return (generic_bitpos(mask)); 222 } 223 } 224 225 /* 226 * blist_create() - create a blist capable of handling up to the specified 227 * number of blocks 228 * 229 * blocks - must be greater than 0 230 * flags - malloc flags 231 * 232 * The smallest blist consists of a single leaf node capable of 233 * managing BLIST_RADIX blocks. 234 */ 235 blist_t 236 blist_create(daddr_t blocks, int flags) 237 { 238 blist_t bl; 239 u_daddr_t nodes, radix; 240 241 KASSERT(blocks > 0, ("invalid block count")); 242 243 /* 244 * Calculate the radix and node count used for scanning. 245 */ 246 nodes = 1; 247 for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0; 248 radix *= BLIST_RADIX) 249 nodes += 1 + (blocks - 1) / BLIST_RADIX / radix; 250 251 /* 252 * Include a sentinel node to ensure that cross-leaf scans stay within 253 * the bounds of the allocation. 254 */ 255 if (blocks % BLIST_RADIX == 0) 256 nodes++; 257 258 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags | 259 M_ZERO); 260 if (bl == NULL) 261 return (NULL); 262 263 bl->bl_blocks = blocks; 264 bl->bl_radix = radix; 265 266 #if defined(BLIST_DEBUG) 267 printf( 268 "BLIST representing %lld blocks (%lld MB of swap)" 269 ", requiring %lldK of ram\n", 270 (long long)bl->bl_blocks, 271 (long long)bl->bl_blocks * 4 / 1024, 272 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 273 ); 274 printf("BLIST raw radix tree contains %lld records\n", 275 (long long)nodes); 276 #endif 277 278 return (bl); 279 } 280 281 void 282 blist_destroy(blist_t bl) 283 { 284 285 free(bl, M_SWAP); 286 } 287 288 /* 289 * blist_alloc() - reserve space in the block bitmap. Return the base 290 * of a contiguous region or SWAPBLK_NONE if space could 291 * not be allocated. 292 */ 293 daddr_t 294 blist_alloc(blist_t bl, int *count, int maxcount) 295 { 296 daddr_t blk, cursor; 297 298 KASSERT(*count <= maxcount, 299 ("invalid parameters %d > %d", *count, maxcount)); 300 KASSERT(*count <= BLIST_MAX_ALLOC, 301 ("minimum allocation too large: %d", *count)); 302 303 /* 304 * This loop iterates at most twice. An allocation failure in the 305 * first iteration leads to a second iteration only if the cursor was 306 * non-zero. When the cursor is zero, an allocation failure will 307 * stop further iterations. 308 */ 309 for (cursor = bl->bl_cursor;; cursor = 0) { 310 blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount, 311 bl->bl_radix); 312 if (blk != SWAPBLK_NONE) { 313 bl->bl_avail -= *count; 314 bl->bl_cursor = blk + *count; 315 if (bl->bl_cursor == bl->bl_blocks) 316 bl->bl_cursor = 0; 317 return (blk); 318 } 319 if (cursor == 0) 320 return (SWAPBLK_NONE); 321 } 322 } 323 324 /* 325 * blist_avail() - return the number of free blocks. 326 */ 327 daddr_t 328 blist_avail(blist_t bl) 329 { 330 331 return (bl->bl_avail); 332 } 333 334 /* 335 * blist_free() - free up space in the block bitmap. Return the base 336 * of a contiguous region. 337 */ 338 void 339 blist_free(blist_t bl, daddr_t blkno, daddr_t count) 340 { 341 342 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 343 ("freeing invalid range: blkno %jx, count %d, blocks %jd", 344 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 345 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); 346 bl->bl_avail += count; 347 } 348 349 /* 350 * blist_fill() - mark a region in the block bitmap as off-limits 351 * to the allocator (i.e. allocate it), ignoring any 352 * existing allocations. Return the number of blocks 353 * actually filled that were free before the call. 354 */ 355 daddr_t 356 blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 357 { 358 daddr_t filled; 359 360 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 361 ("filling invalid range: blkno %jx, count %d, blocks %jd", 362 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 363 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); 364 bl->bl_avail -= filled; 365 return (filled); 366 } 367 368 /* 369 * blist_resize() - resize an existing radix tree to handle the 370 * specified number of blocks. This will reallocate 371 * the tree and transfer the previous bitmap to the new 372 * one. When extending the tree you can specify whether 373 * the new blocks are to left allocated or freed. 374 */ 375 void 376 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 377 { 378 blist_t newbl = blist_create(count, flags); 379 blist_t save = *pbl; 380 381 *pbl = newbl; 382 if (count > save->bl_blocks) 383 count = save->bl_blocks; 384 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 385 386 /* 387 * If resizing upwards, should we free the new space or not? 388 */ 389 if (freenew && count < newbl->bl_blocks) { 390 blist_free(newbl, count, newbl->bl_blocks - count); 391 } 392 blist_destroy(save); 393 } 394 395 #ifdef BLIST_DEBUG 396 397 /* 398 * blist_print() - dump radix tree 399 */ 400 void 401 blist_print(blist_t bl) 402 { 403 printf("BLIST avail = %jd, cursor = %08jx {\n", 404 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); 405 406 if (bl->bl_root->bm_bitmap != 0) 407 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 408 printf("}\n"); 409 } 410 411 #endif 412 413 static const u_daddr_t fib[] = { 414 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 415 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 416 514229, 832040, 1346269, 2178309, 3524578, 417 }; 418 419 /* 420 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 421 */ 422 struct gap_stats { 423 daddr_t start; /* current gap start, or SWAPBLK_NONE */ 424 daddr_t num; /* number of gaps observed */ 425 daddr_t max; /* largest gap size */ 426 daddr_t avg; /* average gap size */ 427 daddr_t err; /* sum - num * avg */ 428 daddr_t histo[nitems(fib)]; /* # gaps in each size range */ 429 int max_bucket; /* last histo elt with nonzero val */ 430 }; 431 432 /* 433 * gap_stats_counting() - is the state 'counting 1 bits'? 434 * or 'skipping 0 bits'? 435 */ 436 static inline bool 437 gap_stats_counting(const struct gap_stats *stats) 438 { 439 440 return (stats->start != SWAPBLK_NONE); 441 } 442 443 /* 444 * init_gap_stats() - initialize stats on gap sizes 445 */ 446 static inline void 447 init_gap_stats(struct gap_stats *stats) 448 { 449 450 bzero(stats, sizeof(*stats)); 451 stats->start = SWAPBLK_NONE; 452 } 453 454 /* 455 * update_gap_stats() - update stats on gap sizes 456 */ 457 static void 458 update_gap_stats(struct gap_stats *stats, daddr_t posn) 459 { 460 daddr_t size; 461 int hi, lo, mid; 462 463 if (!gap_stats_counting(stats)) { 464 stats->start = posn; 465 return; 466 } 467 size = posn - stats->start; 468 stats->start = SWAPBLK_NONE; 469 if (size > stats->max) 470 stats->max = size; 471 472 /* 473 * Find the fibonacci range that contains size, 474 * expecting to find it in an early range. 475 */ 476 lo = 0; 477 hi = 1; 478 while (hi < nitems(fib) && fib[hi] <= size) { 479 lo = hi; 480 hi *= 2; 481 } 482 if (hi >= nitems(fib)) 483 hi = nitems(fib); 484 while (lo + 1 != hi) { 485 mid = (lo + hi) >> 1; 486 if (fib[mid] <= size) 487 lo = mid; 488 else 489 hi = mid; 490 } 491 stats->histo[lo]++; 492 if (lo > stats->max_bucket) 493 stats->max_bucket = lo; 494 stats->err += size - stats->avg; 495 stats->num++; 496 stats->avg += stats->err / stats->num; 497 stats->err %= stats->num; 498 } 499 500 /* 501 * dump_gap_stats() - print stats on gap sizes 502 */ 503 static inline void 504 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) 505 { 506 int i; 507 508 sbuf_printf(s, "number of maximal free ranges: %jd\n", 509 (intmax_t)stats->num); 510 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); 511 sbuf_printf(s, "average maximal free range size: %jd\n", 512 (intmax_t)stats->avg); 513 sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); 514 sbuf_printf(s, " count | size range\n"); 515 sbuf_printf(s, " ----- | ----------\n"); 516 for (i = 0; i < stats->max_bucket; i++) { 517 if (stats->histo[i] != 0) { 518 sbuf_printf(s, "%20jd | ", 519 (intmax_t)stats->histo[i]); 520 if (fib[i] != fib[i + 1] - 1) 521 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 522 (intmax_t)fib[i + 1] - 1); 523 else 524 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); 525 } 526 } 527 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); 528 if (stats->histo[i] > 1) 529 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 530 (intmax_t)stats->max); 531 else 532 sbuf_printf(s, "%jd\n", (intmax_t)stats->max); 533 } 534 535 /* 536 * blist_stats() - dump radix tree stats 537 */ 538 void 539 blist_stats(blist_t bl, struct sbuf *s) 540 { 541 struct gap_stats gstats; 542 struct gap_stats *stats = &gstats; 543 daddr_t i, nodes, radix; 544 u_daddr_t diff, mask; 545 int digit; 546 547 init_gap_stats(stats); 548 nodes = 0; 549 radix = bl->bl_radix; 550 for (i = 0; i < bl->bl_blocks; ) { 551 /* 552 * Check for skippable subtrees starting at i. 553 */ 554 while (radix != 1) { 555 if (bl->bl_root[nodes].bm_bitmap == 0) { 556 if (gap_stats_counting(stats)) 557 update_gap_stats(stats, i); 558 break; 559 } 560 561 /* 562 * Skip subtree root. 563 */ 564 nodes++; 565 radix /= BLIST_RADIX; 566 } 567 if (radix == 1) { 568 /* 569 * Scan leaf. 570 */ 571 mask = bl->bl_root[nodes].bm_bitmap; 572 diff = mask ^ (mask << 1); 573 if (gap_stats_counting(stats)) 574 diff ^= 1; 575 while (diff != 0) { 576 digit = bitpos(diff); 577 update_gap_stats(stats, i + digit); 578 diff ^= bitrange(digit, 1); 579 } 580 } 581 nodes += radix_to_skip(radix * BLIST_RADIX); 582 i += radix * BLIST_RADIX; 583 584 /* 585 * Find max size subtree starting at i. 586 */ 587 for (radix = 1; 588 ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0; 589 radix *= BLIST_RADIX) 590 ; 591 } 592 update_gap_stats(stats, i); 593 dump_gap_stats(stats, s); 594 } 595 596 /************************************************************************ 597 * ALLOCATION SUPPORT FUNCTIONS * 598 ************************************************************************ 599 * 600 * These support functions do all the actual work. They may seem 601 * rather longish, but that's because I've commented them up. The 602 * actual code is straight forward. 603 * 604 */ 605 606 /* 607 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf. 608 * 609 * 'scan' is a leaf node, and its first block is at address 'start'. The 610 * next leaf node could be adjacent, or several nodes away if the least 611 * common ancestor of 'scan' and its neighbor is several levels up. Use 612 * addresses to determine how many meta-nodes lie between the leaves. If 613 * sequence of leaves starting with the next one has enough initial bits 614 * set, clear them and clear the bits in the meta nodes on the path up to 615 * the least common ancestor to mark any subtrees made completely empty. 616 */ 617 static int 618 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount) 619 { 620 u_daddr_t radix; 621 daddr_t blk; 622 int avail, digit; 623 624 start += BLIST_RADIX; 625 for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) { 626 /* Skip meta-nodes, as long as they promise more free blocks. */ 627 radix = BLIST_RADIX; 628 while (((++scan)->bm_bitmap & 1) == 1 && 629 ((blk / radix) & BLIST_MASK) == 0) 630 radix *= BLIST_RADIX; 631 if (~scan->bm_bitmap != 0) { 632 /* 633 * Either there is no next leaf with any free blocks, 634 * or we've reached the next leaf and found that some 635 * of its blocks are not free. In the first case, 636 * bitpos() returns zero here. 637 */ 638 avail = blk - start + bitpos(~scan->bm_bitmap); 639 if (avail < count || avail == 0) { 640 /* 641 * There isn't a next leaf with enough free 642 * blocks at its beginning to bother 643 * allocating. 644 */ 645 return (avail); 646 } 647 maxcount = imin(avail, maxcount); 648 if (maxcount % BLIST_RADIX == 0) { 649 /* 650 * There was no next leaf. Back scan up to 651 * last leaf. 652 */ 653 do { 654 radix /= BLIST_RADIX; 655 --scan; 656 } while (radix != 1); 657 blk -= BLIST_RADIX; 658 } 659 } 660 } 661 662 /* 663 * 'scan' is the last leaf that provides blocks. Clear from 1 to 664 * BLIST_RADIX bits to represent the allocation of those last blocks. 665 */ 666 if (maxcount % BLIST_RADIX != 0) 667 scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX); 668 else 669 scan->bm_bitmap = 0; 670 671 for (;;) { 672 /* Back up over meta-nodes, clearing bits if necessary. */ 673 blk -= BLIST_RADIX; 674 for (radix = BLIST_RADIX; 675 (digit = ((blk / radix) & BLIST_MASK)) == 0; 676 radix *= BLIST_RADIX) { 677 if ((scan--)->bm_bitmap == 0) 678 scan->bm_bitmap ^= 1; 679 } 680 if ((scan--)->bm_bitmap == 0) 681 scan[-digit * radix_to_skip(radix)].bm_bitmap ^= 682 (u_daddr_t)1 << digit; 683 684 if (blk == start) 685 break; 686 /* Clear all the bits of this leaf. */ 687 scan->bm_bitmap = 0; 688 } 689 return (maxcount); 690 } 691 692 /* 693 * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap). 694 * 695 * This function is the core of the allocator. Its execution time is 696 * proportional to log(count), plus height of the tree if the allocation 697 * crosses a leaf boundary. 698 */ 699 static daddr_t 700 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount) 701 { 702 u_daddr_t mask; 703 int bighint, count1, hi, lo, num_shifts; 704 705 count1 = *count - 1; 706 num_shifts = fls(count1); 707 mask = ~scan->bm_bitmap; 708 while ((mask & (mask + 1)) != 0 && num_shifts > 0) { 709 /* 710 * If bit i is 0 in mask, then bits in [i, i + (count1 >> 711 * num_shifts)] are 1 in scan->bm_bitmap. Reduce num_shifts to 712 * 0, while preserving this invariant. The updates to mask 713 * leave fewer bits 0, but each bit that remains 0 represents a 714 * longer string of consecutive 1-bits in scan->bm_bitmap. If 715 * more updates to mask cannot set more bits, because mask is 716 * partitioned with all 1 bits following all 0 bits, the loop 717 * terminates immediately. 718 */ 719 num_shifts--; 720 mask |= mask >> ((count1 >> num_shifts) + 1) / 2; 721 } 722 bighint = count1 >> num_shifts; 723 if (~mask == 0) { 724 /* 725 * Update bighint. There is no allocation bigger than 726 * count1 >> num_shifts starting in this leaf. 727 */ 728 scan->bm_bighint = bighint; 729 return (SWAPBLK_NONE); 730 } 731 732 /* Discard any candidates that appear before blk. */ 733 if ((blk & BLIST_MASK) != 0) { 734 if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) { 735 /* Grow bighint in case all discarded bits are set. */ 736 bighint += blk & BLIST_MASK; 737 mask |= bitrange(0, blk & BLIST_MASK); 738 if (~mask == 0) { 739 scan->bm_bighint = bighint; 740 return (SWAPBLK_NONE); 741 } 742 } 743 blk -= blk & BLIST_MASK; 744 } 745 746 /* 747 * The least significant set bit in mask marks the start of the first 748 * available range of sufficient size. Find its position. 749 */ 750 lo = bitpos(~mask); 751 752 /* 753 * Find how much space is available starting at that position. 754 */ 755 if ((mask & (mask + 1)) != 0) { 756 /* Count the 1 bits starting at position lo. */ 757 hi = bitpos(mask & (mask + 1)) + count1; 758 if (maxcount < hi - lo) 759 hi = lo + maxcount; 760 *count = hi - lo; 761 mask = ~bitrange(lo, *count); 762 } else if (maxcount <= BLIST_RADIX - lo) { 763 /* All the blocks we can use are available here. */ 764 hi = lo + maxcount; 765 *count = maxcount; 766 mask = ~bitrange(lo, *count); 767 if (hi == BLIST_RADIX) 768 scan->bm_bighint = bighint; 769 } else { 770 /* Check next leaf for some of the blocks we want or need. */ 771 count1 = *count - (BLIST_RADIX - lo); 772 maxcount -= BLIST_RADIX - lo; 773 hi = blst_next_leaf_alloc(scan, blk, count1, maxcount); 774 if (hi < count1) 775 /* 776 * The next leaf cannot supply enough blocks to reach 777 * the minimum required allocation. The hint cannot be 778 * updated, because the same allocation request could 779 * be satisfied later, by this leaf, if the state of 780 * the next leaf changes, and without any changes to 781 * this leaf. 782 */ 783 return (SWAPBLK_NONE); 784 *count = BLIST_RADIX - lo + hi; 785 scan->bm_bighint = bighint; 786 } 787 788 /* Clear the allocated bits from this leaf. */ 789 scan->bm_bitmap &= mask; 790 return (blk + lo); 791 } 792 793 /* 794 * blist_meta_alloc() - allocate at a meta in the radix tree. 795 * 796 * Attempt to allocate at a meta node. If we can't, we update 797 * bighint and return a failure. Updating bighint optimize future 798 * calls that hit this node. We have to check for our collapse cases 799 * and we have a few optimizations strewn in as well. 800 */ 801 static daddr_t 802 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, 803 int maxcount, u_daddr_t radix) 804 { 805 daddr_t blk, i, r, skip; 806 u_daddr_t mask; 807 bool scan_from_start; 808 int digit; 809 810 if (radix == 1) 811 return (blst_leaf_alloc(scan, cursor, count, maxcount)); 812 blk = cursor & -(radix * BLIST_RADIX); 813 scan_from_start = (cursor == blk); 814 skip = radix_to_skip(radix); 815 mask = scan->bm_bitmap; 816 817 /* Discard any candidates that appear before cursor. */ 818 digit = (cursor / radix) & BLIST_MASK; 819 mask &= (u_daddr_t)-1 << digit; 820 if (mask == 0) 821 return (SWAPBLK_NONE); 822 823 /* 824 * If the first try is for a block that includes the cursor, pre-undo 825 * the digit * radix offset in the first call; otherwise, ignore the 826 * cursor entirely. 827 */ 828 if (((mask >> digit) & 1) == 1) 829 cursor -= digit * radix; 830 else 831 cursor = blk; 832 833 /* 834 * Examine the nonempty subtree associated with each bit set in mask. 835 */ 836 do { 837 digit = bitpos(mask); 838 i = 1 + digit * skip; 839 if (*count <= scan[i].bm_bighint) { 840 /* 841 * The allocation might fit beginning in the i'th subtree. 842 */ 843 r = blst_meta_alloc(&scan[i], cursor + digit * radix, 844 count, maxcount, radix / BLIST_RADIX); 845 if (r != SWAPBLK_NONE) { 846 if (scan[i].bm_bitmap == 0) 847 scan->bm_bitmap ^= bitrange(digit, 1); 848 return (r); 849 } 850 } 851 cursor = blk; 852 } while ((mask ^= bitrange(digit, 1)) != 0); 853 854 /* 855 * We couldn't allocate count in this subtree. If the whole tree was 856 * scanned, and the last tree node is allocated, update bighint. 857 */ 858 if (scan_from_start && !(digit == BLIST_RADIX - 1 && 859 scan[i].bm_bighint == BLIST_MAX_ALLOC)) 860 scan->bm_bighint = *count - 1; 861 862 return (SWAPBLK_NONE); 863 } 864 865 /* 866 * BLST_LEAF_FREE() - free allocated block from leaf bitmap 867 * 868 */ 869 static void 870 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) 871 { 872 u_daddr_t mask; 873 874 /* 875 * free some data in this bitmap 876 * mask=0000111111111110000 877 * \_________/\__/ 878 * count n 879 */ 880 mask = bitrange(blk & BLIST_MASK, count); 881 KASSERT((scan->bm_bitmap & mask) == 0, 882 ("freeing free block: %jx, size %d, mask %jx", 883 (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); 884 scan->bm_bitmap |= mask; 885 } 886 887 /* 888 * BLST_META_FREE() - free allocated blocks from radix tree meta info 889 * 890 * This support routine frees a range of blocks from the bitmap. 891 * The range must be entirely enclosed by this radix node. If a 892 * meta node, we break the range down recursively to free blocks 893 * in subnodes (which means that this code can free an arbitrary 894 * range whereas the allocation code cannot allocate an arbitrary 895 * range). 896 */ 897 static void 898 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) 899 { 900 daddr_t blk, endBlk, i, skip; 901 int digit, endDigit; 902 903 /* 904 * We could probably do a better job here. We are required to make 905 * bighint at least as large as the biggest allocable block of data. 906 * If we just shoehorn it, a little extra overhead will be incurred 907 * on the next allocation (but only that one typically). 908 */ 909 scan->bm_bighint = BLIST_MAX_ALLOC; 910 911 if (radix == 1) 912 return (blst_leaf_free(scan, freeBlk, count)); 913 914 endBlk = freeBlk + count; 915 blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); 916 /* 917 * blk is first block past the end of the range of this meta node, 918 * or 0 in case of overflow. 919 */ 920 if (blk != 0) 921 endBlk = ummin(endBlk, blk); 922 skip = radix_to_skip(radix); 923 blk = freeBlk & -radix; 924 digit = (blk / radix) & BLIST_MASK; 925 endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK); 926 scan->bm_bitmap |= bitrange(digit, endDigit - digit); 927 for (i = 1 + digit * skip; blk < endBlk; i += skip) { 928 blk += radix; 929 count = ummin(blk, endBlk) - freeBlk; 930 blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX); 931 freeBlk = blk; 932 } 933 } 934 935 /* 936 * BLST_COPY() - copy one radix tree to another 937 * 938 * Locates free space in the source tree and frees it in the destination 939 * tree. The space may not already be free in the destination. 940 */ 941 static void 942 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, 943 daddr_t count) 944 { 945 daddr_t endBlk, i, skip; 946 947 /* 948 * Leaf node 949 */ 950 951 if (radix == 1) { 952 u_daddr_t v = scan->bm_bitmap; 953 954 if (v == (u_daddr_t)-1) { 955 blist_free(dest, blk, count); 956 } else if (v != 0) { 957 int i; 958 959 for (i = 0; i < count; ++i) { 960 if (v & ((u_daddr_t)1 << i)) 961 blist_free(dest, blk + i, 1); 962 } 963 } 964 return; 965 } 966 967 /* 968 * Meta node 969 */ 970 971 if (scan->bm_bitmap == 0) { 972 /* 973 * Source all allocated, leave dest allocated 974 */ 975 return; 976 } 977 978 endBlk = blk + count; 979 skip = radix_to_skip(radix); 980 for (i = 1; blk < endBlk; i += skip) { 981 blk += radix; 982 count = radix; 983 if (blk >= endBlk) 984 count -= blk - endBlk; 985 blst_copy(&scan[i], blk - radix, 986 radix / BLIST_RADIX, dest, count); 987 } 988 } 989 990 /* 991 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap 992 * 993 * This routine allocates all blocks in the specified range 994 * regardless of any existing allocations in that range. Returns 995 * the number of blocks allocated by the call. 996 */ 997 static daddr_t 998 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) 999 { 1000 daddr_t nblks; 1001 u_daddr_t mask; 1002 1003 mask = bitrange(blk & BLIST_MASK, count); 1004 1005 /* Count the number of blocks that we are allocating. */ 1006 nblks = bitcount64(scan->bm_bitmap & mask); 1007 1008 scan->bm_bitmap &= ~mask; 1009 return (nblks); 1010 } 1011 1012 /* 1013 * BLIST_META_FILL() - allocate specific blocks at a meta node 1014 * 1015 * This routine allocates the specified range of blocks, 1016 * regardless of any existing allocations in the range. The 1017 * range must be within the extent of this node. Returns the 1018 * number of blocks allocated by the call. 1019 */ 1020 static daddr_t 1021 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix) 1022 { 1023 daddr_t blk, endBlk, i, nblks, skip; 1024 int digit; 1025 1026 if (radix == 1) 1027 return (blst_leaf_fill(scan, allocBlk, count)); 1028 1029 endBlk = allocBlk + count; 1030 blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); 1031 /* 1032 * blk is first block past the end of the range of this meta node, 1033 * or 0 in case of overflow. 1034 */ 1035 if (blk != 0) 1036 endBlk = ummin(endBlk, blk); 1037 skip = radix_to_skip(radix); 1038 blk = allocBlk & -radix; 1039 nblks = 0; 1040 while (blk < endBlk) { 1041 digit = (blk / radix) & BLIST_MASK; 1042 i = 1 + digit * skip; 1043 blk += radix; 1044 count = ummin(blk, endBlk) - allocBlk; 1045 nblks += blst_meta_fill(&scan[i], allocBlk, count, 1046 radix / BLIST_RADIX); 1047 if (scan[i].bm_bitmap == 0) 1048 scan->bm_bitmap &= ~((u_daddr_t)1 << digit); 1049 allocBlk = blk; 1050 } 1051 return (nblks); 1052 } 1053 1054 #ifdef BLIST_DEBUG 1055 1056 static void 1057 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) 1058 { 1059 daddr_t skip; 1060 u_daddr_t mask; 1061 int digit; 1062 1063 if (radix == 1) { 1064 printf( 1065 "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n", 1066 tab, tab, "", 1067 (long long)blk, (long long)BLIST_RADIX, 1068 (int)(1 + (BLIST_RADIX - 1) / 4), 1069 (long long)scan->bm_bitmap, 1070 (long long)scan->bm_bighint 1071 ); 1072 return; 1073 } 1074 1075 printf( 1076 "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n", 1077 tab, tab, "", 1078 (long long)blk, (long long)radix * BLIST_RADIX, 1079 (long long)radix * BLIST_RADIX, 1080 (int)(1 + (BLIST_RADIX - 1) / 4), 1081 (long long)scan->bm_bitmap, 1082 (long long)scan->bm_bighint 1083 ); 1084 1085 skip = radix_to_skip(radix); 1086 tab += 4; 1087 1088 mask = scan->bm_bitmap; 1089 /* Examine the nonempty subtree associated with each bit set in mask */ 1090 do { 1091 digit = bitpos(mask); 1092 blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, 1093 radix / BLIST_RADIX, tab); 1094 } while ((mask ^= bitrange(digit, 1)) != 0); 1095 tab -= 4; 1096 1097 printf( 1098 "%*.*s}\n", 1099 tab, tab, "" 1100 ); 1101 } 1102 1103 #endif 1104 1105 #ifdef BLIST_DEBUG 1106 1107 int 1108 main(int ac, char **av) 1109 { 1110 daddr_t size = BLIST_RADIX * BLIST_RADIX; 1111 int i; 1112 blist_t bl; 1113 struct sbuf *s; 1114 1115 for (i = 1; i < ac; ++i) { 1116 const char *ptr = av[i]; 1117 if (*ptr != '-') { 1118 size = strtoll(ptr, NULL, 0); 1119 continue; 1120 } 1121 ptr += 2; 1122 fprintf(stderr, "Bad option: %s\n", ptr - 2); 1123 exit(1); 1124 } 1125 bl = blist_create(size, M_WAITOK); 1126 if (bl == NULL) { 1127 fprintf(stderr, "blist_create failed\n"); 1128 exit(1); 1129 } 1130 blist_free(bl, 0, size); 1131 1132 for (;;) { 1133 char buf[1024]; 1134 long long da = 0; 1135 int count = 0, maxcount = 0; 1136 1137 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), 1138 (long long)size, (long long)bl->bl_radix * BLIST_RADIX); 1139 fflush(stdout); 1140 if (fgets(buf, sizeof(buf), stdin) == NULL) 1141 break; 1142 switch(buf[0]) { 1143 case 'r': 1144 if (sscanf(buf + 1, "%d", &count) == 1) { 1145 blist_resize(&bl, count, 1, M_WAITOK); 1146 } else { 1147 printf("?\n"); 1148 } 1149 case 'p': 1150 blist_print(bl); 1151 break; 1152 case 's': 1153 s = sbuf_new_auto(); 1154 blist_stats(bl, s); 1155 sbuf_finish(s); 1156 printf("%s", sbuf_data(s)); 1157 sbuf_delete(s); 1158 break; 1159 case 'a': 1160 if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) { 1161 daddr_t blk = blist_alloc(bl, &count, maxcount); 1162 printf(" R=%08llx, c=%08d\n", 1163 (long long)blk, count); 1164 } else { 1165 printf("?\n"); 1166 } 1167 break; 1168 case 'f': 1169 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { 1170 blist_free(bl, da, count); 1171 } else { 1172 printf("?\n"); 1173 } 1174 break; 1175 case 'l': 1176 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { 1177 printf(" n=%jd\n", 1178 (intmax_t)blist_fill(bl, da, count)); 1179 } else { 1180 printf("?\n"); 1181 } 1182 break; 1183 case '?': 1184 case 'h': 1185 puts( 1186 "p -print\n" 1187 "s -stats\n" 1188 "a %d %d -allocate\n" 1189 "f %x %d -free\n" 1190 "l %x %d -fill\n" 1191 "r %d -resize\n" 1192 "h/? -help\n" 1193 "q -quit" 1194 ); 1195 break; 1196 case 'q': 1197 break; 1198 default: 1199 printf("?\n"); 1200 break; 1201 } 1202 if (buf[0] == 'q') 1203 break; 1204 } 1205 return (0); 1206 } 1207 1208 #endif 1209