1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 /* 30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 31 * 32 * This module implements a general bitmap allocator/deallocator. The 33 * allocator eats around 2 bits per 'block'. The module does not 34 * try to interpret the meaning of a 'block' other than to return 35 * SWAPBLK_NONE on an allocation failure. 
36 * 37 * A radix tree controls access to pieces of the bitmap, and includes 38 * auxiliary information at each interior node about the availabilty of 39 * contiguous free blocks in the subtree rooted at that node. A radix 40 * constant defines the size of the bitmaps contained in a leaf node 41 * and the number of descendents of each of the meta (interior) nodes. 42 * Each subtree is associated with a range of blocks. The root of any 43 * subtree stores a hint field that defines an upper bound on the size 44 * of the largest allocation that can begin in the associated block 45 * range. A hint is an upper bound on a potential allocation, but not 46 * necessarily a tight upper bound. 47 * 48 * The bitmap field in each node directs the search for available blocks. 49 * For a leaf node, a bit is set if the corresponding block is free. For a 50 * meta node, a bit is set if the corresponding subtree contains a free 51 * block somewhere within it. The search at a meta node considers only 52 * children of that node that represent a range that includes a free block. 53 * 54 * The hinting greatly increases code efficiency for allocations while 55 * the general radix structure optimizes both allocations and frees. The 56 * radix tree should be able to operate well no matter how much 57 * fragmentation there is and no matter how large a bitmap is used. 58 * 59 * The blist code wires all necessary memory at creation time. Neither 60 * allocations nor frees require interaction with the memory subsystem. 61 * The non-blocking nature of allocations and frees is required by swap 62 * code (vm/swap_pager.c). 63 * 64 * LAYOUT: The radix tree is laid out recursively using a linear array. 65 * Each meta node is immediately followed (laid out sequentially in 66 * memory) by BLIST_RADIX lower-level nodes. This is a recursive 67 * structure but one that can be easily scanned through a very simple 68 * 'skip' calculation. 
 * The memory allocation is only large enough to
 * cover the number of blocks requested at creation time.  Nodes that
 * represent blocks beyond that limit, nodes that would never be read
 * or written, are not allocated, so that the last of the
 * BLIST_RADIX lower-level nodes of some nodes may not be allocated.
 *
 * NOTE: the allocator cannot currently allocate more than
 * BLIST_RADIX blocks per call.  It will panic with 'allocation too
 * large' if you try.  This is an area that could use improvement.  The
 * radix is large enough that this restriction does not affect the swap
 * system, though.  Currently only the allocation code is affected by
 * this algorithmic unfeature.  The freeing code can handle arbitrary
 * ranges.
 *
 * This code can be compiled stand-alone for debugging.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

/* Userland shims for the kernel facilities used below. */
#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define	malloc(a,b,c)	calloc(a, 1)
#define	free(a,b)	free(a)
#define	ummin(a,b)	((a) < (b) ? (a) : (b))
#define	imin(a,b)	((a) < (b) ? (a) : (b))
#define	KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
    int *count, int maxcount);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix);
static void	blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void	blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
    u_daddr_t radix);
static void	blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
    blist_t dest, daddr_t count);
static daddr_t	blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

/* Low-order bits of a block address that select a position within a leaf. */
#define	BLIST_MASK	(BLIST_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
 * is short for BLIST_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / m)) / (m - 1)
 * skip = radix / (m - 1)
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix / BLIST_MASK);
}

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	/* Intersect 'ones from bit n up' with 'ones below bit n + count'. */
	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
}

/*
 * Return the index of the least significant set bit in 'mask'.
 * The caller must ensure mask != 0; ffsll(0) is 0, which would yield -1.
 */
static inline int
bitpos(u_daddr_t mask)
{

	_Static_assert(sizeof(long long) >= sizeof(mask),
	    "mask too big for ffsll()");
	return (ffsll(mask) - 1);
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_RADIX blocks.  Returns NULL only if the underlying
 *	allocation fails (e.g. with M_NOWAIT flags).
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.  Each loop
	 * iteration adds one level of meta nodes; only nodes that can ever
	 * be visited for 'blocks' blocks are counted.
	 */
	nodes = 1;
	for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0;
	    radix *= BLIST_RADIX)
		nodes += 1 + (blocks - 1) / BLIST_RADIX / radix;

	/*
	 * Include a sentinel node to ensure that cross-leaf scans stay within
	 * the bounds of the allocation.
	 */
	if (blocks % BLIST_RADIX == 0)
		nodes++;

	/* The node array is a flexible trailing member of struct blist. */
	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 *
 *	*count is the minimum number of blocks requested and is updated to
 *	the number actually allocated (between *count and maxcount).
 */
daddr_t
blist_alloc(blist_t bl, int *count, int maxcount)
{
	daddr_t blk, cursor;

	KASSERT(*count <= maxcount,
	    ("invalid parameters %d > %d", *count, maxcount));
	KASSERT(*count <= BLIST_MAX_ALLOC,
	    ("minimum allocation too large: %d", *count));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	for (cursor = bl->bl_cursor;; cursor = 0) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= *count;
			/* Advance the cursor past the allocation, wrapping. */
			bl->bl_cursor = blk + *count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		if (cursor == 0)
			return (SWAPBLK_NONE);
	}
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Return the base
 *			of a contiguous region.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
325 */ 326 daddr_t 327 blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 328 { 329 daddr_t filled; 330 331 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 332 ("filling invalid range: blkno %jx, count %d, blocks %jd", 333 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 334 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); 335 bl->bl_avail -= filled; 336 return (filled); 337 } 338 339 /* 340 * blist_resize() - resize an existing radix tree to handle the 341 * specified number of blocks. This will reallocate 342 * the tree and transfer the previous bitmap to the new 343 * one. When extending the tree you can specify whether 344 * the new blocks are to left allocated or freed. 345 */ 346 void 347 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 348 { 349 blist_t newbl = blist_create(count, flags); 350 blist_t save = *pbl; 351 352 *pbl = newbl; 353 if (count > save->bl_blocks) 354 count = save->bl_blocks; 355 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 356 357 /* 358 * If resizing upwards, should we free the new space or not? 359 */ 360 if (freenew && count < newbl->bl_blocks) { 361 blist_free(newbl, count, newbl->bl_blocks - count); 362 } 363 blist_destroy(save); 364 } 365 366 #ifdef BLIST_DEBUG 367 368 /* 369 * blist_print() - dump radix tree 370 */ 371 void 372 blist_print(blist_t bl) 373 { 374 printf("BLIST avail = %jd, cursor = %08jx {\n", 375 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); 376 377 if (bl->bl_root->bm_bitmap != 0) 378 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 379 printf("}\n"); 380 } 381 382 #endif 383 384 static const u_daddr_t fib[] = { 385 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 386 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 387 514229, 832040, 1346269, 2178309, 3524578, 388 }; 389 390 /* 391 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *			   - or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()	- initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()	- update stats on gap sizes
 *
 *	Called once at each gap boundary.  The first call (while not
 *	counting) records the gap start; the matching second call computes
 *	the gap size and folds it into the statistics.
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the fibonacci range that contains size,
	 * expecting to find it in an early range.  Gallop (doubling 'hi')
	 * to bracket the bucket, then binary search within the bracket.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	/*
	 * Incremental integer mean: 'avg' carries the quotient and 'err'
	 * the accumulated remainder (err == sum - num * avg).
	 */
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

/*
 * dump_gap_stats()	- print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, " count | size range\n");
	sbuf_printf(s, " ----- | ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd | ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	/* The last nonzero bucket is open-ended up to the observed max. */
	sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	radix = bl->bl_radix;
	for (i = 0; i < bl->bl_blocks; ) {
		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix != 1) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				/* Fully allocated subtree: ends any gap. */
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_RADIX;
		}
		if (radix == 1) {
			/*
			 * Scan leaf.  'diff' has a bit set at every position
			 * where the free/allocated state changes.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix * BLIST_RADIX);
		i += radix * BLIST_RADIX;

		/*
		 * Find max size subtree starting at i.
		 */
		for (radix = 1;
		    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
		    radix *= BLIST_RADIX)
			;
	}
	/* Close any gap still open at the end of the block range. */
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straight forward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
 *
 *	'scan' is a leaf node, and its first block is at address 'start'.  The
 *	next leaf node could be adjacent, or several nodes away if the least
 *	common ancestor of 'scan' and its neighbor is several levels up.  Use
 *	addresses to determine how many meta-nodes lie between the leaves.  If
 *	sequence of leaves starting with the next one has enough initial bits
 *	set, clear them and clear the bits in the meta nodes on the path up to
 *	the least common ancestor to mark any subtrees made completely empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
{
	u_daddr_t radix;
	daddr_t blk;
	int avail, digit;

	start += BLIST_RADIX;
	for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
		/* Skip meta-nodes, as long as they promise more free blocks. */
		radix = BLIST_RADIX;
		while (((++scan)->bm_bitmap & 1) == 1 &&
		    ((blk / radix) & BLIST_MASK) == 0)
			radix *= BLIST_RADIX;
		if (~scan->bm_bitmap != 0) {
			/*
			 * Either there is no next leaf with any free blocks,
			 * or we've reached the next leaf and found that some
			 * of its blocks are not free.  In the first case,
			 * bitpos() returns zero here.
			 */
			avail = blk - start + bitpos(~scan->bm_bitmap);
			if (avail < count || avail == 0) {
				/*
				 * There isn't a next leaf with enough free
				 * blocks at its beginning to bother
				 * allocating.
				 */
				return (avail);
			}
			maxcount = imin(avail, maxcount);
			if (maxcount % BLIST_RADIX == 0) {
				/*
				 * There was no next leaf.  Back scan up to
				 * last leaf.
				 */
				do {
					radix /= BLIST_RADIX;
					--scan;
				} while (radix != 1);
				blk -= BLIST_RADIX;
			}
		}
	}

	/*
	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
	 * BLIST_RADIX bits to represent the allocation of those last blocks.
	 */
	if (maxcount % BLIST_RADIX != 0)
		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
	else
		scan->bm_bitmap = 0;

	for (;;) {
		/* Back up over meta-nodes, clearing bits if necessary. */
		blk -= BLIST_RADIX;
		for (radix = BLIST_RADIX;
		    (digit = ((blk / radix) & BLIST_MASK)) == 0;
		    radix *= BLIST_RADIX) {
			if ((scan--)->bm_bitmap == 0)
				scan->bm_bitmap ^= 1;
		}
		if ((scan--)->bm_bitmap == 0)
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;

		if (blk == start)
			break;
		/* Clear all the bits of this leaf. */
		scan->bm_bitmap = 0;
	}
	return (maxcount);
}

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
{
	u_daddr_t mask;
	int bighint, count1, hi, lo, num_shifts;

	count1 = *count - 1;
	num_shifts = fls(count1);
	/* In 'mask', a 0 bit marks a free block. */
	mask = ~scan->bm_bitmap;
	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
		/*
		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
		 * 0, while preserving this invariant.  The updates to mask
		 * leave fewer bits 0, but each bit that remains 0 represents a
		 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
		 * more updates to mask cannot set more bits, because mask is
		 * partitioned with all 1 bits following all 0 bits, the loop
		 * terminates immediately.
		 */
		num_shifts--;
		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
	}
	bighint = count1 >> num_shifts;
	if (~mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than
		 * count1 >> num_shifts starting in this leaf.
		 */
		scan->bm_bighint = bighint;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_MASK) != 0) {
		if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
			/* Grow bighint in case all discarded bits are set. */
			bighint += blk & BLIST_MASK;
			mask |= bitrange(0, blk & BLIST_MASK);
			if (~mask == 0) {
				scan->bm_bighint = bighint;
				return (SWAPBLK_NONE);
			}
		}
		blk -= blk & BLIST_MASK;
	}

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Find its position.
	 */
	lo = bitpos(~mask);

	/*
	 * Find how much space is available starting at that position.
	 */
	if ((mask & (mask + 1)) != 0) {
		/* Count the 1 bits starting at position lo. */
		hi = bitpos(mask & (mask + 1)) + count1;
		if (maxcount < hi - lo)
			hi = lo + maxcount;
		*count = hi - lo;
		mask = ~bitrange(lo, *count);
	} else if (maxcount <= BLIST_RADIX - lo) {
		/* All the blocks we can use are available here. */
		hi = lo + maxcount;
		*count = maxcount;
		mask = ~bitrange(lo, *count);
		if (hi == BLIST_RADIX)
			scan->bm_bighint = bighint;
	} else {
		/* Check next leaf for some of the blocks we want or need. */
		count1 = *count - (BLIST_RADIX - lo);
		maxcount -= BLIST_RADIX - lo;
		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
		if (hi < count1)
			/*
			 * The next leaf cannot supply enough blocks to reach
			 * the minimum required allocation.  The hint cannot be
			 * updated, because the same allocation request could
			 * be satisfied later, by this leaf, if the state of
			 * the next leaf changes, and without any changes to
			 * this leaf.
			 */
			return (SWAPBLK_NONE);
		*count = BLIST_RADIX - lo + hi;
		scan->bm_bighint = bighint;
	}

	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= mask;
	return (blk + lo);
}

/*
 * blist_meta_alloc() -	allocate at a meta in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.
 *	We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == 1)
		return (blst_leaf_alloc(scan, cursor, count, maxcount));
	/* 'blk' is the first block covered by this meta node. */
	blk = cursor & -(radix * BLIST_RADIX);
	scan_from_start = (cursor == blk);
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (*count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, maxcount, radix / BLIST_RADIX);
			if (r != SWAPBLK_NONE) {
				/* Subtree now fully allocated: clear its bit. */
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = *count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == 1)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = freeBlk + count;
	blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
	/*
	 * blk is first block past the end of the range of this meta node,
	 * or 0 in case of overflow.
	 */
	if (blk != 0)
		endBlk = ummin(endBlk, blk);
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
	/* Mark every touched child subtree as containing free blocks. */
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
		freeBlk = blk;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == 1) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			/* Entire leaf is free: free it in one call. */
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix,
		    radix / BLIST_RADIX, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLIST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == 1)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = allocBlk + count;
	blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
	/*
	 * blk is first block past the end of the range of this meta node,
	 * or 0 in case of overflow.
	 */
	if (blk != 0)
		endBlk = ummin(endBlk, blk);
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count,
		    radix / BLIST_RADIX);
		/* Subtree now fully allocated: clear its bit here. */
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

/*
 * blst_radix_print() - recursively dump one subtree of the radix tree.
 */
static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == 1) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)BLIST_RADIX,
		    (int)(1 + (BLIST_RADIX - 1) / 4),
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix * BLIST_RADIX,
	    (long long)radix * BLIST_RADIX,
	    (int)(1 + (BLIST_RADIX - 1) / 4),
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix / BLIST_RADIX, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

/*
 * Stand-alone interactive test harness: reads single-letter commands from
 * stdin to allocate, free, fill, resize, and print the tree.
 */
int
main(int ac, char **av)
{
	daddr_t size = BLIST_RADIX * BLIST_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			/* A bare argument sets the tree size. */
			size = strtoll(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	if (bl == NULL) {
		fprintf(stderr, "blist_create failed\n");
		exit(1);
	}
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		int count = 0, maxcount = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%d", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/*
			 * Falls through to 'p', so the tree is printed after
			 * a resize -- NOTE(review): presumably intentional;
			 * add a break here if not.
			 */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
				daddr_t blk = blist_alloc(bl, &count, maxcount);
				printf(" R=%08llx, c=%08d\n",
				    (long long)blk, count);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				printf(" n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p -print\n"
			    "s -stats\n"
			    "a %d %d -allocate\n"
			    "f %x %d -free\n"
			    "l %x %d -fill\n"
			    "r %d -resize\n"
			    "h/? -help\n"
			    "q -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif