1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 /* 30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 31 * 32 * This module implements a general bitmap allocator/deallocator. The 33 * allocator eats around 2 bits per 'block'. The module does not 34 * try to interpret the meaning of a 'block' other than to return 35 * SWAPBLK_NONE on an allocation failure. 36 * 37 * A radix tree controls access to pieces of the bitmap, and includes 38 * auxiliary information at each interior node about the availabilty of 39 * contiguous free blocks in the subtree rooted at that node. A radix 40 * constant defines the size of the bitmaps contained in a leaf node 41 * and the number of descendents of each of the meta (interior) nodes. 42 * Each subtree is associated with a range of blocks. The root of any 43 * subtree stores a hint field that defines an upper bound on the size 44 * of the largest allocation that can begin in the associated block 45 * range. A hint is an upper bound on a potential allocation, but not 46 * necessarily a tight upper bound. 47 * 48 * The bitmap field in each node directs the search for available blocks. 49 * For a leaf node, a bit is set if the corresponding block is free. For a 50 * meta node, a bit is set if the corresponding subtree contains a free 51 * block somewhere within it. The search at a meta node considers only 52 * children of that node that represent a range that includes a free block. 53 * 54 * The hinting greatly increases code efficiency for allocations while 55 * the general radix structure optimizes both allocations and frees. The 56 * radix tree should be able to operate well no matter how much 57 * fragmentation there is and no matter how large a bitmap is used. 58 * 59 * The blist code wires all necessary memory at creation time. Neither 60 * allocations nor frees require interaction with the memory subsystem. 61 * The non-blocking nature of allocations and frees is required by swap 62 * code (vm/swap_pager.c). 63 * 64 * LAYOUT: The radix tree is laid out recursively using a linear array. 65 * Each meta node is immediately followed (laid out sequentially in 66 * memory) by BLIST_RADIX lower-level nodes. This is a recursive 67 * structure but one that can be easily scanned through a very simple 68 * 'skip' calculation. The memory allocation is only large enough to 69 * cover the number of blocks requested at creation time. Nodes that 70 * represent blocks beyond that limit, nodes that would never be read 71 * or written, are not allocated, so that the last of the 72 * BLIST_RADIX lower-level nodes of a some nodes may not be allocated. 73 * 74 * NOTE: the allocator cannot currently allocate more than 75 * BLIST_RADIX blocks per call. It will panic with 'allocation too 76 * large' if you try. This is an area that could use improvement. The 77 * radix is large enough that this restriction does not effect the swap 78 * system, though. Currently only the allocation code is affected by 79 * this algorithmic unfeature. The freeing code can handle arbitrary 80 * ranges. 81 * 82 * This code can be compiled stand-alone for debugging. 83 */ 84 85 #include <sys/cdefs.h> 86 __FBSDID("$FreeBSD$"); 87 88 #ifdef _KERNEL 89 90 #include <sys/param.h> 91 #include <sys/systm.h> 92 #include <sys/lock.h> 93 #include <sys/kernel.h> 94 #include <sys/blist.h> 95 #include <sys/malloc.h> 96 #include <sys/sbuf.h> 97 #include <sys/proc.h> 98 #include <sys/mutex.h> 99 100 #else 101 102 #ifndef BLIST_NO_DEBUG 103 #define BLIST_DEBUG 104 #endif 105 106 #include <sys/errno.h> 107 #include <sys/types.h> 108 #include <sys/malloc.h> 109 #include <sys/sbuf.h> 110 #include <assert.h> 111 #include <stdio.h> 112 #include <string.h> 113 #include <stddef.h> 114 #include <stdlib.h> 115 #include <stdarg.h> 116 #include <stdbool.h> 117 118 #define bitcount64(x) __bitcount64((uint64_t)(x)) 119 #define malloc(a,b,c) calloc(a, 1) 120 #define free(a,b) free(a) 121 #define ummin(a,b) ((a) < (b) ? (a) : (b)) 122 #define imin(a,b) ((a) < (b) ? (a) : (b)) 123 #define KASSERT(a,b) assert(a) 124 125 #include <sys/blist.h> 126 127 #endif 128 129 /* 130 * static support functions 131 */ 132 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, 133 int *count, int maxcount); 134 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, 135 int maxcount, u_daddr_t radix); 136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); 137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 138 u_daddr_t radix); 139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 140 blist_t dest, daddr_t count); 141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); 142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, 143 u_daddr_t radix); 144 #ifndef _KERNEL 145 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, 146 int tab); 147 #endif 148 149 #ifdef _KERNEL 150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); 151 #endif 152 153 #define BLIST_MASK (BLIST_RADIX - 1) 154 155 /* 156 * For a subtree that can represent the state of up to 'radix' blocks, the 157 * number of leaf nodes of the subtree is L=radix/BLIST_RADIX. If 'm' 158 * is short for BLIST_RADIX, then for a tree of height h with L=m**h 159 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, 160 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' 161 * in the 'meta' functions that process subtrees. Since integer division 162 * discards remainders, we can express this computation as 163 * skip = (m * m**h) / (m - 1) 164 * skip = (m * (radix / m)) / (m - 1) 165 * skip = radix / (m - 1) 166 * so that simple integer division by a constant can safely be used for the 167 * calculation. 168 */ 169 static inline daddr_t 170 radix_to_skip(daddr_t radix) 171 { 172 173 return (radix / BLIST_MASK); 174 } 175 176 /* 177 * Provide a mask with count bits set, starting as position n. 178 */ 179 static inline u_daddr_t 180 bitrange(int n, int count) 181 { 182 183 return (((u_daddr_t)-1 << n) & 184 ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count)))); 185 } 186 187 /* 188 * Find the first bit set in a u_daddr_t. 189 */ 190 static inline int 191 generic_bitpos(u_daddr_t mask) 192 { 193 int hi, lo, mid; 194 195 lo = 0; 196 hi = BLIST_RADIX; 197 while (lo + 1 < hi) { 198 mid = (lo + hi) >> 1; 199 if (mask & bitrange(0, mid)) 200 hi = mid; 201 else 202 lo = mid; 203 } 204 return (lo); 205 } 206 207 static inline int 208 bitpos(u_daddr_t mask) 209 { 210 211 switch (sizeof(mask)) { 212 #ifdef HAVE_INLINE_FFSLL 213 case sizeof(long long): 214 return (ffsll(mask) - 1); 215 #endif 216 #ifdef HAVE_INLINE_FFS 217 case sizeof(int): 218 return (ffs(mask) - 1); 219 #endif 220 default: 221 return (generic_bitpos(mask)); 222 } 223 } 224 225 /* 226 * blist_create() - create a blist capable of handling up to the specified 227 * number of blocks 228 * 229 * blocks - must be greater than 0 230 * flags - malloc flags 231 * 232 * The smallest blist consists of a single leaf node capable of 233 * managing BLIST_RADIX blocks. 234 */ 235 blist_t 236 blist_create(daddr_t blocks, int flags) 237 { 238 blist_t bl; 239 u_daddr_t nodes, radix; 240 241 KASSERT(blocks > 0, ("invalid block count")); 242 243 /* 244 * Calculate the radix and node count used for scanning. 245 */ 246 nodes = 1; 247 for (radix = 1; radix <= blocks / BLIST_RADIX; radix *= BLIST_RADIX) 248 nodes += 1 + (blocks - 1) / radix / BLIST_RADIX; 249 250 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags | 251 M_ZERO); 252 if (bl == NULL) 253 return (NULL); 254 255 bl->bl_blocks = blocks; 256 bl->bl_radix = radix; 257 258 #if defined(BLIST_DEBUG) 259 printf( 260 "BLIST representing %lld blocks (%lld MB of swap)" 261 ", requiring %lldK of ram\n", 262 (long long)bl->bl_blocks, 263 (long long)bl->bl_blocks * 4 / 1024, 264 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 265 ); 266 printf("BLIST raw radix tree contains %lld records\n", 267 (long long)nodes); 268 #endif 269 270 return (bl); 271 } 272 273 void 274 blist_destroy(blist_t bl) 275 { 276 277 free(bl, M_SWAP); 278 } 279 280 /* 281 * blist_alloc() - reserve space in the block bitmap. Return the base 282 * of a contiguous region or SWAPBLK_NONE if space could 283 * not be allocated. 284 */ 285 daddr_t 286 blist_alloc(blist_t bl, int *count, int maxcount) 287 { 288 daddr_t blk, cursor; 289 290 KASSERT(*count <= maxcount, 291 ("invalid parameters %d > %d", *count, maxcount)); 292 KASSERT(*count <= BLIST_MAX_ALLOC, 293 ("minimum allocation too large: %d", *count)); 294 295 /* 296 * This loop iterates at most twice. An allocation failure in the 297 * first iteration leads to a second iteration only if the cursor was 298 * non-zero. When the cursor is zero, an allocation failure will 299 * stop further iterations. 300 */ 301 for (cursor = bl->bl_cursor;; cursor = 0) { 302 blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount, 303 bl->bl_radix); 304 if (blk != SWAPBLK_NONE) { 305 bl->bl_avail -= *count; 306 bl->bl_cursor = blk + *count; 307 if (bl->bl_cursor == bl->bl_blocks) 308 bl->bl_cursor = 0; 309 return (blk); 310 } 311 if (cursor == 0) 312 return (SWAPBLK_NONE); 313 } 314 } 315 316 /* 317 * blist_avail() - return the number of free blocks. 318 */ 319 daddr_t 320 blist_avail(blist_t bl) 321 { 322 323 return (bl->bl_avail); 324 } 325 326 /* 327 * blist_free() - free up space in the block bitmap. Return the base 328 * of a contiguous region. 329 */ 330 void 331 blist_free(blist_t bl, daddr_t blkno, daddr_t count) 332 { 333 334 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 335 ("freeing invalid range: blkno %jx, count %d, blocks %jd", 336 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 337 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); 338 bl->bl_avail += count; 339 } 340 341 /* 342 * blist_fill() - mark a region in the block bitmap as off-limits 343 * to the allocator (i.e. allocate it), ignoring any 344 * existing allocations. Return the number of blocks 345 * actually filled that were free before the call. 346 */ 347 daddr_t 348 blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 349 { 350 daddr_t filled; 351 352 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 353 ("filling invalid range: blkno %jx, count %d, blocks %jd", 354 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 355 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); 356 bl->bl_avail -= filled; 357 return (filled); 358 } 359 360 /* 361 * blist_resize() - resize an existing radix tree to handle the 362 * specified number of blocks. This will reallocate 363 * the tree and transfer the previous bitmap to the new 364 * one. When extending the tree you can specify whether 365 * the new blocks are to left allocated or freed. 366 */ 367 void 368 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 369 { 370 blist_t newbl = blist_create(count, flags); 371 blist_t save = *pbl; 372 373 *pbl = newbl; 374 if (count > save->bl_blocks) 375 count = save->bl_blocks; 376 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 377 378 /* 379 * If resizing upwards, should we free the new space or not? 380 */ 381 if (freenew && count < newbl->bl_blocks) { 382 blist_free(newbl, count, newbl->bl_blocks - count); 383 } 384 blist_destroy(save); 385 } 386 387 #ifdef BLIST_DEBUG 388 389 /* 390 * blist_print() - dump radix tree 391 */ 392 void 393 blist_print(blist_t bl) 394 { 395 printf("BLIST avail = %jd, cursor = %08jx {\n", 396 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); 397 398 if (bl->bl_root->bm_bitmap != 0) 399 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 400 printf("}\n"); 401 } 402 403 #endif 404 405 static const u_daddr_t fib[] = { 406 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 407 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 408 514229, 832040, 1346269, 2178309, 3524578, 409 }; 410 411 /* 412 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 413 */ 414 struct gap_stats { 415 daddr_t start; /* current gap start, or SWAPBLK_NONE */ 416 daddr_t num; /* number of gaps observed */ 417 daddr_t max; /* largest gap size */ 418 daddr_t avg; /* average gap size */ 419 daddr_t err; /* sum - num * avg */ 420 daddr_t histo[nitems(fib)]; /* # gaps in each size range */ 421 int max_bucket; /* last histo elt with nonzero val */ 422 }; 423 424 /* 425 * gap_stats_counting() - is the state 'counting 1 bits'? 426 * or 'skipping 0 bits'? 427 */ 428 static inline bool 429 gap_stats_counting(const struct gap_stats *stats) 430 { 431 432 return (stats->start != SWAPBLK_NONE); 433 } 434 435 /* 436 * init_gap_stats() - initialize stats on gap sizes 437 */ 438 static inline void 439 init_gap_stats(struct gap_stats *stats) 440 { 441 442 bzero(stats, sizeof(*stats)); 443 stats->start = SWAPBLK_NONE; 444 } 445 446 /* 447 * update_gap_stats() - update stats on gap sizes 448 */ 449 static void 450 update_gap_stats(struct gap_stats *stats, daddr_t posn) 451 { 452 daddr_t size; 453 int hi, lo, mid; 454 455 if (!gap_stats_counting(stats)) { 456 stats->start = posn; 457 return; 458 } 459 size = posn - stats->start; 460 stats->start = SWAPBLK_NONE; 461 if (size > stats->max) 462 stats->max = size; 463 464 /* 465 * Find the fibonacci range that contains size, 466 * expecting to find it in an early range. 467 */ 468 lo = 0; 469 hi = 1; 470 while (hi < nitems(fib) && fib[hi] <= size) { 471 lo = hi; 472 hi *= 2; 473 } 474 if (hi >= nitems(fib)) 475 hi = nitems(fib); 476 while (lo + 1 != hi) { 477 mid = (lo + hi) >> 1; 478 if (fib[mid] <= size) 479 lo = mid; 480 else 481 hi = mid; 482 } 483 stats->histo[lo]++; 484 if (lo > stats->max_bucket) 485 stats->max_bucket = lo; 486 stats->err += size - stats->avg; 487 stats->num++; 488 stats->avg += stats->err / stats->num; 489 stats->err %= stats->num; 490 } 491 492 /* 493 * dump_gap_stats() - print stats on gap sizes 494 */ 495 static inline void 496 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) 497 { 498 int i; 499 500 sbuf_printf(s, "number of maximal free ranges: %jd\n", 501 (intmax_t)stats->num); 502 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); 503 sbuf_printf(s, "average maximal free range size: %jd\n", 504 (intmax_t)stats->avg); 505 sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); 506 sbuf_printf(s, " count | size range\n"); 507 sbuf_printf(s, " ----- | ----------\n"); 508 for (i = 0; i < stats->max_bucket; i++) { 509 if (stats->histo[i] != 0) { 510 sbuf_printf(s, "%20jd | ", 511 (intmax_t)stats->histo[i]); 512 if (fib[i] != fib[i + 1] - 1) 513 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 514 (intmax_t)fib[i + 1] - 1); 515 else 516 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); 517 } 518 } 519 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); 520 if (stats->histo[i] > 1) 521 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 522 (intmax_t)stats->max); 523 else 524 sbuf_printf(s, "%jd\n", (intmax_t)stats->max); 525 } 526 527 /* 528 * blist_stats() - dump radix tree stats 529 */ 530 void 531 blist_stats(blist_t bl, struct sbuf *s) 532 { 533 struct gap_stats gstats; 534 struct gap_stats *stats = &gstats; 535 daddr_t i, nodes, radix; 536 u_daddr_t diff, mask; 537 int digit; 538 539 init_gap_stats(stats); 540 nodes = 0; 541 radix = bl->bl_radix; 542 for (i = 0; i < bl->bl_blocks; ) { 543 /* 544 * Check for skippable subtrees starting at i. 545 */ 546 while (radix != 1) { 547 if (bl->bl_root[nodes].bm_bitmap == 0) { 548 if (gap_stats_counting(stats)) 549 update_gap_stats(stats, i); 550 break; 551 } 552 553 /* 554 * Skip subtree root. 555 */ 556 nodes++; 557 radix /= BLIST_RADIX; 558 } 559 if (radix == 1) { 560 /* 561 * Scan leaf. 562 */ 563 mask = bl->bl_root[nodes].bm_bitmap; 564 diff = mask ^ (mask << 1); 565 if (gap_stats_counting(stats)) 566 diff ^= 1; 567 while (diff != 0) { 568 digit = bitpos(diff); 569 update_gap_stats(stats, i + digit); 570 diff ^= bitrange(digit, 1); 571 } 572 } 573 nodes += radix_to_skip(radix * BLIST_RADIX); 574 i += radix * BLIST_RADIX; 575 576 /* 577 * Find max size subtree starting at i. 578 */ 579 for (radix = 1; 580 ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0; 581 radix *= BLIST_RADIX) 582 ; 583 } 584 update_gap_stats(stats, i); 585 dump_gap_stats(stats, s); 586 } 587 588 /************************************************************************ 589 * ALLOCATION SUPPORT FUNCTIONS * 590 ************************************************************************ 591 * 592 * These support functions do all the actual work. They may seem 593 * rather longish, but that's because I've commented them up. The 594 * actual code is straight forward. 595 * 596 */ 597 598 /* 599 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf. 600 * 601 * 'scan' is a leaf node, and its first block is at address 'start'. The 602 * next leaf node could be adjacent, or several nodes away if the least 603 * common ancestor of 'scan' and its neighbor is several levels up. Use 604 * addresses to determine how many meta-nodes lie between the leaves. If 605 * sequence of leaves starting with the next one has enough initial bits 606 * set, clear them and clear the bits in the meta nodes on the path up to 607 * the least common ancestor to mark any subtrees made completely empty. 608 */ 609 static int 610 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount) 611 { 612 u_daddr_t radix; 613 daddr_t blk; 614 int avail, digit; 615 616 start += BLIST_RADIX; 617 for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) { 618 /* Skip meta-nodes, as long as they promise more free blocks. */ 619 radix = BLIST_RADIX; 620 while (((++scan)->bm_bitmap & 1) == 1 && 621 ((blk / radix) & BLIST_MASK) == 0) 622 radix *= BLIST_RADIX; 623 if (~scan->bm_bitmap != 0) { 624 /* 625 * Either there is no next leaf with any free blocks, 626 * or we've reached the next leaf and found that some 627 * of its blocks are not free. In the first case, 628 * bitpos() returns zero here. 629 */ 630 avail = blk - start + bitpos(~scan->bm_bitmap); 631 if (avail < count || avail == 0) { 632 /* 633 * There isn't a next leaf with enough free 634 * blocks at its beginning to bother 635 * allocating. 636 */ 637 return (avail); 638 } 639 maxcount = imin(avail, maxcount); 640 if (maxcount % BLIST_RADIX == 0) { 641 /* 642 * There was no next leaf. Back scan up to 643 * last leaf. 644 */ 645 do { 646 radix /= BLIST_RADIX; 647 --scan; 648 } while (radix != 1); 649 blk -= BLIST_RADIX; 650 } 651 } 652 } 653 654 /* 655 * 'scan' is the last leaf that provides blocks. Clear from 1 to 656 * BLIST_RADIX bits to represent the allocation of those last blocks. 657 */ 658 if (maxcount % BLIST_RADIX != 0) 659 scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX); 660 else 661 scan->bm_bitmap = 0; 662 663 for (;;) { 664 /* Back up over meta-nodes, clearing bits if necessary. */ 665 blk -= BLIST_RADIX; 666 for (radix = BLIST_RADIX; 667 (digit = ((blk / radix) & BLIST_MASK)) == 0; 668 radix *= BLIST_RADIX) { 669 if ((scan--)->bm_bitmap == 0) 670 scan->bm_bitmap ^= 1; 671 } 672 if ((scan--)->bm_bitmap == 0) 673 scan[-digit * radix_to_skip(radix)].bm_bitmap ^= 674 (u_daddr_t)1 << digit; 675 676 if (blk == start) 677 break; 678 /* Clear all the bits of this leaf. */ 679 scan->bm_bitmap = 0; 680 } 681 return (maxcount); 682 } 683 684 /* 685 * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap). 686 * 687 * This function is the core of the allocator. Its execution time is 688 * proportional to log(count), plus height of the tree if the allocation 689 * crosses a leaf boundary. 690 */ 691 static daddr_t 692 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount) 693 { 694 u_daddr_t mask; 695 int bighint, count1, hi, lo, num_shifts; 696 697 count1 = *count - 1; 698 num_shifts = fls(count1); 699 mask = ~scan->bm_bitmap; 700 while ((mask & (mask + 1)) != 0 && num_shifts > 0) { 701 /* 702 * If bit i is 0 in mask, then bits in [i, i + (count1 >> 703 * num_shifts)] are 1 in scan->bm_bitmap. Reduce num_shifts to 704 * 0, while preserving this invariant. The updates to mask 705 * leave fewer bits 0, but each bit that remains 0 represents a 706 * longer string of consecutive 1-bits in scan->bm_bitmap. If 707 * more updates to mask cannot set more bits, because mask is 708 * partitioned with all 1 bits following all 0 bits, the loop 709 * terminates immediately. 710 */ 711 num_shifts--; 712 mask |= mask >> ((count1 >> num_shifts) + 1) / 2; 713 } 714 bighint = count1 >> num_shifts; 715 if (~mask == 0) { 716 /* 717 * Update bighint. There is no allocation bigger than 718 * count1 >> num_shifts starting in this leaf. 719 */ 720 scan->bm_bighint = bighint; 721 return (SWAPBLK_NONE); 722 } 723 724 /* Discard any candidates that appear before blk. */ 725 if ((blk & BLIST_MASK) != 0) { 726 if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) { 727 /* Grow bighint in case all discarded bits are set. */ 728 bighint += blk & BLIST_MASK; 729 mask |= bitrange(0, blk & BLIST_MASK); 730 if (~mask == 0) { 731 scan->bm_bighint = bighint; 732 return (SWAPBLK_NONE); 733 } 734 } 735 blk -= blk & BLIST_MASK; 736 } 737 738 /* 739 * The least significant set bit in mask marks the start of the first 740 * available range of sufficient size. Find its position. 741 */ 742 lo = bitpos(~mask); 743 744 /* 745 * Find how much space is available starting at that position. 746 */ 747 if ((mask & (mask + 1)) != 0) { 748 /* Count the 1 bits starting at position lo. */ 749 hi = bitpos(mask & (mask + 1)) + count1; 750 if (maxcount < hi - lo) 751 hi = lo + maxcount; 752 *count = hi - lo; 753 mask = ~bitrange(lo, *count); 754 } else if (maxcount <= BLIST_RADIX - lo) { 755 /* All the blocks we can use are available here. */ 756 hi = lo + maxcount; 757 *count = maxcount; 758 mask = ~bitrange(lo, *count); 759 if (hi == BLIST_RADIX) 760 scan->bm_bighint = bighint; 761 } else { 762 /* Check next leaf for some of the blocks we want or need. */ 763 count1 = *count - (BLIST_RADIX - lo); 764 maxcount -= BLIST_RADIX - lo; 765 hi = blst_next_leaf_alloc(scan, blk, count1, maxcount); 766 if (hi < count1) 767 /* 768 * The next leaf cannot supply enough blocks to reach 769 * the minimum required allocation. The hint cannot be 770 * updated, because the same allocation request could 771 * be satisfied later, by this leaf, if the state of 772 * the next leaf changes, and without any changes to 773 * this leaf. 774 */ 775 return (SWAPBLK_NONE); 776 *count = BLIST_RADIX - lo + hi; 777 scan->bm_bighint = bighint; 778 } 779 780 /* Clear the allocated bits from this leaf. */ 781 scan->bm_bitmap &= mask; 782 return (blk + lo); 783 } 784 785 /* 786 * blist_meta_alloc() - allocate at a meta in the radix tree. 787 * 788 * Attempt to allocate at a meta node. If we can't, we update 789 * bighint and return a failure. Updating bighint optimize future 790 * calls that hit this node. We have to check for our collapse cases 791 * and we have a few optimizations strewn in as well. 792 */ 793 static daddr_t 794 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, 795 int maxcount, u_daddr_t radix) 796 { 797 daddr_t blk, i, r, skip; 798 u_daddr_t mask; 799 bool scan_from_start; 800 int digit; 801 802 if (radix == 1) 803 return (blst_leaf_alloc(scan, cursor, count, maxcount)); 804 blk = cursor & -(radix * BLIST_RADIX); 805 scan_from_start = (cursor == blk); 806 skip = radix_to_skip(radix); 807 mask = scan->bm_bitmap; 808 809 /* Discard any candidates that appear before cursor. */ 810 digit = (cursor / radix) & BLIST_MASK; 811 mask &= (u_daddr_t)-1 << digit; 812 if (mask == 0) 813 return (SWAPBLK_NONE); 814 815 /* 816 * If the first try is for a block that includes the cursor, pre-undo 817 * the digit * radix offset in the first call; otherwise, ignore the 818 * cursor entirely. 819 */ 820 if (((mask >> digit) & 1) == 1) 821 cursor -= digit * radix; 822 else 823 cursor = blk; 824 825 /* 826 * Examine the nonempty subtree associated with each bit set in mask. 827 */ 828 do { 829 digit = bitpos(mask); 830 i = 1 + digit * skip; 831 if (*count <= scan[i].bm_bighint) { 832 /* 833 * The allocation might fit beginning in the i'th subtree. 834 */ 835 r = blst_meta_alloc(&scan[i], cursor + digit * radix, 836 count, maxcount, radix / BLIST_RADIX); 837 if (r != SWAPBLK_NONE) { 838 if (scan[i].bm_bitmap == 0) 839 scan->bm_bitmap ^= bitrange(digit, 1); 840 return (r); 841 } 842 } 843 cursor = blk; 844 } while ((mask ^= bitrange(digit, 1)) != 0); 845 846 /* 847 * We couldn't allocate count in this subtree. If the whole tree was 848 * scanned, and the last tree node is allocated, update bighint. 849 */ 850 if (scan_from_start && !(digit == BLIST_RADIX - 1 && 851 scan[i].bm_bighint == BLIST_MAX_ALLOC)) 852 scan->bm_bighint = *count - 1; 853 854 return (SWAPBLK_NONE); 855 } 856 857 /* 858 * BLST_LEAF_FREE() - free allocated block from leaf bitmap 859 * 860 */ 861 static void 862 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) 863 { 864 u_daddr_t mask; 865 866 /* 867 * free some data in this bitmap 868 * mask=0000111111111110000 869 * \_________/\__/ 870 * count n 871 */ 872 mask = bitrange(blk & BLIST_MASK, count); 873 KASSERT((scan->bm_bitmap & mask) == 0, 874 ("freeing free block: %jx, size %d, mask %jx", 875 (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); 876 scan->bm_bitmap |= mask; 877 } 878 879 /* 880 * BLST_META_FREE() - free allocated blocks from radix tree meta info 881 * 882 * This support routine frees a range of blocks from the bitmap. 883 * The range must be entirely enclosed by this radix node. If a 884 * meta node, we break the range down recursively to free blocks 885 * in subnodes (which means that this code can free an arbitrary 886 * range whereas the allocation code cannot allocate an arbitrary 887 * range). 888 */ 889 static void 890 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) 891 { 892 daddr_t blk, endBlk, i, skip; 893 int digit, endDigit; 894 895 /* 896 * We could probably do a better job here. We are required to make 897 * bighint at least as large as the biggest allocable block of data. 898 * If we just shoehorn it, a little extra overhead will be incurred 899 * on the next allocation (but only that one typically). 900 */ 901 scan->bm_bighint = BLIST_MAX_ALLOC; 902 903 if (radix == 1) 904 return (blst_leaf_free(scan, freeBlk, count)); 905 906 endBlk = freeBlk + count; 907 blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); 908 /* 909 * blk is first block past the end of the range of this meta node, 910 * or 0 in case of overflow. 911 */ 912 if (blk != 0) 913 endBlk = ummin(endBlk, blk); 914 skip = radix_to_skip(radix); 915 blk = freeBlk & -radix; 916 digit = (blk / radix) & BLIST_MASK; 917 endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK); 918 scan->bm_bitmap |= bitrange(digit, endDigit - digit); 919 for (i = 1 + digit * skip; blk < endBlk; i += skip) { 920 blk += radix; 921 count = ummin(blk, endBlk) - freeBlk; 922 blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX); 923 freeBlk = blk; 924 } 925 } 926 927 /* 928 * BLST_COPY() - copy one radix tree to another 929 * 930 * Locates free space in the source tree and frees it in the destination 931 * tree. The space may not already be free in the destination. 932 */ 933 static void 934 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, 935 daddr_t count) 936 { 937 daddr_t endBlk, i, skip; 938 939 /* 940 * Leaf node 941 */ 942 943 if (radix == 1) { 944 u_daddr_t v = scan->bm_bitmap; 945 946 if (v == (u_daddr_t)-1) { 947 blist_free(dest, blk, count); 948 } else if (v != 0) { 949 int i; 950 951 for (i = 0; i < count; ++i) { 952 if (v & ((u_daddr_t)1 << i)) 953 blist_free(dest, blk + i, 1); 954 } 955 } 956 return; 957 } 958 959 /* 960 * Meta node 961 */ 962 963 if (scan->bm_bitmap == 0) { 964 /* 965 * Source all allocated, leave dest allocated 966 */ 967 return; 968 } 969 970 endBlk = blk + count; 971 skip = radix_to_skip(radix); 972 for (i = 1; blk < endBlk; i += skip) { 973 blk += radix; 974 count = radix; 975 if (blk >= endBlk) 976 count -= blk - endBlk; 977 blst_copy(&scan[i], blk - radix, 978 radix / BLIST_RADIX, dest, count); 979 } 980 } 981 982 /* 983 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap 984 * 985 * This routine allocates all blocks in the specified range 986 * regardless of any existing allocations in that range. Returns 987 * the number of blocks allocated by the call. 988 */ 989 static daddr_t 990 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) 991 { 992 daddr_t nblks; 993 u_daddr_t mask; 994 995 mask = bitrange(blk & BLIST_MASK, count); 996 997 /* Count the number of blocks that we are allocating. */ 998 nblks = bitcount64(scan->bm_bitmap & mask); 999 1000 scan->bm_bitmap &= ~mask; 1001 return (nblks); 1002 } 1003 1004 /* 1005 * BLIST_META_FILL() - allocate specific blocks at a meta node 1006 * 1007 * This routine allocates the specified range of blocks, 1008 * regardless of any existing allocations in the range. The 1009 * range must be within the extent of this node. Returns the 1010 * number of blocks allocated by the call. 1011 */ 1012 static daddr_t 1013 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix) 1014 { 1015 daddr_t blk, endBlk, i, nblks, skip; 1016 int digit; 1017 1018 if (radix == 1) 1019 return (blst_leaf_fill(scan, allocBlk, count)); 1020 1021 endBlk = allocBlk + count; 1022 blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); 1023 /* 1024 * blk is first block past the end of the range of this meta node, 1025 * or 0 in case of overflow. 1026 */ 1027 if (blk != 0) 1028 endBlk = ummin(endBlk, blk); 1029 skip = radix_to_skip(radix); 1030 blk = allocBlk & -radix; 1031 nblks = 0; 1032 while (blk < endBlk) { 1033 digit = (blk / radix) & BLIST_MASK; 1034 i = 1 + digit * skip; 1035 blk += radix; 1036 count = ummin(blk, endBlk) - allocBlk; 1037 nblks += blst_meta_fill(&scan[i], allocBlk, count, 1038 radix / BLIST_RADIX); 1039 if (scan[i].bm_bitmap == 0) 1040 scan->bm_bitmap &= ~((u_daddr_t)1 << digit); 1041 allocBlk = blk; 1042 } 1043 return (nblks); 1044 } 1045 1046 #ifdef BLIST_DEBUG 1047 1048 static void 1049 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) 1050 { 1051 daddr_t skip; 1052 u_daddr_t mask; 1053 int digit; 1054 1055 if (radix == 1) { 1056 printf( 1057 "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n", 1058 tab, tab, "", 1059 (long long)blk, (long long)BLIST_RADIX, 1060 (int)(1 + (BLIST_RADIX - 1) / 4), 1061 (long long)scan->bm_bitmap, 1062 (long long)scan->bm_bighint 1063 ); 1064 return; 1065 } 1066 1067 printf( 1068 "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n", 1069 tab, tab, "", 1070 (long long)blk, (long long)radix * BLIST_RADIX, 1071 (long long)radix * BLIST_RADIX, 1072 (int)(1 + (BLIST_RADIX - 1) / 4), 1073 (long long)scan->bm_bitmap, 1074 (long long)scan->bm_bighint 1075 ); 1076 1077 skip = radix_to_skip(radix); 1078 tab += 4; 1079 1080 mask = scan->bm_bitmap; 1081 /* Examine the nonempty subtree associated with each bit set in mask */ 1082 do { 1083 digit = bitpos(mask); 1084 blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, 1085 radix / BLIST_RADIX, tab); 1086 } while ((mask ^= bitrange(digit, 1)) != 0); 1087 tab -= 4; 1088 1089 printf( 1090 "%*.*s}\n", 1091 tab, tab, "" 1092 ); 1093 } 1094 1095 #endif 1096 1097 #ifdef BLIST_DEBUG 1098 1099 int 1100 main(int ac, char **av) 1101 { 1102 daddr_t size = BLIST_RADIX * BLIST_RADIX; 1103 int i; 1104 blist_t bl; 1105 struct sbuf *s; 1106 1107 for (i = 1; i < ac; ++i) { 1108 const char *ptr = av[i]; 1109 if (*ptr != '-') { 1110 size = strtoll(ptr, NULL, 0); 1111 continue; 1112 } 1113 ptr += 2; 1114 fprintf(stderr, "Bad option: %s\n", ptr - 2); 1115 exit(1); 1116 } 1117 bl = blist_create(size, M_WAITOK); 1118 if (bl == NULL) { 1119 fprintf(stderr, "blist_create failed\n"); 1120 exit(1); 1121 } 1122 blist_free(bl, 0, size); 1123 1124 for (;;) { 1125 char buf[1024]; 1126 long long da = 0; 1127 int count = 0, maxcount = 0; 1128 1129 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), 1130 (long long)size, (long long)bl->bl_radix * BLIST_RADIX); 1131 fflush(stdout); 1132 if (fgets(buf, sizeof(buf), stdin) == NULL) 1133 break; 1134 switch(buf[0]) { 1135 case 'r': 1136 if (sscanf(buf + 1, "%d", &count) == 1) { 1137 blist_resize(&bl, count, 1, M_WAITOK); 1138 } else { 1139 printf("?\n"); 1140 } 1141 case 'p': 1142 blist_print(bl); 1143 break; 1144 case 's': 1145 s = sbuf_new_auto(); 1146 blist_stats(bl, s); 1147 sbuf_finish(s); 1148 printf("%s", sbuf_data(s)); 1149 sbuf_delete(s); 1150 break; 1151 case 'a': 1152 if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) { 1153 daddr_t blk = blist_alloc(bl, &count, maxcount); 1154 printf(" R=%08llx, c=%08d\n", 1155 (long long)blk, count); 1156 } else { 1157 printf("?\n"); 1158 } 1159 break; 1160 case 'f': 1161 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { 1162 blist_free(bl, da, count); 1163 } else { 1164 printf("?\n"); 1165 } 1166 break; 1167 case 'l': 1168 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { 1169 printf(" n=%jd\n", 1170 (intmax_t)blist_fill(bl, da, count)); 1171 } else { 1172 printf("?\n"); 1173 } 1174 break; 1175 case '?': 1176 case 'h': 1177 puts( 1178 "p -print\n" 1179 "s -stats\n" 1180 "a %d %d -allocate\n" 1181 "f %x %d -free\n" 1182 "l %x %d -fill\n" 1183 "r %d -resize\n" 1184 "h/? -help\n" 1185 "q -quit" 1186 ); 1187 break; 1188 case 'q': 1189 break; 1190 default: 1191 printf("?\n"); 1192 break; 1193 } 1194 if (buf[0] == 'q') 1195 break; 1196 } 1197 return (0); 1198 } 1199 1200 #endif 1201