/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in
 *	the leaf nodes (BLIST_BMAP_RADIX), and one for the number of
 *	descendants of each of the meta (interior) nodes (BLIST_META_RADIX).
 *	Each subtree is associated with a range of blocks.  The root of any
 *	subtree stores a hint field that defines an upper bound on the size
 *	of the largest allocation that can begin in the associated block
 *	range.  A hint is an upper bound on a potential allocation, but not
 *	necessarily a tight upper bound.
 *
 *	The bitmap field in each node directs the search for available
 *	blocks.  For a leaf node, a bit is set if the corresponding block is
 *	free.  For a meta node, a bit is set if the corresponding subtree
 *	contains a free block somewhere within it.  The search at a meta node
 *	considers only children of that node that represent a range that
 *	includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 *	structure, but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_META_RADIX lower-level nodes of some nodes may not be
 *	allocated.
 *
 *	NOTE: The allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */

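/*
 * Illustrative only: a minimal sketch of how a caller might drive the
 * public interface described above.  The block count, the variable names
 * and the M_WAITOK flag choice are assumptions made for the example, not
 * part of this module; real usage lives in vm/swap_pager.c.
 */
#if 0
static void
blist_usage_sketch(void)
{
	blist_t bl;
	daddr_t blk;
	int count, maxcount;

	/* Manage 1024 blocks; all needed memory is wired up front. */
	bl = blist_create(1024, M_WAITOK);
	blist_free(bl, 0, 1024);	/* A fresh blist starts fully allocated. */

	/* Ask for at least 2 and at most 4 contiguous blocks. */
	count = 2;
	maxcount = 4;
	blk = blist_alloc(bl, &count, maxcount);
	if (blk != SWAPBLK_NONE) {
		/* 'count' now holds the number of blocks actually allocated. */
		blist_free(bl, blk, count);
	}
	blist_destroy(bl);
}
#endif
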
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define	malloc(a,b,c)	calloc(a, 1)
#define	free(a,b)	free(a)
#define	ummin(a,b)	((a) < (b) ? (a) : (b))
#define	imin(a,b)	((a) < (b) ? (a) : (b))
#define	KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
    int *count, int maxcount);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
    blist_t dest, daddr_t count);
static daddr_t	blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
    u_daddr_t radix);
#ifndef _KERNEL
static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}

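/*
 * Worked example (illustrative values only; the real constants come from
 * sys/blist.h): suppose BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16.
 * A height-1 subtree then covers radix = 16 * 64 = 1024 blocks and holds
 * (16**2 - 1) / (16 - 1) = 255 / 15 = 17 nodes: one meta node plus 16
 * leaves.  The simplified formula above agrees:
 * radix_to_skip(1024) = 1024 / ((64 / 16) * 15) = 1024 / 60 = 17.
 */
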
/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}

/*
 * Find the first bit set in a u_daddr_t.
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	lo = 0;
	hi = BLIST_BMAP_RADIX;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if (mask & bitrange(0, mid))
			hi = mid;
		else
			lo = mid;
	}
	return (lo);
}

static inline int
bitpos(u_daddr_t mask)
{

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
	case sizeof(int):
		return (ffs(mask) - 1);
#endif
	default:
		return (generic_bitpos(mask));
	}
}

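/*
 * For instance (values chosen just for illustration): bitrange(4, 3)
 * yields a mask with bits 4..6 set, i.e. 0x70.  Feeding that mask to
 * bitpos() (or to generic_bitpos(), which binary-searches for the lowest
 * set bit) returns 4, the position of the least-significant 1-bit.
 */
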
/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

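/*
 * To make the sizing loop above concrete (reusing the illustrative
 * values BLIST_BMAP_RADIX = 64, BLIST_META_RADIX = 16): for
 * blocks = 10000, the first pass adds the 157 leaves (1 + 9999/64),
 * the second adds the 10 level-1 meta nodes (1 + 9999/1024), and with
 * the root included nodes = 168.  The loop stops at radix = 16384 >
 * 10000, so bl_radix = 16384.
 */
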
void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, int *count, int maxcount)
{
	daddr_t blk, cursor;

	KASSERT(*count <= maxcount,
	    ("invalid parameters %d > %d", *count, maxcount));
	KASSERT(maxcount <= BLIST_MAX_ALLOC,
	    ("allocation too large: %d", maxcount));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	for (cursor = bl->bl_cursor;; cursor = 0) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= *count;
			bl->bl_cursor = blk + *count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		if (cursor == 0)
			return (SWAPBLK_NONE);
	}
}

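/*
 * A sketch of the cursor's effect, with made-up numbers: if bl_cursor is
 * 5000 and no fit exists in [5000, bl_blocks), the first pass fails, the
 * second pass rescans from block 0, and a success at, say, blk = 120 with
 * *count = 4 leaves bl_cursor at 124, so the next search resumes there.
 */
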
/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *			     or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

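/*
 * The avg/err pair above keeps an exact running mean in integers: the
 * invariant is sum-of-sizes == num * avg + err.  A small worked example
 * (gap sizes invented for illustration): after gaps of size 3 and 5, the
 * first update gives num = 1, avg = 3, err = 0; the second adds
 * err += 5 - 3 = 2, then num = 2, avg += 2 / 2 = 4, err = 2 % 2 = 0,
 * and indeed 3 + 5 == 2 * 4 + 0.
 */
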
/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

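/*
 * About the leaf scan in blist_stats(): mask ^ (mask << 1) marks every
 * position where the leaf's bitmap changes between 0 and 1.  For example
 * (an invented 8-bit bitmap for brevity), mask = 0b00000110 gives
 * diff = 0b00000110 ^ 0b00001100 = 0b00001010, so update_gap_stats() is
 * called at bit positions 1 and 3: the free gap starts at block i + 1 and
 * ends just before block i + 3, for a recorded size of 2.
 */
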
/************************************************************************
 *			ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
 *
 *	'scan' is a leaf node, associated with a block containing 'blk'.
 *	The next leaf node could be adjacent, or several nodes away if the
 *	least common ancestor of 'scan' and its neighbor is several levels
 *	up.  Use 'blk' to determine how many meta-nodes lie between the
 *	leaves.  If the next leaf has enough initial bits set, clear them
 *	and clear the bits in the meta nodes on the path up to the least
 *	common ancestor to mark any subtrees made completely empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, int maxcount)
{
	blmeta_t *next;
	u_daddr_t radix;
	int avail, digit;

	next = scan + 1;
	blk += BLIST_BMAP_RADIX;
	radix = BLIST_BMAP_RADIX;
	while ((next->bm_bitmap & 1) == 1 &&
	    (digit = ((blk / radix) & BLIST_META_MASK)) == 0) {
		next++;
		radix *= BLIST_META_RADIX;
	}
	if ((next->bm_bitmap & 1) != 1)
		return (0);
	avail = (~next->bm_bitmap != 0) ?
	    bitpos(~next->bm_bitmap) : BLIST_BMAP_RADIX;
	if (avail < count) {
		/*
		 * The next leaf doesn't have enough free blocks at the
		 * beginning to complete the spanning allocation.
		 */
		return (0);
	}
	count = imin(avail, maxcount);
	/* Clear the first 'count' bits in the next leaf to allocate. */
	next->bm_bitmap &= ~bitrange(0, count);

	/*
	 * Update bitmaps of next-ancestors, up to least common ancestor.
	 */
	while (next->bm_bitmap == 0) {
		if (--next == scan) {
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;
			break;
		}
		next->bm_bitmap ^= 1;
	}
	return (count);
}

/*
 * Given a bitmask, flip all the bits from the least-significant 1-bit to the
 * most significant bit.  If the result is non-zero, then the least-significant
 * 1-bit of the result is in the same position as the least-significant 0-bit
 * in mask that is followed by a 1-bit.
 */
static inline u_daddr_t
flip_hibits(u_daddr_t mask)
{

	return (-mask & ~mask);
}

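/*
 * Worked example for flip_hibits(), using an invented 8-bit value for
 * brevity: mask = 0b00010110 has its least-significant 1-bit at position
 * 1, so bits 1..7 are flipped, giving 0b11101000.  The least-significant
 * 1-bit of the result, position 3, is exactly the lowest 0-bit of mask
 * that has a 1-bit somewhere above it.
 */
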
/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
{
	u_daddr_t cursor_mask, mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = *count - 1;
	num_shifts = fls(count1);
	mask = scan->bm_bitmap;
	while (flip_hibits(mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i + range1] are
		 * set in scan->bm_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts
		 * to 0, while preserving these invariants.  The updates to
		 * mask leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->bm_bitmap.  If more updates to mask cannot clear more
		 * bits, because mask is partitioned with all 0 bits preceding
		 * all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all
		 * the blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_BMAP_MASK) != 0) {
		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
		if (cursor_mask != 0) {
			mask ^= cursor_mask;
			if (mask == 0)
				return (SWAPBLK_NONE);

			/*
			 * Bighint change for last block allocation cannot
			 * assume that any other blocks are allocated, so the
			 * bighint cannot be reduced much.
			 */
			range1 = BLIST_MAX_ALLOC - 1;
		}
		blk &= ~BLIST_BMAP_MASK;
	}

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Find its position.
	 */
	lo = bitpos(mask);

	/*
	 * Find how much space is available starting at that position.
	 */
	if (flip_hibits(mask) != 0) {
		/* Count the 1 bits starting at position lo. */
		hi = bitpos(flip_hibits(mask)) + count1;
		if (maxcount < hi - lo)
			hi = lo + maxcount;
		*count = hi - lo;
		mask = bitrange(lo, *count);
	} else if (maxcount <= BLIST_BMAP_RADIX - lo) {
		/* All the blocks we can use are available here. */
		hi = lo + maxcount;
		*count = maxcount;
		mask = bitrange(lo, *count);
	} else {
		/* Check next leaf for some of the blocks we want or need. */
		count1 = *count - (BLIST_BMAP_RADIX - lo);
		maxcount -= BLIST_BMAP_RADIX - lo;
		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
		if (hi < count1)
			/*
			 * The next leaf cannot supply enough blocks to reach
			 * the minimum required allocation.  The hint cannot be
			 * updated, because the same allocation request could
			 * be satisfied later, by this leaf, if the state of
			 * the next leaf changes, and without any changes to
			 * this leaf.
			 */
			return (SWAPBLK_NONE);
		*count = BLIST_BMAP_RADIX - lo + hi;
		hi = BLIST_BMAP_RADIX;
	}

	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	}
	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= ~mask;
	return (blk + lo);
}

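/*
 * A worked pass through the doubling loop above, using an invented 6-bit
 * bitmap for brevity (sign extension does not matter at this width): with
 * *count = 3, count1 = 2 and num_shifts = fls(2) = 2.  Starting from
 * mask = 0b111010, the first step shifts by range_ext = 1, leaving
 * 0b111010 & 0b011101 = 0b011000, where each surviving bit marks 2
 * consecutive free blocks; the second step shifts by 1 again, leaving
 * 0b001000 with range1 = 2.  Bit 3 thus marks the run of 3 free blocks
 * at positions 3..5, and lo = bitpos(mask) = 3.
 */
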
/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count, maxcount));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (*count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, maxcount, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = *count - 1;

	return (SWAPBLK_NONE);
}

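/*
 * The scan[1 + digit * skip] indexing mirrors the LAYOUT comment at the
 * top of the file: a meta node is followed in the array by its children,
 * each child's subtree occupying 'skip' consecutive slots.  With the
 * illustrative constants used earlier (BLIST_BMAP_RADIX = 64,
 * BLIST_META_RADIX = 16), a meta node whose children are leaves has
 * skip = radix_to_skip(64) = 1, so child d sits at scan[1 + d]; one level
 * up, skip = 17 and child d's subtree begins at scan[1 + 17 * d].
 */
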
/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLIST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

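/*
 * A note on the rounding idiom shared by blst_meta_free() and
 * blst_meta_fill(): '(blk + radix) & -radix' rounds blk up to the end of
 * its radix-aligned subtree, since -radix is a mask of the high bits.
 * With invented numbers, freeBlk = 100 and radix = 1024 give
 * (100 + 1024) & -1024 = 1024, so endBlk is clamped to this subtree's
 * boundary and the recursion never walks past it.
 */
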
#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		int count = 0, maxcount = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%d", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
				daddr_t blk = blist_alloc(bl, &count, maxcount);
				printf("    R=%08llx, c=%08d\n",
				    (long long)blk, count);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d %d    -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help\n"
			    "q          -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif