/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
 *
 * This module implements a general bitmap allocator/deallocator.  The
 * allocator eats around 2 bits per 'block'.  The module does not
 * try to interpret the meaning of a 'block' other than to return
 * SWAPBLK_NONE on an allocation failure.
 *
 * A radix tree controls access to pieces of the bitmap, and includes
 * auxiliary information at each interior node about the availability of
 * contiguous free blocks in the subtree rooted at that node.  Two radix
 * constants are involved: one for the size of the bitmaps contained in the
 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
 * each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 * associated with a range of blocks.  The root of any subtree stores a
 * hint field that defines an upper bound on the size of the largest
 * allocation that can begin in the associated block range.  A hint is an
 * upper bound on a potential allocation, but not necessarily a tight upper
 * bound.
 *
 * The bitmap field in each node directs the search for available blocks.
 * For a leaf node, a bit is set if the corresponding block is free.  For a
 * meta node, a bit is set if the corresponding subtree contains a free
 * block somewhere within it.  The search at a meta node considers only
 * children of that node that represent a range that includes a free block.
 *
 * The hinting greatly increases code efficiency for allocations while
 * the general radix structure optimizes both allocations and frees.  The
 * radix tree should be able to operate well no matter how much
 * fragmentation there is and no matter how large a bitmap is used.
 *
 * The blist code wires all necessary memory at creation time.  Neither
 * allocations nor frees require interaction with the memory subsystem.
 * The non-blocking nature of allocations and frees is required by swap
 * code (vm/swap_pager.c).
 *
 * LAYOUT: The radix tree is laid out recursively using a linear array.
 * Each meta node is immediately followed (laid out sequentially in
 * memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 * structure but one that can be easily scanned through a very simple
 * 'skip' calculation.  The memory allocation is only large enough to
 * cover the number of blocks requested at creation time.  Nodes that
 * represent blocks beyond that limit, nodes that would never be read
 * or written, are not allocated, so that the last of the
 * BLIST_META_RADIX lower-level nodes of some nodes may not be
 * allocated.
 *
 * NOTE: the allocator cannot currently allocate more than
 * BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 * large' if you try.  This is an area that could use improvement.  The
 * radix is large enough that this restriction does not affect the swap
 * system, though.  Currently only the allocation code is affected by
 * this algorithmic unfeature.  The freeing code can handle arbitrary
 * ranges.
 *
 * This code can be compiled stand-alone for debugging.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define	BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define	malloc(a,b,c)	calloc(a, 1)
#define	free(a,b)	free(a)
#define	ummin(a,b)	((a) < (b) ? (a) : (b))
#define	KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void	blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void	blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void	blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t	blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
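
/*
 * Worked example for the skip formula above (with illustrative constants,
 * not the real values from sys/blist.h): suppose BLIST_BMAP_RADIX = 64 and
 * BLIST_META_RADIX = 4.  A subtree with radix = 1024 has
 * L = 1024/64 = 16 = 4**2 leaves, so h = 2 and the node count is
 * 1 + 4 + 16 = 21 = (4**3 - 1)/(4 - 1).  The simplified formula agrees:
 * 1024 / ((64 / 4) * 3) = 1024 / 48 = 21, with integer division
 * discarding the remainder.
 */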

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}

/*
 * Find the first bit set in a u_daddr_t.
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	lo = 0;
	hi = BLIST_BMAP_RADIX;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if (mask & bitrange(0, mid))
			hi = mid;
		else
			lo = mid;
	}
	return (lo);
}

static inline int
bitpos(u_daddr_t mask)
{

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
	case sizeof(int):
		return (ffs(mask) - 1);
#endif
	default:
		return (generic_bitpos(mask));
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}
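
#if defined(BLIST_DEBUG) && defined(BLIST_EXAMPLE)
/*
 * A minimal sketch (not part of the allocator) of the public API in the
 * stand-alone build.  A freshly created blist starts with every block
 * allocated, so the range is freed before use, exactly as main() below
 * does.  BLIST_EXAMPLE is a hypothetical guard, not an existing build knob.
 */
static void
blist_usage_example(void)
{
	blist_t bl;
	daddr_t blk;

	bl = blist_create(1024, M_WAITOK);	/* all 1024 blocks allocated */
	blist_free(bl, 0, 1024);		/* make the whole range free */
	blk = blist_alloc(bl, 16);		/* reserve 16 contiguous blocks */
	if (blk != SWAPBLK_NONE)
		blist_free(bl, blk, 16);	/* give them back */
	blist_destroy(bl);
}
#endif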

/*
 * blist_alloc() -	reserve space in the block bitmap.  Return the base
 *			of a contiguous region or SWAPBLK_NONE if space could
 *			not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk, cursor;

	KASSERT(count <= BLIST_MAX_ALLOC,
	    ("allocation too large: %d", (int)count));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	cursor = bl->bl_cursor;
	for (;;) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= count;
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (cursor == 0)
			return (SWAPBLK_NONE);
		cursor = 0;
	}
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  The specified
 *			range must lie within the bitmap and must currently
 *			be allocated.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}
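
#if defined(BLIST_DEBUG) && defined(BLIST_EXAMPLE)
/*
 * A minimal sketch (not part of the allocator): blist_fill() reserves a
 * range no matter how much of it is already allocated, and its return
 * value counts only the blocks that were free beforehand.  BLIST_EXAMPLE
 * is a hypothetical guard, not an existing build knob.
 */
static void
blist_fill_example(blist_t bl)
{
	daddr_t taken;

	/* Put blocks [100, 120) off-limits, e.g. a known-bad region. */
	taken = blist_fill(bl, 100, 20);
	printf("filled 20 blocks, %jd were free before\n", (intmax_t)taken);
}
#endif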

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *                           or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}
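
/*
 * Worked example for the exact integer running average maintained by
 * update_gap_stats() below (values illustrative): with gap sizes 3 and
 * then 8,
 *	size 3:	err = 3, num = 1, avg = 3, err = 0
 *	size 8:	err = 0 + (8 - 3) = 5, num = 2, avg = 3 + 5/2 = 5,
 *		err = 5 % 2 = 1
 * The invariant sum = num * avg + err holds exactly (2*5 + 1 = 11 = 3 + 8),
 * so the mean stays exact in integer arithmetic without keeping a
 * potentially large running sum.
 */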

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}
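
#if defined(BLIST_DEBUG) && defined(BLIST_EXAMPLE)
/*
 * A minimal sketch (not part of the allocator) of driving blist_stats()
 * with an sbuf, following the same pattern as the 's' command in the
 * stand-alone main() below.  BLIST_EXAMPLE is a hypothetical guard, not
 * an existing build knob.
 */
static void
blist_stats_example(blist_t bl)
{
	struct sbuf *s;

	s = sbuf_new_auto();		/* auto-growing string buffer */
	blist_stats(bl, s);		/* append the gap statistics */
	sbuf_finish(s);			/* terminate the buffer */
	printf("%s", sbuf_data(s));
	sbuf_delete(s);
}
#endif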

/************************************************************************
 *			ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
 *
 *	'scan' is a leaf node, associated with a block containing 'blk'.
 *	The next leaf node could be adjacent, or several nodes away if the
 *	least common ancestor of 'scan' and its neighbor is several levels
 *	up.  Use 'blk' to determine how many meta-nodes lie between the
 *	leaves.  If the next leaf has enough initial bits set, clear them
 *	and clear the bits in the meta nodes on the path up to the least
 *	common ancestor to mark any subtrees made completely empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	blmeta_t *next;
	u_daddr_t radix;
	int digit;

	next = scan + 1;
	blk += BLIST_BMAP_RADIX;
	radix = BLIST_BMAP_RADIX;
	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
	    (next->bm_bitmap & 1) == 1) {
		next++;
		radix *= BLIST_META_RADIX;
	}
	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
		/*
		 * The next leaf doesn't have enough free blocks at the
		 * beginning to complete the spanning allocation.
		 */
		return (ENOMEM);
	}
	/* Clear the first 'count' bits in the next leaf to allocate. */
	next->bm_bitmap &= (u_daddr_t)-1 << count;

	/*
	 * Update bitmaps of next-ancestors, up to least common ancestor.
	 */
	while (next->bm_bitmap == 0) {
		if (--next == scan) {
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;
			break;
		}
		next->bm_bitmap ^= 1;
	}
	return (0);
}

/*
 * Given a bitmask, flip all the bits from the least-significant 1-bit to the
 * most significant bit.  If the result is non-zero, then the least-significant
 * 1-bit of the result is in the same position as the least-significant 0-bit
 * in mask that is followed by a 1-bit.
 */
static inline u_daddr_t
flip_hibits(u_daddr_t mask)
{

	return (-mask & ~mask);
}
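
/*
 * Worked example for flip_hibits() (8-bit values for brevity; u_daddr_t
 * is wider):
 *	 mask         = 00110100
 *	~mask         = 11001011
 *	-mask         = 11001100
 *	-mask & ~mask = 11001000
 * Every bit from the least-significant 1-bit of mask (bit 2) upward has
 * been flipped, and the least-significant 1-bit of the result (bit 3)
 * sits at the lowest 0-bit of mask that has a 1-bit below it.
 */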

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t cursor_mask, mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->bm_bitmap;
	while (flip_hibits(mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i + range1] are
		 * set in scan->bm_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts
		 * to 0, while preserving these invariants.  The updates to
		 * mask leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->bm_bitmap.  If more updates to mask cannot clear
		 * more bits, because mask is partitioned with all 0 bits
		 * preceding all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all
		 * the blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_BMAP_MASK) != 0) {
		cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
		if (cursor_mask != 0) {
			mask ^= cursor_mask;
			if (mask == 0)
				return (SWAPBLK_NONE);

			/*
			 * Bighint change for last block allocation cannot
			 * assume that any other blocks are allocated, so the
			 * bighint cannot be reduced much.
			 */
			range1 = BLIST_MAX_ALLOC - 1;
		}
		blk &= ~BLIST_BMAP_MASK;
	}

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that
	 * one, and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a
		 * successful allocation depends on the next leaf providing
		 * some of the blocks.
		 */
		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
			/*
			 * The hint cannot be updated, because the same
			 * allocation request could be satisfied later, by this
			 * leaf, if the state of the next leaf changes, and
			 * without any changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
	}
	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= ~mask;
	return (blk + lo);
}
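
/*
 * Worked example for the doubling loop in blst_leaf_alloc() above (values
 * illustrative): for count = 5, count1 = 4 and num_shifts = fls(4) = 3.
 * The three iterations shift mask right by 1, 1 and 2 bits:
 *	range_ext = 0 + ((4 >> 2) & 1) = 1, range1 becomes 1
 *	range_ext = 1 + ((4 >> 1) & 1) = 1, range1 becomes 2
 *	range_ext = 2 + ((4 >> 0) & 1) = 2, range1 becomes 4
 * After the loop, any bit i still set in mask marks bits [i, i + 4] set in
 * scan->bm_bitmap, i.e. 5 consecutive free blocks, found with only
 * fls(count - 1) shift-and-mask steps.
 */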

/*
 * BLST_META_ALLOC() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}
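
/*
 * Worked example for the recursion in blst_meta_free() above (with
 * illustrative constants BLIST_META_RADIX = 4 and a child radix of 256,
 * not the real values from sys/blist.h): freeing [300, 900) under a node
 * with radix = 1024 computes digit = 1 and endDigit = 4, sets bits 1..3
 * in the node's bitmap, and recurses to free [300, 512), [512, 768) and
 * [768, 900) in the three children.  An arbitrary range is thus split
 * along child boundaries, which the allocation path cannot do.
 */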

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help\n"
			    "q          -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif
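
/*
 * Example session with the stand-alone tool above (the prompt shows
 * avail/size/radix; the radix value, and therefore the exact transcript,
 * depends on the constants in sys/blist.h, so these numbers are
 * illustrative only):
 *
 *	1024/1024/1024> a 16
 *	    R=00000000
 *	1008/1024/1024> f 0 10
 *	1018/1024/1024> q
 */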