/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
 *
 * This module implements a general bitmap allocator/deallocator.  The
 * allocator eats around 2 bits per 'block'.  The module does not
 * try to interpret the meaning of a 'block' other than to return
 * SWAPBLK_NONE on an allocation failure.
 *
 * A radix tree controls access to pieces of the bitmap, and includes
 * auxiliary information at each interior node about the availability of
 * contiguous free blocks in the subtree rooted at that node.  Two radix
 * constants are involved: one for the size of the bitmaps contained in the
 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 * each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 * associated with a range of blocks.  The root of any subtree stores a
 * hint field that defines an upper bound on the size of the largest
 * allocation that can begin in the associated block range.  A hint is an
 * upper bound on a potential allocation, but not necessarily a tight upper
 * bound.
 *
 * The radix tree also implements two collapsed states for meta nodes:
 * the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 * in either of these two states, all information contained underneath
 * the node is considered stale.  These states are used to optimize
 * allocation and freeing operations.
 *
 * The hinting greatly increases code efficiency for allocations while
 * the general radix structure optimizes both allocations and frees.  The
 * radix tree should be able to operate well no matter how much
 * fragmentation there is and no matter how large a bitmap is used.
 *
 * The blist code wires all necessary memory at creation time.  Neither
 * allocations nor frees require interaction with the memory subsystem.
 * The non-blocking features of the blist code are used in the swap code
 * (vm/swap_pager.c).
 *
 * LAYOUT: The radix tree is laid out recursively using a linear array.
 * Each meta node is immediately followed (laid out sequentially in
 * memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 * structure but one that can be easily scanned through a very simple
 * 'skip' calculation.  In order to support large radixes, portions of
 * the tree may reside outside our memory allocation.  We handle this
 * with an early-termination optimization (when bighint is set to -1) on
 * the scan.  The memory allocation is only large enough to cover the
 * number of blocks requested at creation time, even if it must be
 * encompassed by a larger root-node radix.
 *
 * NOTE: the allocator cannot currently allocate more than
 * BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 * large' if you try.  This is an area that could use improvement.  The
 * radix is large enough that this restriction does not affect the swap
 * system, though.  Currently only the allocation code is affected by
 * this algorithmic unfeature.  The freeing code can handle arbitrary
 * ranges.
 *
 * This code can be compiled stand-alone for debugging.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define	malloc(a,b,c)	calloc(a, 1)
#define	free(a,b)	free(a)
static __inline int imax(int a, int b) { return (a > b ? a : b); }

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void	blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void	blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void	blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t	blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
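
/*
 * A worked instance of the 'skip' formula above, assuming the usual
 * 64-bit u_daddr_t so that BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16
 * (these constants come from sys/blist.h; the arithmetic below only
 * illustrates the derivation and is not used by the code):
 *
 *	radix = 1024:  L = 1024/64 = 16 = 16**1 leaves, height h = 1,
 *	    node count = 1 + 16 = 17,
 *	    and radix / ((64/16) * 15) = 1024/60 = 17 as well.
 *	radix = 16384: L = 256 = 16**2 leaves, h = 2,
 *	    node count = 1 + 16 + 256 = 273,
 *	    and 16384/60 = 273 by the same division.
 */
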
/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t i, last_block;
	u_daddr_t nodes, radix, skip;
	int digit;

	/*
	 * Calculate the radix and node count used for scanning.  Find the last
	 * block that is followed by a terminator.
	 */
	last_block = blocks - 1;
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		if (((last_block / radix + 1) & BLIST_META_MASK) != 0)
			/*
			 * A terminator will be added.  Update last_block to the
			 * position just before that terminator.
			 */
			last_block |= radix - 1;
		radix *= BLIST_META_RADIX;
	}

	/*
	 * Count the meta-nodes in the expanded tree, including the final
	 * terminator, from the bottom level up to the root.
	 */
	nodes = (last_block >= blocks) ? 2 : 1;
	last_block /= BLIST_BMAP_RADIX;
	while (last_block > 0) {
		nodes += last_block + 1;
		last_block /= BLIST_META_RADIX;
	}
	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;

	/*
	 * Initialize the empty tree by filling in root values, then initialize
	 * just the terminators in the rest of the tree.
	 */
	bl->bl_root[0].bm_bighint = 0;
	if (radix == BLIST_BMAP_RADIX)
		bl->bl_root[0].u.bmu_bitmap = 0;
	else
		bl->bl_root[0].u.bmu_avail = 0;
	last_block = blocks - 1;
	i = 0;
	while (radix > BLIST_BMAP_RADIX) {
		radix /= BLIST_META_RADIX;
		skip = radix_to_skip(radix);
		digit = last_block / radix;
		i += 1 + digit * skip;
		if (digit != BLIST_META_MASK) {
			/*
			 * Add a terminator.
			 */
			bl->bl_root[i + skip].bm_bighint = (daddr_t)-1;
			bl->bl_root[i + skip].u.bmu_bitmap = 0;
		}
		last_block %= radix;
	}

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
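
/*
 * A worked example of the sizing logic above, again assuming
 * BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16 (illustrative only):
 *
 * For blocks = 100, last_block starts at 99.  At radix = 64,
 * (99/64 + 1) & 15 = 2, so a terminator follows the last leaf and
 * last_block becomes 99 | 63 = 127; the root radix becomes 1024.
 * Then nodes starts at 2 (127 >= 100, accounting for the root and the
 * final terminator), and the loop adds 127/64 + 1 = 2 leaves, for 4
 * nodes total:
 *
 *	bl_root[0]  meta node (radix 1024)
 *	bl_root[1]  leaf for blocks 0-63
 *	bl_root[2]  leaf for blocks 64-99 (64-127 in the expanded tree)
 *	bl_root[3]  terminator (bm_bighint = -1)
 */
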
void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}
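
/*
 * A minimal usage sketch of the public interface above.  Note that a
 * newly created blist starts with every block allocated, so the caller
 * frees the managed range before drawing from it (the debug main() below
 * and vm/swap_pager.c both follow this pattern).  The BLIST_EXAMPLE
 * guard and the function name are hypothetical, not part of the build.
 */
#ifdef BLIST_EXAMPLE
static void
blist_example(void)
{
	blist_t bl;
	daddr_t blk;

	bl = blist_create(1024, M_WAITOK);	/* all 1024 blocks allocated */
	blist_free(bl, 0, 1024);		/* now all 1024 blocks free */

	blk = blist_alloc(bl, 16);		/* grab 16 contiguous blocks */
	if (blk != SWAPBLK_NONE)
		blist_free(bl, blk, 16);	/* and return them */

	(void)blist_fill(bl, 0, 8);		/* force blocks 0-7 allocated */
	blist_destroy(bl);
}
#endif
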
/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *			     or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
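
/*
 * The last four lines above maintain an exact running mean in integer
 * arithmetic, with the invariant sum = num * avg + err.  A worked trace
 * (illustrative only) for gap sizes 3, 4, 8:
 *
 *	add 3: err = 0 + 3 - 0 = 3; num = 1; avg = 0 + 3/1 = 3; err = 3 % 1 = 0
 *	add 4: err = 0 + 4 - 3 = 1; num = 2; avg = 3 + 1/2 = 3; err = 1 % 2 = 1
 *	add 8: err = 1 + 8 - 3 = 6; num = 3; avg = 3 + 6/3 = 5; err = 6 % 3 = 0
 *
 * The total is 15 over 3 gaps, and avg = 5 with no residue, as expected.
 */
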
/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count | size range\n");
	sbuf_printf(s, "               ----- | ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd | ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].u.bmu_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}
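
/*
 * In the leaf scan above, diff = mask ^ (mask << 1) sets a bit at every
 * position where the leaf's free/allocated state changes from the bit
 * below it, i.e. at every gap boundary.  A small worked case (8 bits
 * shown for brevity; real leaves hold BLIST_BMAP_RADIX bits):
 *
 *	mask      = 00001100	(blocks 2 and 3 free)
 *	mask << 1 = 00011000
 *	diff      = 00010100	(transitions at bits 2 and 4)
 *
 * update_gap_stats() then runs at i+2 (gap opens) and i+4 (gap closes,
 * size 2).  The 'diff ^= 1' adjustment accounts for a gap already open
 * when bit 0 is reached, since no transition bit marks its start.
 */
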
/************************************************************************
 *			ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i + range1] are
		 * set in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts to
		 * 0, while preserving these invariants.  The updates to mask
		 * leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->u.bmu_bitmap.  If more updates to mask cannot clear
		 * more bits, because mask is partitioned with all 0 bits
		 * preceding all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all
		 * the blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that
	 * one, and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a
		 * successful allocation depends on the next leaf providing
		 * some of the blocks.
		 */
		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
			/*
			 * The next leaf has a different meta-node parent, so
			 * it is not necessarily initialized.  Update bighint,
			 * comparing the range found at the end of mask to the
			 * largest earlier range that could have been made to
			 * vanish in the initial processing of mask.
			 */
			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
			return (SWAPBLK_NONE);
		}
		hi -= BLIST_BMAP_RADIX;
		if (((scan[1].u.bmu_bitmap + 1) &
		    ~((u_daddr_t)-1 << hi)) != 0) {
			/*
			 * The next leaf doesn't have enough free blocks at the
			 * beginning to complete the spanning allocation.  The
			 * hint cannot be updated, because the same allocation
			 * request could be satisfied later, by this leaf, if
			 * the state of the next leaf changes, and without any
			 * changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		}
		/* Clear the first 'hi' bits in the next leaf, allocating them. */
		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
		/* If this allocation uses all the bits, clear the hint. */
		if (mask == scan->u.bmu_bitmap)
			scan->bm_bighint = 0;
	}
	/* Clear the allocated bits from this leaf. */
	scan->u.bmu_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}
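
/*
 * A worked pass of the shift-and-AND loop above, for count = 4 (so
 * count1 = 3 and num_shifts = 2), on an 8-bit mask for brevity:
 *
 *	mask              = 00111100	(bits 2-5 free)
 *	mask &= mask >> 1 : 00011100	(each set bit starts a run of >= 2)
 *	mask &= mask >> 2 : 00000100	(each set bit starts a run of >= 4)
 *
 * One bit survives at position 2, so lo = 2 and the allocation is placed
 * at (blk & ~BLIST_BMAP_MASK) + 2.  The shift amounts (1, then 2) are the
 * successive values of range_ext.
 */
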
/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	blk = cursor & -radix;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}
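
/*
 * The child-index arithmetic above follows the linear layout: child d of
 * a meta node at scan[0] lives at scan[1 + d * next_skip].  A worked
 * descent (illustrative, again assuming BLIST_BMAP_RADIX = 64 and
 * BLIST_META_RADIX = 16), for radix = 16384 and cursor = 2500:
 *
 *	blk  = 2500 & -16384 = 0
 *	skip = 16384/60 = 273, next_skip = 273/16 = 17
 *	radix /= 16 -> 1024, child = 2500/1024 = 2
 *	blk  = 2 * 1024 = 2048, first i = 1 + 2 * 17 = 35
 *
 * so the scan resumes at the subtree covering blocks 2048-3071, which
 * contains the cursor.
 */
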
/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int n;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
	if (scan->u.bmu_bitmap & mask)
		panic("freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}
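
/*
 * A worked split for the loop above (illustrative; BLIST_BMAP_RADIX = 64
 * and BLIST_META_RADIX = 16 assumed): freeing [100, 300) at a meta node
 * of radix 1024 descends with child radix 64.  The first child touched
 * is child 100/64 = 1, and the range is carved against child boundaries:
 *
 *	child 1 (blocks 64-127):   v = 64 + 64 - 100 = 28 blocks
 *	child 2 (blocks 128-191):  v = 64 blocks
 *	child 3 (blocks 192-255):  v = 64 blocks
 *	child 4 (blocks 256-319):  v = 300 - 256 = 44 blocks
 *
 * for 28 + 64 + 64 + 44 = 200 blocks freed in total.
 */
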
/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space must not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}


	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;
	int n;

	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}
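
/*
 * The mask construction above (also used by blst_leaf_free) selects
 * exactly 'count' bits starting at bit 'n'.  A worked case with n = 3
 * and count = 4 on a 64-bit u_daddr_t:
 *
 *	(u_daddr_t)-1 << 3         sets bits 3..63
 *	(u_daddr_t)-1 >> (64-4-3)  sets bits 0..6
 *	ANDed together:            bits 3..6, i.e. 4 bits at offset 3
 */
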
/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after resizing */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif