1 /*- 2 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 3 * Redistribution and use in source and binary forms, with or without 4 * modification, are permitted provided that the following conditions 5 * are met: 6 * 1. Redistributions of source code must retain the above copyright 7 * notice, this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright 9 * notice, this list of conditions and the following disclaimer in the 10 * documentation and/or other materials provided with the distribution. 11 * 3. Neither the name of the University nor the names of its contributors 12 * may be used to endorse or promote products derived from this software 13 * without specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 19 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 21 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 /* 28 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 29 * 30 * This module implements a general bitmap allocator/deallocator. The 31 * allocator eats around 2 bits per 'block'. The module does not 32 * try to interpret the meaning of a 'block' other than to return 33 * SWAPBLK_NONE on an allocation failure. 34 * 35 * A radix tree controls access to pieces of the bitmap, and includes 36 * auxiliary information at each interior node about the availabilty of 37 * contiguous free blocks in the subtree rooted at that node. Two radix 38 * constants are involved: one for the size of the bitmaps contained in the 39 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of 40 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is 41 * associated with a range of blocks. The root of any subtree stores a 42 * hint field that defines an upper bound on the size of the largest 43 * allocation that can begin in the associated block range. A hint is an 44 * upper bound on a potential allocation, but not necessarily a tight upper 45 * bound. 46 * 47 * The radix tree also implements two collapsed states for meta nodes: 48 * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is 49 * in either of these two states, all information contained underneath 50 * the node is considered stale. These states are used to optimize 51 * allocation and freeing operations. 52 * 53 * The hinting greatly increases code efficiency for allocations while 54 * the general radix structure optimizes both allocations and frees. The 55 * radix tree should be able to operate well no matter how much 56 * fragmentation there is and no matter how large a bitmap is used. 57 * 58 * The blist code wires all necessary memory at creation time. Neither 59 * allocations nor frees require interaction with the memory subsystem. 60 * The non-blocking features of the blist code are used in the swap code 61 * (vm/swap_pager.c). 62 * 63 * LAYOUT: The radix tree is laid out recursively using a 64 * linear array. Each meta node is immediately followed (laid out 65 * sequentially in memory) by BLIST_META_RADIX lower level nodes. This 66 * is a recursive structure but one that can be easily scanned through 67 * a very simple 'skip' calculation. In order to support large radixes, 68 * portions of the tree may reside outside our memory allocation. We 69 * handle this with an early-termination optimization (when bighint is 70 * set to -1) on the scan. The memory allocation is only large enough 71 * to cover the number of blocks requested at creation time even if it 72 * must be encompassed in larger root-node radix. 73 * 74 * NOTE: the allocator cannot currently allocate more than 75 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too 76 * large' if you try. This is an area that could use improvement. The 77 * radix is large enough that this restriction does not effect the swap 78 * system, though. Currently only the allocation code is affected by 79 * this algorithmic unfeature. The freeing code can handle arbitrary 80 * ranges. 81 * 82 * This code can be compiled stand-alone for debugging. 83 */ 84 85 #include <sys/cdefs.h> 86 __FBSDID("$FreeBSD$"); 87 88 #ifdef _KERNEL 89 90 #include <sys/param.h> 91 #include <sys/systm.h> 92 #include <sys/lock.h> 93 #include <sys/kernel.h> 94 #include <sys/blist.h> 95 #include <sys/malloc.h> 96 #include <sys/sbuf.h> 97 #include <sys/proc.h> 98 #include <sys/mutex.h> 99 100 #else 101 102 #ifndef BLIST_NO_DEBUG 103 #define BLIST_DEBUG 104 #endif 105 106 #include <sys/types.h> 107 #include <sys/malloc.h> 108 #include <sys/sbuf.h> 109 #include <stdio.h> 110 #include <string.h> 111 #include <stddef.h> 112 #include <stdlib.h> 113 #include <stdarg.h> 114 #include <stdbool.h> 115 116 #define bitcount64(x) __bitcount64((uint64_t)(x)) 117 #define malloc(a,b,c) calloc(a, 1) 118 #define free(a,b) free(a) 119 static __inline int imax(int a, int b) { return (a > b ? a : b); } 120 121 #include <sys/blist.h> 122 123 void panic(const char *ctl, ...); 124 125 #endif 126 127 /* 128 * static support functions 129 */ 130 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); 131 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, 132 u_daddr_t radix); 133 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); 134 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 135 u_daddr_t radix); 136 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 137 blist_t dest, daddr_t count); 138 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); 139 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, 140 u_daddr_t radix); 141 #ifndef _KERNEL 142 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, 143 int tab); 144 #endif 145 146 #ifdef _KERNEL 147 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); 148 #endif 149 150 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0, 151 "radix divisibility error"); 152 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1) 153 #define BLIST_META_MASK (BLIST_META_RADIX - 1) 154 155 /* 156 * For a subtree that can represent the state of up to 'radix' blocks, the 157 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm' 158 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h 159 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, 160 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' 161 * in the 'meta' functions that process subtrees. Since integer division 162 * discards remainders, we can express this computation as 163 * skip = (m * m**h) / (m - 1) 164 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1) 165 * and since m divides BLIST_BMAP_RADIX, we can simplify further to 166 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1) 167 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1)) 168 * so that simple integer division by a constant can safely be used for the 169 * calculation. 170 */ 171 static inline daddr_t 172 radix_to_skip(daddr_t radix) 173 { 174 175 return (radix / 176 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK)); 177 } 178 179 /* 180 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t. 181 * Assumes that the argument has only one bit set. 182 */ 183 static inline int 184 bitpos(u_daddr_t mask) 185 { 186 int hi, lo, mid; 187 188 switch (sizeof(mask)) { 189 #ifdef HAVE_INLINE_FFSLL 190 case sizeof(long long): 191 return (ffsll(mask) - 1); 192 #endif 193 default: 194 lo = 0; 195 hi = BLIST_BMAP_RADIX; 196 while (lo + 1 < hi) { 197 mid = (lo + hi) >> 1; 198 if ((mask >> mid) != 0) 199 lo = mid; 200 else 201 hi = mid; 202 } 203 return (lo); 204 } 205 } 206 207 /* 208 * blist_create() - create a blist capable of handling up to the specified 209 * number of blocks 210 * 211 * blocks - must be greater than 0 212 * flags - malloc flags 213 * 214 * The smallest blist consists of a single leaf node capable of 215 * managing BLIST_BMAP_RADIX blocks. 216 */ 217 blist_t 218 blist_create(daddr_t blocks, int flags) 219 { 220 blist_t bl; 221 daddr_t i, last_block; 222 u_daddr_t nodes, radix, skip; 223 int digit; 224 225 /* 226 * Calculate the radix and node count used for scanning. Find the last 227 * block that is followed by a terminator. 228 */ 229 last_block = blocks - 1; 230 radix = BLIST_BMAP_RADIX; 231 while (radix < blocks) { 232 if (((last_block / radix + 1) & BLIST_META_MASK) != 0) 233 /* 234 * A terminator will be added. Update last_block to the 235 * position just before that terminator. 236 */ 237 last_block |= radix - 1; 238 radix *= BLIST_META_RADIX; 239 } 240 241 /* 242 * Count the meta-nodes in the expanded tree, including the final 243 * terminator, from the bottom level up to the root. 244 */ 245 nodes = (last_block >= blocks) ? 2 : 1; 246 last_block /= BLIST_BMAP_RADIX; 247 while (last_block > 0) { 248 nodes += last_block + 1; 249 last_block /= BLIST_META_RADIX; 250 } 251 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags | 252 M_ZERO); 253 if (bl == NULL) 254 return (NULL); 255 256 bl->bl_blocks = blocks; 257 bl->bl_radix = radix; 258 bl->bl_cursor = 0; 259 260 /* 261 * Initialize the empty tree by filling in root values, then initialize 262 * just the terminators in the rest of the tree. 263 */ 264 bl->bl_root[0].bm_bighint = 0; 265 if (radix == BLIST_BMAP_RADIX) 266 bl->bl_root[0].u.bmu_bitmap = 0; 267 else 268 bl->bl_root[0].u.bmu_avail = 0; 269 last_block = blocks - 1; 270 i = 0; 271 while (radix > BLIST_BMAP_RADIX) { 272 radix /= BLIST_META_RADIX; 273 skip = radix_to_skip(radix); 274 digit = last_block / radix; 275 i += 1 + digit * skip; 276 if (digit != BLIST_META_MASK) { 277 /* 278 * Add a terminator. 279 */ 280 bl->bl_root[i + skip].bm_bighint = (daddr_t)-1; 281 bl->bl_root[i + skip].u.bmu_bitmap = 0; 282 } 283 last_block %= radix; 284 } 285 286 #if defined(BLIST_DEBUG) 287 printf( 288 "BLIST representing %lld blocks (%lld MB of swap)" 289 ", requiring %lldK of ram\n", 290 (long long)bl->bl_blocks, 291 (long long)bl->bl_blocks * 4 / 1024, 292 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 293 ); 294 printf("BLIST raw radix tree contains %lld records\n", 295 (long long)nodes); 296 #endif 297 298 return (bl); 299 } 300 301 void 302 blist_destroy(blist_t bl) 303 { 304 305 free(bl, M_SWAP); 306 } 307 308 /* 309 * blist_alloc() - reserve space in the block bitmap. Return the base 310 * of a contiguous region or SWAPBLK_NONE if space could 311 * not be allocated. 312 */ 313 daddr_t 314 blist_alloc(blist_t bl, daddr_t count) 315 { 316 daddr_t blk; 317 318 /* 319 * This loop iterates at most twice. An allocation failure in the 320 * first iteration leads to a second iteration only if the cursor was 321 * non-zero. When the cursor is zero, an allocation failure will 322 * reduce the hint, stopping further iterations. 323 */ 324 while (count <= bl->bl_root->bm_bighint) { 325 blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count, 326 bl->bl_radix); 327 if (blk != SWAPBLK_NONE) { 328 bl->bl_cursor = blk + count; 329 if (bl->bl_cursor == bl->bl_blocks) 330 bl->bl_cursor = 0; 331 return (blk); 332 } else if (bl->bl_cursor != 0) 333 bl->bl_cursor = 0; 334 } 335 return (SWAPBLK_NONE); 336 } 337 338 /* 339 * blist_avail() - return the number of free blocks. 340 */ 341 daddr_t 342 blist_avail(blist_t bl) 343 { 344 345 if (bl->bl_radix == BLIST_BMAP_RADIX) 346 return (bitcount64(bl->bl_root->u.bmu_bitmap)); 347 else 348 return (bl->bl_root->u.bmu_avail); 349 } 350 351 /* 352 * blist_free() - free up space in the block bitmap. Return the base 353 * of a contiguous region. Panic if an inconsistancy is 354 * found. 355 */ 356 void 357 blist_free(blist_t bl, daddr_t blkno, daddr_t count) 358 { 359 360 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); 361 } 362 363 /* 364 * blist_fill() - mark a region in the block bitmap as off-limits 365 * to the allocator (i.e. allocate it), ignoring any 366 * existing allocations. Return the number of blocks 367 * actually filled that were free before the call. 368 */ 369 daddr_t 370 blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 371 { 372 373 return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix)); 374 } 375 376 /* 377 * blist_resize() - resize an existing radix tree to handle the 378 * specified number of blocks. This will reallocate 379 * the tree and transfer the previous bitmap to the new 380 * one. When extending the tree you can specify whether 381 * the new blocks are to left allocated or freed. 382 */ 383 void 384 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 385 { 386 blist_t newbl = blist_create(count, flags); 387 blist_t save = *pbl; 388 389 *pbl = newbl; 390 if (count > save->bl_blocks) 391 count = save->bl_blocks; 392 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 393 394 /* 395 * If resizing upwards, should we free the new space or not? 396 */ 397 if (freenew && count < newbl->bl_blocks) { 398 blist_free(newbl, count, newbl->bl_blocks - count); 399 } 400 blist_destroy(save); 401 } 402 403 #ifdef BLIST_DEBUG 404 405 /* 406 * blist_print() - dump radix tree 407 */ 408 void 409 blist_print(blist_t bl) 410 { 411 printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor); 412 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 413 printf("}\n"); 414 } 415 416 #endif 417 418 static const u_daddr_t fib[] = { 419 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 420 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 421 514229, 832040, 1346269, 2178309, 3524578, 422 }; 423 424 /* 425 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 426 */ 427 struct gap_stats { 428 daddr_t start; /* current gap start, or SWAPBLK_NONE */ 429 daddr_t num; /* number of gaps observed */ 430 daddr_t max; /* largest gap size */ 431 daddr_t avg; /* average gap size */ 432 daddr_t err; /* sum - num * avg */ 433 daddr_t histo[nitems(fib)]; /* # gaps in each size range */ 434 int max_bucket; /* last histo elt with nonzero val */ 435 }; 436 437 /* 438 * gap_stats_counting() - is the state 'counting 1 bits'? 439 * or 'skipping 0 bits'? 440 */ 441 static inline bool 442 gap_stats_counting(const struct gap_stats *stats) 443 { 444 445 return (stats->start != SWAPBLK_NONE); 446 } 447 448 /* 449 * init_gap_stats() - initialize stats on gap sizes 450 */ 451 static inline void 452 init_gap_stats(struct gap_stats *stats) 453 { 454 455 bzero(stats, sizeof(*stats)); 456 stats->start = SWAPBLK_NONE; 457 } 458 459 /* 460 * update_gap_stats() - update stats on gap sizes 461 */ 462 static void 463 update_gap_stats(struct gap_stats *stats, daddr_t posn) 464 { 465 daddr_t size; 466 int hi, lo, mid; 467 468 if (!gap_stats_counting(stats)) { 469 stats->start = posn; 470 return; 471 } 472 size = posn - stats->start; 473 stats->start = SWAPBLK_NONE; 474 if (size > stats->max) 475 stats->max = size; 476 477 /* 478 * Find the fibonacci range that contains size, 479 * expecting to find it in an early range. 480 */ 481 lo = 0; 482 hi = 1; 483 while (hi < nitems(fib) && fib[hi] <= size) { 484 lo = hi; 485 hi *= 2; 486 } 487 if (hi >= nitems(fib)) 488 hi = nitems(fib); 489 while (lo + 1 != hi) { 490 mid = (lo + hi) >> 1; 491 if (fib[mid] <= size) 492 lo = mid; 493 else 494 hi = mid; 495 } 496 stats->histo[lo]++; 497 if (lo > stats->max_bucket) 498 stats->max_bucket = lo; 499 stats->err += size - stats->avg; 500 stats->num++; 501 stats->avg += stats->err / stats->num; 502 stats->err %= stats->num; 503 } 504 505 /* 506 * dump_gap_stats() - print stats on gap sizes 507 */ 508 static inline void 509 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) 510 { 511 int i; 512 513 sbuf_printf(s, "number of maximal free ranges: %jd\n", 514 (intmax_t)stats->num); 515 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); 516 sbuf_printf(s, "average maximal free range size: %jd\n", 517 (intmax_t)stats->avg); 518 sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); 519 sbuf_printf(s, " count | size range\n"); 520 sbuf_printf(s, " ----- | ----------\n"); 521 for (i = 0; i < stats->max_bucket; i++) { 522 if (stats->histo[i] != 0) { 523 sbuf_printf(s, "%20jd | ", 524 (intmax_t)stats->histo[i]); 525 if (fib[i] != fib[i + 1] - 1) 526 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 527 (intmax_t)fib[i + 1] - 1); 528 else 529 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); 530 } 531 } 532 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); 533 if (stats->histo[i] > 1) 534 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 535 (intmax_t)stats->max); 536 else 537 sbuf_printf(s, "%jd\n", (intmax_t)stats->max); 538 } 539 540 /* 541 * blist_stats() - dump radix tree stats 542 */ 543 void 544 blist_stats(blist_t bl, struct sbuf *s) 545 { 546 struct gap_stats gstats; 547 struct gap_stats *stats = &gstats; 548 daddr_t i, nodes, radix; 549 u_daddr_t bit, diff, mask; 550 551 init_gap_stats(stats); 552 nodes = 0; 553 i = bl->bl_radix; 554 while (i < bl->bl_radix + bl->bl_blocks) { 555 /* 556 * Find max size subtree starting at i. 557 */ 558 radix = BLIST_BMAP_RADIX; 559 while (((i / radix) & BLIST_META_MASK) == 0) 560 radix *= BLIST_META_RADIX; 561 562 /* 563 * Check for skippable subtrees starting at i. 564 */ 565 while (radix > BLIST_BMAP_RADIX) { 566 if (bl->bl_root[nodes].u.bmu_avail == 0) { 567 if (gap_stats_counting(stats)) 568 update_gap_stats(stats, i); 569 break; 570 } 571 if (bl->bl_root[nodes].u.bmu_avail == radix) { 572 if (!gap_stats_counting(stats)) 573 update_gap_stats(stats, i); 574 break; 575 } 576 577 /* 578 * Skip subtree root. 579 */ 580 nodes++; 581 radix /= BLIST_META_RADIX; 582 } 583 if (radix == BLIST_BMAP_RADIX) { 584 /* 585 * Scan leaf. 586 */ 587 mask = bl->bl_root[nodes].u.bmu_bitmap; 588 diff = mask ^ (mask << 1); 589 if (gap_stats_counting(stats)) 590 diff ^= 1; 591 while (diff != 0) { 592 bit = diff & -diff; 593 update_gap_stats(stats, i + bitpos(bit)); 594 diff ^= bit; 595 } 596 } 597 nodes += radix_to_skip(radix); 598 i += radix; 599 } 600 update_gap_stats(stats, i); 601 dump_gap_stats(stats, s); 602 } 603 604 /************************************************************************ 605 * ALLOCATION SUPPORT FUNCTIONS * 606 ************************************************************************ 607 * 608 * These support functions do all the actual work. They may seem 609 * rather longish, but that's because I've commented them up. The 610 * actual code is straight forward. 611 * 612 */ 613 614 /* 615 * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). 616 * 617 * This is the core of the allocator and is optimized for the 618 * BLIST_BMAP_RADIX block allocation case. Otherwise, execution 619 * time is proportional to log2(count) + bitpos time. 620 */ 621 static daddr_t 622 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) 623 { 624 u_daddr_t mask; 625 int count1, hi, lo, num_shifts, range1, range_ext; 626 627 range1 = 0; 628 count1 = count - 1; 629 num_shifts = fls(count1); 630 mask = scan->u.bmu_bitmap; 631 while ((-mask & ~mask) != 0 && num_shifts > 0) { 632 /* 633 * If bit i is set in mask, then bits in [i, i+range1] are set 634 * in scan->u.bmu_bitmap. The value of range1 is equal to 635 * count1 >> num_shifts. Grow range and reduce num_shifts to 0, 636 * while preserving these invariants. The updates to mask leave 637 * fewer bits set, but each bit that remains set represents a 638 * longer string of consecutive bits set in scan->u.bmu_bitmap. 639 * If more updates to mask cannot clear more bits, because mask 640 * is partitioned with all 0 bits preceding all 1 bits, the loop 641 * terminates immediately. 642 */ 643 num_shifts--; 644 range_ext = range1 + ((count1 >> num_shifts) & 1); 645 /* 646 * mask is a signed quantity for the shift because when it is 647 * shifted right, the sign bit should copied; when the last 648 * block of the leaf is free, pretend, for a while, that all the 649 * blocks that follow it are also free. 650 */ 651 mask &= (daddr_t)mask >> range_ext; 652 range1 += range_ext; 653 } 654 if (mask == 0) { 655 /* 656 * Update bighint. There is no allocation bigger than range1 657 * starting in this leaf. 658 */ 659 scan->bm_bighint = range1; 660 return (SWAPBLK_NONE); 661 } 662 663 /* Discard any candidates that appear before blk. */ 664 mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK); 665 if (mask == 0) 666 return (SWAPBLK_NONE); 667 668 /* 669 * The least significant set bit in mask marks the start of the first 670 * available range of sufficient size. Clear all the bits but that one, 671 * and then find its position. 672 */ 673 mask &= -mask; 674 lo = bitpos(mask); 675 676 hi = lo + count; 677 if (hi > BLIST_BMAP_RADIX) { 678 /* 679 * An allocation within this leaf is impossible, so a successful 680 * allocation depends on the next leaf providing some of the blocks. 681 */ 682 if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) { 683 /* 684 * The next leaf has a different meta-node parent, so it 685 * is not necessarily initialized. Update bighint, 686 * comparing the range found at the end of mask to the 687 * largest earlier range that could have been made to 688 * vanish in the initial processing of mask. 689 */ 690 scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1); 691 return (SWAPBLK_NONE); 692 } 693 hi -= BLIST_BMAP_RADIX; 694 if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) { 695 /* 696 * The next leaf doesn't have enough free blocks at the 697 * beginning to complete the spanning allocation. The 698 * hint cannot be updated, because the same allocation 699 * request could be satisfied later, by this leaf, if 700 * the state of the next leaf changes, and without any 701 * changes to this leaf. 702 */ 703 return (SWAPBLK_NONE); 704 } 705 /* Clear the first 'hi' bits in the next leaf, allocating them. */ 706 scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi; 707 hi = BLIST_BMAP_RADIX; 708 } 709 710 /* Set the bits of mask at position 'lo' and higher. */ 711 mask = -mask; 712 if (hi == BLIST_BMAP_RADIX) { 713 /* 714 * Update bighint. There is no allocation bigger than range1 715 * available in this leaf after this allocation completes. 716 */ 717 scan->bm_bighint = range1; 718 } else { 719 /* Clear the bits of mask at position 'hi' and higher. */ 720 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi); 721 /* If this allocation uses all the bits, clear the hint. */ 722 if (mask == scan->u.bmu_bitmap) 723 scan->bm_bighint = 0; 724 } 725 /* Clear the allocated bits from this leaf. */ 726 scan->u.bmu_bitmap &= ~mask; 727 return ((blk & ~BLIST_BMAP_MASK) + lo); 728 } 729 730 /* 731 * blist_meta_alloc() - allocate at a meta in the radix tree. 732 * 733 * Attempt to allocate at a meta node. If we can't, we update 734 * bighint and return a failure. Updating bighint optimize future 735 * calls that hit this node. We have to check for our collapse cases 736 * and we have a few optimizations strewn in as well. 737 */ 738 static daddr_t 739 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) 740 { 741 daddr_t blk, i, next_skip, r, skip; 742 int child; 743 bool scan_from_start; 744 745 if (radix == BLIST_BMAP_RADIX) 746 return (blst_leaf_alloc(scan, cursor, count)); 747 if (scan->u.bmu_avail < count) { 748 /* 749 * The meta node's hint must be too large if the allocation 750 * exceeds the number of free blocks. Reduce the hint, and 751 * return failure. 752 */ 753 scan->bm_bighint = scan->u.bmu_avail; 754 return (SWAPBLK_NONE); 755 } 756 blk = cursor & -radix; 757 skip = radix_to_skip(radix); 758 next_skip = skip / BLIST_META_RADIX; 759 760 /* 761 * An ALL-FREE meta node requires special handling before allocating 762 * any of its blocks. 763 */ 764 if (scan->u.bmu_avail == radix) { 765 radix /= BLIST_META_RADIX; 766 767 /* 768 * Reinitialize each of the meta node's children. An ALL-FREE 769 * meta node cannot have a terminator in any subtree. 770 */ 771 for (i = 1; i < skip; i += next_skip) { 772 if (next_skip == 1) 773 scan[i].u.bmu_bitmap = (u_daddr_t)-1; 774 else 775 scan[i].u.bmu_avail = radix; 776 scan[i].bm_bighint = radix; 777 } 778 } else { 779 radix /= BLIST_META_RADIX; 780 } 781 782 if (count > radix) { 783 /* 784 * The allocation exceeds the number of blocks that are 785 * managed by a subtree of this meta node. 786 */ 787 panic("allocation too large"); 788 } 789 scan_from_start = cursor == blk; 790 child = (cursor - blk) / radix; 791 blk += child * radix; 792 for (i = 1 + child * next_skip; i < skip; i += next_skip) { 793 if (count <= scan[i].bm_bighint) { 794 /* 795 * The allocation might fit beginning in the i'th subtree. 796 */ 797 r = blst_meta_alloc(&scan[i], 798 cursor > blk ? cursor : blk, count, radix); 799 if (r != SWAPBLK_NONE) { 800 scan->u.bmu_avail -= count; 801 return (r); 802 } 803 } else if (scan[i].bm_bighint == (daddr_t)-1) { 804 /* 805 * Terminator 806 */ 807 break; 808 } 809 blk += radix; 810 } 811 812 /* 813 * We couldn't allocate count in this subtree, update bighint. 814 */ 815 if (scan_from_start && scan->bm_bighint >= count) 816 scan->bm_bighint = count - 1; 817 818 return (SWAPBLK_NONE); 819 } 820 821 /* 822 * BLST_LEAF_FREE() - free allocated block from leaf bitmap 823 * 824 */ 825 static void 826 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) 827 { 828 u_daddr_t mask; 829 int n; 830 831 /* 832 * free some data in this bitmap 833 * mask=0000111111111110000 834 * \_________/\__/ 835 * count n 836 */ 837 n = blk & BLIST_BMAP_MASK; 838 mask = ((u_daddr_t)-1 << n) & 839 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); 840 if (scan->u.bmu_bitmap & mask) 841 panic("freeing free block"); 842 scan->u.bmu_bitmap |= mask; 843 844 /* 845 * We could probably do a better job here. We are required to make 846 * bighint at least as large as the biggest contiguous block of 847 * data. If we just shoehorn it, a little extra overhead will 848 * be incured on the next allocation (but only that one typically). 849 */ 850 scan->bm_bighint = BLIST_BMAP_RADIX; 851 } 852 853 /* 854 * BLST_META_FREE() - free allocated blocks from radix tree meta info 855 * 856 * This support routine frees a range of blocks from the bitmap. 857 * The range must be entirely enclosed by this radix node. If a 858 * meta node, we break the range down recursively to free blocks 859 * in subnodes (which means that this code can free an arbitrary 860 * range whereas the allocation code cannot allocate an arbitrary 861 * range). 862 */ 863 static void 864 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) 865 { 866 daddr_t blk, i, next_skip, skip, v; 867 int child; 868 869 if (scan->bm_bighint == (daddr_t)-1) 870 panic("freeing invalid range"); 871 if (radix == BLIST_BMAP_RADIX) 872 return (blst_leaf_free(scan, freeBlk, count)); 873 skip = radix_to_skip(radix); 874 next_skip = skip / BLIST_META_RADIX; 875 876 if (scan->u.bmu_avail == 0) { 877 /* 878 * ALL-ALLOCATED special case, with possible 879 * shortcut to ALL-FREE special case. 880 */ 881 scan->u.bmu_avail = count; 882 scan->bm_bighint = count; 883 884 if (count != radix) { 885 for (i = 1; i < skip; i += next_skip) { 886 if (scan[i].bm_bighint == (daddr_t)-1) 887 break; 888 scan[i].bm_bighint = 0; 889 if (next_skip == 1) { 890 scan[i].u.bmu_bitmap = 0; 891 } else { 892 scan[i].u.bmu_avail = 0; 893 } 894 } 895 /* fall through */ 896 } 897 } else { 898 scan->u.bmu_avail += count; 899 /* scan->bm_bighint = radix; */ 900 } 901 902 /* 903 * ALL-FREE special case. 904 */ 905 906 if (scan->u.bmu_avail == radix) 907 return; 908 if (scan->u.bmu_avail > radix) 909 panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld", 910 (long long)count, (long long)scan->u.bmu_avail, 911 (long long)radix); 912 913 /* 914 * Break the free down into its components 915 */ 916 917 blk = freeBlk & -radix; 918 radix /= BLIST_META_RADIX; 919 920 child = (freeBlk - blk) / radix; 921 blk += child * radix; 922 i = 1 + child * next_skip; 923 while (i < skip && blk < freeBlk + count) { 924 v = blk + radix - freeBlk; 925 if (v > count) 926 v = count; 927 blst_meta_free(&scan[i], freeBlk, v, radix); 928 if (scan->bm_bighint < scan[i].bm_bighint) 929 scan->bm_bighint = scan[i].bm_bighint; 930 count -= v; 931 freeBlk += v; 932 blk += radix; 933 i += next_skip; 934 } 935 } 936 937 /* 938 * BLIST_RADIX_COPY() - copy one radix tree to another 939 * 940 * Locates free space in the source tree and frees it in the destination 941 * tree. The space may not already be free in the destination. 942 */ 943 static void 944 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, 945 daddr_t count) 946 { 947 daddr_t i, next_skip, skip; 948 949 /* 950 * Leaf node 951 */ 952 953 if (radix == BLIST_BMAP_RADIX) { 954 u_daddr_t v = scan->u.bmu_bitmap; 955 956 if (v == (u_daddr_t)-1) { 957 blist_free(dest, blk, count); 958 } else if (v != 0) { 959 int i; 960 961 for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { 962 if (v & ((u_daddr_t)1 << i)) 963 blist_free(dest, blk + i, 1); 964 } 965 } 966 return; 967 } 968 969 /* 970 * Meta node 971 */ 972 973 if (scan->u.bmu_avail == 0) { 974 /* 975 * Source all allocated, leave dest allocated 976 */ 977 return; 978 } 979 if (scan->u.bmu_avail == radix) { 980 /* 981 * Source all free, free entire dest 982 */ 983 if (count < radix) 984 blist_free(dest, blk, count); 985 else 986 blist_free(dest, blk, radix); 987 return; 988 } 989 990 991 skip = radix_to_skip(radix); 992 next_skip = skip / BLIST_META_RADIX; 993 radix /= BLIST_META_RADIX; 994 995 for (i = 1; count && i < skip; i += next_skip) { 996 if (scan[i].bm_bighint == (daddr_t)-1) 997 break; 998 999 if (count >= radix) { 1000 blst_copy(&scan[i], blk, radix, dest, radix); 1001 count -= radix; 1002 } else { 1003 if (count) { 1004 blst_copy(&scan[i], blk, radix, dest, count); 1005 } 1006 count = 0; 1007 } 1008 blk += radix; 1009 } 1010 } 1011 1012 /* 1013 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap 1014 * 1015 * This routine allocates all blocks in the specified range 1016 * regardless of any existing allocations in that range. Returns 1017 * the number of blocks allocated by the call. 1018 */ 1019 static daddr_t 1020 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) 1021 { 1022 daddr_t nblks; 1023 u_daddr_t mask; 1024 int n; 1025 1026 n = blk & BLIST_BMAP_MASK; 1027 mask = ((u_daddr_t)-1 << n) & 1028 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); 1029 1030 /* Count the number of blocks that we are allocating. */ 1031 nblks = bitcount64(scan->u.bmu_bitmap & mask); 1032 1033 scan->u.bmu_bitmap &= ~mask; 1034 return (nblks); 1035 } 1036 1037 /* 1038 * BLIST_META_FILL() - allocate specific blocks at a meta node 1039 * 1040 * This routine allocates the specified range of blocks, 1041 * regardless of any existing allocations in the range. The 1042 * range must be within the extent of this node. Returns the 1043 * number of blocks allocated by the call. 1044 */ 1045 static daddr_t 1046 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix) 1047 { 1048 daddr_t blk, i, nblks, next_skip, skip, v; 1049 int child; 1050 1051 if (scan->bm_bighint == (daddr_t)-1) 1052 panic("filling invalid range"); 1053 if (count > radix) { 1054 /* 1055 * The allocation exceeds the number of blocks that are 1056 * managed by this node. 1057 */ 1058 panic("fill too large"); 1059 } 1060 if (radix == BLIST_BMAP_RADIX) 1061 return (blst_leaf_fill(scan, allocBlk, count)); 1062 if (count == radix || scan->u.bmu_avail == 0) { 1063 /* 1064 * ALL-ALLOCATED special case 1065 */ 1066 nblks = scan->u.bmu_avail; 1067 scan->u.bmu_avail = 0; 1068 scan->bm_bighint = 0; 1069 return (nblks); 1070 } 1071 skip = radix_to_skip(radix); 1072 next_skip = skip / BLIST_META_RADIX; 1073 blk = allocBlk & -radix; 1074 1075 /* 1076 * An ALL-FREE meta node requires special handling before allocating 1077 * any of its blocks. 1078 */ 1079 if (scan->u.bmu_avail == radix) { 1080 radix /= BLIST_META_RADIX; 1081 1082 /* 1083 * Reinitialize each of the meta node's children. An ALL-FREE 1084 * meta node cannot have a terminator in any subtree. 1085 */ 1086 for (i = 1; i < skip; i += next_skip) { 1087 if (next_skip == 1) 1088 scan[i].u.bmu_bitmap = (u_daddr_t)-1; 1089 else 1090 scan[i].u.bmu_avail = radix; 1091 scan[i].bm_bighint = radix; 1092 } 1093 } else { 1094 radix /= BLIST_META_RADIX; 1095 } 1096 1097 nblks = 0; 1098 child = (allocBlk - blk) / radix; 1099 blk += child * radix; 1100 i = 1 + child * next_skip; 1101 while (i < skip && blk < allocBlk + count) { 1102 v = blk + radix - allocBlk; 1103 if (v > count) 1104 v = count; 1105 nblks += blst_meta_fill(&scan[i], allocBlk, v, radix); 1106 count -= v; 1107 allocBlk += v; 1108 blk += radix; 1109 i += next_skip; 1110 } 1111 scan->u.bmu_avail -= nblks; 1112 return (nblks); 1113 } 1114 1115 #ifdef BLIST_DEBUG 1116 1117 static void 1118 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) 1119 { 1120 daddr_t i, next_skip, skip; 1121 1122 if (radix == BLIST_BMAP_RADIX) { 1123 printf( 1124 "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n", 1125 tab, tab, "", 1126 (long long)blk, (long long)radix, 1127 (long long)scan->u.bmu_bitmap, 1128 (long long)scan->bm_bighint 1129 ); 1130 return; 1131 } 1132 1133 if (scan->u.bmu_avail == 0) { 1134 printf( 1135 "%*.*s(%08llx,%lld) ALL ALLOCATED\n", 1136 tab, tab, "", 1137 (long long)blk, 1138 (long long)radix 1139 ); 1140 return; 1141 } 1142 if (scan->u.bmu_avail == radix) { 1143 printf( 1144 "%*.*s(%08llx,%lld) ALL FREE\n", 1145 tab, tab, "", 1146 (long long)blk, 1147 (long long)radix 1148 ); 1149 return; 1150 } 1151 1152 printf( 1153 "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n", 1154 tab, tab, "", 1155 (long long)blk, (long long)radix, 1156 (long long)scan->u.bmu_avail, 1157 (long long)radix, 1158 (long long)scan->bm_bighint 1159 ); 1160 1161 skip = radix_to_skip(radix); 1162 next_skip = skip / BLIST_META_RADIX; 1163 radix /= BLIST_META_RADIX; 1164 tab += 4; 1165 1166 for (i = 1; i < skip; i += next_skip) { 1167 if (scan[i].bm_bighint == (daddr_t)-1) { 1168 printf( 1169 "%*.*s(%08llx,%lld): Terminator\n", 1170 tab, tab, "", 1171 (long long)blk, (long long)radix 1172 ); 1173 break; 1174 } 1175 blst_radix_print(&scan[i], blk, radix, tab); 1176 blk += radix; 1177 } 1178 tab -= 4; 1179 1180 printf( 1181 "%*.*s}\n", 1182 tab, tab, "" 1183 ); 1184 } 1185 1186 #endif 1187 1188 #ifdef BLIST_DEBUG 1189 1190 int 1191 main(int ac, char **av) 1192 { 1193 int size = 1024; 1194 int i; 1195 blist_t bl; 1196 struct sbuf *s; 1197 1198 for (i = 1; i < ac; ++i) { 1199 const char *ptr = av[i]; 1200 if (*ptr != '-') { 1201 size = strtol(ptr, NULL, 0); 1202 continue; 1203 } 1204 ptr += 2; 1205 fprintf(stderr, "Bad option: %s\n", ptr - 2); 1206 exit(1); 1207 } 1208 bl = blist_create(size, M_WAITOK); 1209 blist_free(bl, 0, size); 1210 1211 for (;;) { 1212 char buf[1024]; 1213 long long da = 0; 1214 long long count = 0; 1215 1216 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), 1217 (long long)size, (long long)bl->bl_radix); 1218 fflush(stdout); 1219 if (fgets(buf, sizeof(buf), stdin) == NULL) 1220 break; 1221 switch(buf[0]) { 1222 case 'r': 1223 if (sscanf(buf + 1, "%lld", &count) == 1) { 1224 blist_resize(&bl, count, 1, M_WAITOK); 1225 } else { 1226 printf("?\n"); 1227 } 1228 case 'p': 1229 blist_print(bl); 1230 break; 1231 case 's': 1232 s = sbuf_new_auto(); 1233 blist_stats(bl, s); 1234 sbuf_finish(s); 1235 printf("%s", sbuf_data(s)); 1236 sbuf_delete(s); 1237 break; 1238 case 'a': 1239 if (sscanf(buf + 1, "%lld", &count) == 1) { 1240 daddr_t blk = blist_alloc(bl, count); 1241 printf(" R=%08llx\n", (long long)blk); 1242 } else { 1243 printf("?\n"); 1244 } 1245 break; 1246 case 'f': 1247 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { 1248 blist_free(bl, da, count); 1249 } else { 1250 printf("?\n"); 1251 } 1252 break; 1253 case 'l': 1254 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { 1255 printf(" n=%jd\n", 1256 (intmax_t)blist_fill(bl, da, count)); 1257 } else { 1258 printf("?\n"); 1259 } 1260 break; 1261 case '?': 1262 case 'h': 1263 puts( 1264 "p -print\n" 1265 "s -stats\n" 1266 "a %d -allocate\n" 1267 "f %x %d -free\n" 1268 "l %x %d -fill\n" 1269 "r %d -resize\n" 1270 "h/? -help" 1271 ); 1272 break; 1273 default: 1274 printf("?\n"); 1275 break; 1276 } 1277 } 1278 return(0); 1279 } 1280 1281 void 1282 panic(const char *ctl, ...) 1283 { 1284 va_list va; 1285 1286 va_start(va, ctl); 1287 vfprintf(stderr, ctl, va); 1288 fprintf(stderr, "\n"); 1289 va_end(va); 1290 exit(1); 1291 } 1292 1293 #endif 1294