1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 /* 30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 31 * 32 * This module implements a general bitmap allocator/deallocator. The 33 * allocator eats around 2 bits per 'block'. The module does not 34 * try to interpret the meaning of a 'block' other than to return 35 * SWAPBLK_NONE on an allocation failure. 
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The bitmap field in each node directs the search for available blocks.
 *	For a leaf node, a bit is set if the corresponding block is free.  For a
 *	meta node, a bit is set if the corresponding subtree contains a free
 *	block somewhere within it.  The search at a meta node considers only
 *	children of that node that represent a range that includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower level nodes.
This is a recursive
 *	structure but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_META_RADIX lower level nodes of some nodes may not be
 *	allocated.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

/*
 * Userland shims for the kernel interfaces used below: kernel malloc/free
 * take a type and flags, and bitcount64 is a kernel primitive.
 */
#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define	malloc(a,b,c)	calloc(a, 1)
#define	free(a,b)	free(a)
#define	ummin(a,b)	((a) < (b) ? (a) : (b))

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

/* The skip computation in radix_to_skip() relies on this divisibility. */
_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.
Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}

/*
 * Provide a mask with count bits set, starting at position n.
 *
 * Assumes 0 < count and n + count <= BLIST_BMAP_RADIX; otherwise one of
 * the shifts below would be by the full type width (undefined behavior).
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}


/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		/* Binary search: invariant is (mask >> lo) != 0. */
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	if (blocks == 0)
		panic("invalid block count");

	/*
	 * Calculate the radix and node count used for scanning.  Each
	 * pass adds one tree level; only nodes that can ever be touched
	 * for 'blocks' blocks are counted (partially-populated levels
	 * are truncated).
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	/* Node array is allocated inline with the header (see bl_root). */
	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	if (count > BLIST_MAX_ALLOC)
		panic("allocation too large");

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= count;
			/* Advance the cursor for next-fit behavior. */
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}

/*
 * blist_avail() -	return the number of free blocks.
317 */ 318 daddr_t 319 blist_avail(blist_t bl) 320 { 321 322 return (bl->bl_avail); 323 } 324 325 /* 326 * blist_free() - free up space in the block bitmap. Return the base 327 * of a contiguous region. Panic if an inconsistancy is 328 * found. 329 */ 330 void 331 blist_free(blist_t bl, daddr_t blkno, daddr_t count) 332 { 333 334 if (blkno < 0 || blkno + count > bl->bl_blocks) 335 panic("freeing invalid range"); 336 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); 337 bl->bl_avail += count; 338 } 339 340 /* 341 * blist_fill() - mark a region in the block bitmap as off-limits 342 * to the allocator (i.e. allocate it), ignoring any 343 * existing allocations. Return the number of blocks 344 * actually filled that were free before the call. 345 */ 346 daddr_t 347 blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 348 { 349 daddr_t filled; 350 351 if (blkno < 0 || blkno + count > bl->bl_blocks) 352 panic("filling invalid range"); 353 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); 354 bl->bl_avail -= filled; 355 return (filled); 356 } 357 358 /* 359 * blist_resize() - resize an existing radix tree to handle the 360 * specified number of blocks. This will reallocate 361 * the tree and transfer the previous bitmap to the new 362 * one. When extending the tree you can specify whether 363 * the new blocks are to left allocated or freed. 364 */ 365 void 366 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 367 { 368 blist_t newbl = blist_create(count, flags); 369 blist_t save = *pbl; 370 371 *pbl = newbl; 372 if (count > save->bl_blocks) 373 count = save->bl_blocks; 374 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 375 376 /* 377 * If resizing upwards, should we free the new space or not? 
378 */ 379 if (freenew && count < newbl->bl_blocks) { 380 blist_free(newbl, count, newbl->bl_blocks - count); 381 } 382 blist_destroy(save); 383 } 384 385 #ifdef BLIST_DEBUG 386 387 /* 388 * blist_print() - dump radix tree 389 */ 390 void 391 blist_print(blist_t bl) 392 { 393 printf("BLIST avail = %jd, cursor = %08jx {\n", 394 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); 395 396 if (bl->bl_root->bm_bitmap != 0) 397 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 398 printf("}\n"); 399 } 400 401 #endif 402 403 static const u_daddr_t fib[] = { 404 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 405 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 406 514229, 832040, 1346269, 2178309, 3524578, 407 }; 408 409 /* 410 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 411 */ 412 struct gap_stats { 413 daddr_t start; /* current gap start, or SWAPBLK_NONE */ 414 daddr_t num; /* number of gaps observed */ 415 daddr_t max; /* largest gap size */ 416 daddr_t avg; /* average gap size */ 417 daddr_t err; /* sum - num * avg */ 418 daddr_t histo[nitems(fib)]; /* # gaps in each size range */ 419 int max_bucket; /* last histo elt with nonzero val */ 420 }; 421 422 /* 423 * gap_stats_counting() - is the state 'counting 1 bits'? 424 * or 'skipping 0 bits'? 
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats() -	initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats() -	update stats on gap sizes
 *
 *	Called at each free/allocated transition.  If not currently inside
 *	a gap, 'posn' starts one; otherwise 'posn' ends the gap and its
 *	size is folded into the statistics.
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	/* Galloping search to bracket the bucket, then binary search. */
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	/*
	 * Maintain a running average without a running sum (which could
	 * overflow): avg accumulates err/num and err keeps the remainder.
	 */
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}

/*
 * dump_gap_stats() -	print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	/* The last nonempty bucket is reported with the observed maximum. */
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats() -	dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	/* Offset i by bl_radix so digit extraction below works at the root. */
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				/* Entirely allocated: close any open gap. */
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.  mask ^ (mask << 1) has a bit set at
			 * every free/allocated transition within the leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	/* Close a gap that runs to the end of the bitmap, if any. */
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straight forward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
 *
 *	'scan' is a leaf node, associated with a block containing 'blk'.
 *	The next leaf node could be adjacent, or several nodes away if the
 *	least common ancestor of 'scan' and its neighbor is several levels
 *	up.  Use 'blk' to determine how many meta-nodes lie between the
 *	leaves.  If the next leaf has enough initial bits set, clear them
 *	and clear the bits in the meta nodes on the path up to the least
 *	common ancestor to mark any subtrees made completely empty.
 *
 *	Returns 0 on success, ENOMEM if the next leaf cannot supply the
 *	first 'count' blocks.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	blmeta_t *next;
	daddr_t skip;
	u_daddr_t radix;
	int digit;

	next = scan + 1;
	blk += BLIST_BMAP_RADIX;
	radix = BLIST_BMAP_RADIX;
	/*
	 * Climb past intervening meta nodes.  digit == 0 means blk is the
	 * first block of its parent's range at this level; bit 0 set in
	 * the parent means its first child subtree still has free space.
	 */
	while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
	    (next->bm_bitmap & 1) == 1) {
		next++;
		radix *= BLIST_META_RADIX;
	}
	if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
		/*
		 * The next leaf doesn't have enough free blocks at the
		 * beginning to complete the spanning allocation.
		 */
		return (ENOMEM);
	}
	/* Clear the first 'count' bits in the next leaf to allocate. */
	next->bm_bitmap &= (u_daddr_t)-1 << count;

	/*
	 * Update bitmaps of next-ancestors, up to least common ancestor.
	 */
	skip = radix_to_skip(radix);
	while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) {
		(--next)->bm_bitmap ^= 1;
		radix /= BLIST_META_RADIX;
	}
	if (next->bm_bitmap == 0)
		scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit;
	return (0);
}

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->bm_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->bm_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
		 * while preserving these invariants.  The updates to mask leave
		 * fewer bits set, but each bit that remains set represents a
		 * longer string of consecutive bits set in scan->bm_bitmap.
		 * If more updates to mask cannot clear more bits, because mask
		 * is partitioned with all 0 bits preceding all 1 bits, the loop
		 * terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should copied; when the last
		 * block of the leaf is free, pretend, for a while, that all the
		 * blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a successful
		 * allocation depends on the next leaf providing some of the blocks.
		 */
		if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
			/*
			 * The hint cannot be updated, because the same
			 * allocation request could be satisfied later, by this
			 * leaf, if the state of the next leaf changes, and
			 * without any changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
	}
	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}

/*
 * blist_meta_alloc() -	allocate at a meta in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimize future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t bit, mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		bit = mask & -mask;
		digit = bitpos(bit);
		i = 1 + digit * skip;
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, radix);
			if (r != SWAPBLK_NONE) {
				/* Child now empty: clear its bit here. */
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bit;
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bit) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	/* Any overlap with already-set (free) bits is a double free. */
	if (scan->bm_bitmap & mask)
		panic("freeing free block");
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	/* Clip the range to the portion under this node. */
	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	/* Mark every touched child subtree as containing free space. */
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			/* Fully free leaf: free the whole range at once. */
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing
allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLIST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	/* Clip the range to the portion under this node. */
	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		/* Child subtree fully allocated: clear its bit here. */
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}

#ifdef BLIST_DEBUG

/*
 * blst_radix_print() -	recursively dump one subtree, indented by 'tab'.
 */
static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t bit, mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		bit = mask & -mask;
		digit = bitpos(bit);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bit) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

/*
 * Stand-alone interactive test driver: create a blist of 'size' blocks
 * (optionally given as the first command-line argument) and accept
 * allocate/free/fill/print/stats/resize commands from stdin.
 */
int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

/*
 * Userland stand-in for the kernel panic(9): report and exit.
 */
void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif