1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 /* 30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 31 * 32 * This module implements a general bitmap allocator/deallocator. The 33 * allocator eats around 2 bits per 'block'. The module does not 34 * try to interpret the meaning of a 'block' other than to return 35 * SWAPBLK_NONE on an allocation failure. 36 * 37 * A radix tree controls access to pieces of the bitmap, and includes 38 * auxiliary information at each interior node about the availabilty of 39 * contiguous free blocks in the subtree rooted at that node. Two radix 40 * constants are involved: one for the size of the bitmaps contained in the 41 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of 42 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is 43 * associated with a range of blocks. The root of any subtree stores a 44 * hint field that defines an upper bound on the size of the largest 45 * allocation that can begin in the associated block range. A hint is an 46 * upper bound on a potential allocation, but not necessarily a tight upper 47 * bound. 48 * 49 * The bitmap field in each node directs the search for available blocks. 50 * For a leaf node, a bit is set if the corresponding block is free. For a 51 * meta node, a bit is set if the corresponding subtree contains a free 52 * block somewhere within it. The search at a meta node considers only 53 * children of that node that represent a range that includes a free block. 54 * 55 * The hinting greatly increases code efficiency for allocations while 56 * the general radix structure optimizes both allocations and frees. The 57 * radix tree should be able to operate well no matter how much 58 * fragmentation there is and no matter how large a bitmap is used. 59 * 60 * The blist code wires all necessary memory at creation time. Neither 61 * allocations nor frees require interaction with the memory subsystem. 62 * The non-blocking nature of allocations and frees is required by swap 63 * code (vm/swap_pager.c). 64 * 65 * LAYOUT: The radix tree is laid out recursively using a linear array. 66 * Each meta node is immediately followed (laid out sequentially in 67 * memory) by BLIST_META_RADIX lower level nodes. This is a recursive 68 * structure but one that can be easily scanned through a very simple 69 * 'skip' calculation. The memory allocation is only large enough to 70 * cover the number of blocks requested at creation time. Nodes that 71 * represent blocks beyond that limit, nodes that would never be read 72 * or written, are not allocated, so that the last of the 73 * BLIST_META_RADIX lower level nodes of a some nodes may not be 74 * allocated. 75 * 76 * NOTE: the allocator cannot currently allocate more than 77 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too 78 * large' if you try. This is an area that could use improvement. The 79 * radix is large enough that this restriction does not effect the swap 80 * system, though. Currently only the allocation code is affected by 81 * this algorithmic unfeature. The freeing code can handle arbitrary 82 * ranges. 83 * 84 * This code can be compiled stand-alone for debugging. 85 */ 86 87 #include <sys/cdefs.h> 88 __FBSDID("$FreeBSD$"); 89 90 #ifdef _KERNEL 91 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/lock.h> 95 #include <sys/kernel.h> 96 #include <sys/blist.h> 97 #include <sys/malloc.h> 98 #include <sys/sbuf.h> 99 #include <sys/proc.h> 100 #include <sys/mutex.h> 101 102 #else 103 104 #ifndef BLIST_NO_DEBUG 105 #define BLIST_DEBUG 106 #endif 107 108 #include <sys/errno.h> 109 #include <sys/types.h> 110 #include <sys/malloc.h> 111 #include <sys/sbuf.h> 112 #include <assert.h> 113 #include <stdio.h> 114 #include <string.h> 115 #include <stddef.h> 116 #include <stdlib.h> 117 #include <stdarg.h> 118 #include <stdbool.h> 119 120 #define bitcount64(x) __bitcount64((uint64_t)(x)) 121 #define malloc(a,b,c) calloc(a, 1) 122 #define free(a,b) free(a) 123 #define ummin(a,b) ((a) < (b) ? (a) : (b)) 124 #define imin(a,b) ((a) < (b) ? (a) : (b)) 125 #define KASSERT(a,b) assert(a) 126 127 #include <sys/blist.h> 128 129 #endif 130 131 /* 132 * static support functions 133 */ 134 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, 135 int *count, int maxcount); 136 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, 137 int maxcount, u_daddr_t radix); 138 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); 139 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 140 u_daddr_t radix); 141 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 142 blist_t dest, daddr_t count); 143 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); 144 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, 145 u_daddr_t radix); 146 #ifndef _KERNEL 147 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, 148 int tab); 149 #endif 150 151 #ifdef _KERNEL 152 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); 153 #endif 154 155 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0, 156 "radix divisibility error"); 157 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1) 158 #define BLIST_META_MASK (BLIST_META_RADIX - 1) 159 160 /* 161 * For a subtree that can represent the state of up to 'radix' blocks, the 162 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm' 163 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h 164 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, 165 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' 166 * in the 'meta' functions that process subtrees. Since integer division 167 * discards remainders, we can express this computation as 168 * skip = (m * m**h) / (m - 1) 169 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1) 170 * and since m divides BLIST_BMAP_RADIX, we can simplify further to 171 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1) 172 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1)) 173 * so that simple integer division by a constant can safely be used for the 174 * calculation. 175 */ 176 static inline daddr_t 177 radix_to_skip(daddr_t radix) 178 { 179 180 return (radix / 181 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK)); 182 } 183 184 /* 185 * Provide a mask with count bits set, starting as position n. 186 */ 187 static inline u_daddr_t 188 bitrange(int n, int count) 189 { 190 191 return (((u_daddr_t)-1 << n) & 192 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count)))); 193 } 194 195 /* 196 * Find the first bit set in a u_daddr_t. 197 */ 198 static inline int 199 generic_bitpos(u_daddr_t mask) 200 { 201 int hi, lo, mid; 202 203 lo = 0; 204 hi = BLIST_BMAP_RADIX; 205 while (lo + 1 < hi) { 206 mid = (lo + hi) >> 1; 207 if (mask & bitrange(0, mid)) 208 hi = mid; 209 else 210 lo = mid; 211 } 212 return (lo); 213 } 214 215 static inline int 216 bitpos(u_daddr_t mask) 217 { 218 219 switch (sizeof(mask)) { 220 #ifdef HAVE_INLINE_FFSLL 221 case sizeof(long long): 222 return (ffsll(mask) - 1); 223 #endif 224 #ifdef HAVE_INLINE_FFS 225 case sizeof(int): 226 return (ffs(mask) - 1); 227 #endif 228 default: 229 return (generic_bitpos(mask)); 230 } 231 } 232 233 /* 234 * blist_create() - create a blist capable of handling up to the specified 235 * number of blocks 236 * 237 * blocks - must be greater than 0 238 * flags - malloc flags 239 * 240 * The smallest blist consists of a single leaf node capable of 241 * managing BLIST_BMAP_RADIX blocks. 242 */ 243 blist_t 244 blist_create(daddr_t blocks, int flags) 245 { 246 blist_t bl; 247 u_daddr_t nodes, radix; 248 249 KASSERT(blocks > 0, ("invalid block count")); 250 251 /* 252 * Calculate the radix and node count used for scanning. 253 */ 254 nodes = 1; 255 radix = BLIST_BMAP_RADIX; 256 while (radix <= blocks) { 257 nodes += 1 + (blocks - 1) / radix; 258 radix *= BLIST_META_RADIX; 259 } 260 261 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags | 262 M_ZERO); 263 if (bl == NULL) 264 return (NULL); 265 266 bl->bl_blocks = blocks; 267 bl->bl_radix = radix; 268 269 #if defined(BLIST_DEBUG) 270 printf( 271 "BLIST representing %lld blocks (%lld MB of swap)" 272 ", requiring %lldK of ram\n", 273 (long long)bl->bl_blocks, 274 (long long)bl->bl_blocks * 4 / 1024, 275 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 276 ); 277 printf("BLIST raw radix tree contains %lld records\n", 278 (long long)nodes); 279 #endif 280 281 return (bl); 282 } 283 284 void 285 blist_destroy(blist_t bl) 286 { 287 288 free(bl, M_SWAP); 289 } 290 291 /* 292 * blist_alloc() - reserve space in the block bitmap. Return the base 293 * of a contiguous region or SWAPBLK_NONE if space could 294 * not be allocated. 295 */ 296 daddr_t 297 blist_alloc(blist_t bl, int *count, int maxcount) 298 { 299 daddr_t blk, cursor; 300 301 KASSERT(*count <= maxcount, 302 ("invalid parameters %d > %d", *count, maxcount)); 303 KASSERT(*count <= BLIST_MAX_ALLOC, 304 ("minimum allocation too large: %d", *count)); 305 306 /* 307 * This loop iterates at most twice. An allocation failure in the 308 * first iteration leads to a second iteration only if the cursor was 309 * non-zero. When the cursor is zero, an allocation failure will 310 * stop further iterations. 311 */ 312 for (cursor = bl->bl_cursor;; cursor = 0) { 313 blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount, 314 bl->bl_radix); 315 if (blk != SWAPBLK_NONE) { 316 bl->bl_avail -= *count; 317 bl->bl_cursor = blk + *count; 318 if (bl->bl_cursor == bl->bl_blocks) 319 bl->bl_cursor = 0; 320 return (blk); 321 } 322 if (cursor == 0) 323 return (SWAPBLK_NONE); 324 } 325 } 326 327 /* 328 * blist_avail() - return the number of free blocks. 329 */ 330 daddr_t 331 blist_avail(blist_t bl) 332 { 333 334 return (bl->bl_avail); 335 } 336 337 /* 338 * blist_free() - free up space in the block bitmap. Return the base 339 * of a contiguous region. 340 */ 341 void 342 blist_free(blist_t bl, daddr_t blkno, daddr_t count) 343 { 344 345 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 346 ("freeing invalid range: blkno %jx, count %d, blocks %jd", 347 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 348 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); 349 bl->bl_avail += count; 350 } 351 352 /* 353 * blist_fill() - mark a region in the block bitmap as off-limits 354 * to the allocator (i.e. allocate it), ignoring any 355 * existing allocations. Return the number of blocks 356 * actually filled that were free before the call. 357 */ 358 daddr_t 359 blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 360 { 361 daddr_t filled; 362 363 KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, 364 ("filling invalid range: blkno %jx, count %d, blocks %jd", 365 (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); 366 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); 367 bl->bl_avail -= filled; 368 return (filled); 369 } 370 371 /* 372 * blist_resize() - resize an existing radix tree to handle the 373 * specified number of blocks. This will reallocate 374 * the tree and transfer the previous bitmap to the new 375 * one. When extending the tree you can specify whether 376 * the new blocks are to left allocated or freed. 377 */ 378 void 379 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 380 { 381 blist_t newbl = blist_create(count, flags); 382 blist_t save = *pbl; 383 384 *pbl = newbl; 385 if (count > save->bl_blocks) 386 count = save->bl_blocks; 387 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 388 389 /* 390 * If resizing upwards, should we free the new space or not? 391 */ 392 if (freenew && count < newbl->bl_blocks) { 393 blist_free(newbl, count, newbl->bl_blocks - count); 394 } 395 blist_destroy(save); 396 } 397 398 #ifdef BLIST_DEBUG 399 400 /* 401 * blist_print() - dump radix tree 402 */ 403 void 404 blist_print(blist_t bl) 405 { 406 printf("BLIST avail = %jd, cursor = %08jx {\n", 407 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); 408 409 if (bl->bl_root->bm_bitmap != 0) 410 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 411 printf("}\n"); 412 } 413 414 #endif 415 416 static const u_daddr_t fib[] = { 417 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 418 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 419 514229, 832040, 1346269, 2178309, 3524578, 420 }; 421 422 /* 423 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 424 */ 425 struct gap_stats { 426 daddr_t start; /* current gap start, or SWAPBLK_NONE */ 427 daddr_t num; /* number of gaps observed */ 428 daddr_t max; /* largest gap size */ 429 daddr_t avg; /* average gap size */ 430 daddr_t err; /* sum - num * avg */ 431 daddr_t histo[nitems(fib)]; /* # gaps in each size range */ 432 int max_bucket; /* last histo elt with nonzero val */ 433 }; 434 435 /* 436 * gap_stats_counting() - is the state 'counting 1 bits'? 437 * or 'skipping 0 bits'? 438 */ 439 static inline bool 440 gap_stats_counting(const struct gap_stats *stats) 441 { 442 443 return (stats->start != SWAPBLK_NONE); 444 } 445 446 /* 447 * init_gap_stats() - initialize stats on gap sizes 448 */ 449 static inline void 450 init_gap_stats(struct gap_stats *stats) 451 { 452 453 bzero(stats, sizeof(*stats)); 454 stats->start = SWAPBLK_NONE; 455 } 456 457 /* 458 * update_gap_stats() - update stats on gap sizes 459 */ 460 static void 461 update_gap_stats(struct gap_stats *stats, daddr_t posn) 462 { 463 daddr_t size; 464 int hi, lo, mid; 465 466 if (!gap_stats_counting(stats)) { 467 stats->start = posn; 468 return; 469 } 470 size = posn - stats->start; 471 stats->start = SWAPBLK_NONE; 472 if (size > stats->max) 473 stats->max = size; 474 475 /* 476 * Find the fibonacci range that contains size, 477 * expecting to find it in an early range. 478 */ 479 lo = 0; 480 hi = 1; 481 while (hi < nitems(fib) && fib[hi] <= size) { 482 lo = hi; 483 hi *= 2; 484 } 485 if (hi >= nitems(fib)) 486 hi = nitems(fib); 487 while (lo + 1 != hi) { 488 mid = (lo + hi) >> 1; 489 if (fib[mid] <= size) 490 lo = mid; 491 else 492 hi = mid; 493 } 494 stats->histo[lo]++; 495 if (lo > stats->max_bucket) 496 stats->max_bucket = lo; 497 stats->err += size - stats->avg; 498 stats->num++; 499 stats->avg += stats->err / stats->num; 500 stats->err %= stats->num; 501 } 502 503 /* 504 * dump_gap_stats() - print stats on gap sizes 505 */ 506 static inline void 507 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) 508 { 509 int i; 510 511 sbuf_printf(s, "number of maximal free ranges: %jd\n", 512 (intmax_t)stats->num); 513 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); 514 sbuf_printf(s, "average maximal free range size: %jd\n", 515 (intmax_t)stats->avg); 516 sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); 517 sbuf_printf(s, " count | size range\n"); 518 sbuf_printf(s, " ----- | ----------\n"); 519 for (i = 0; i < stats->max_bucket; i++) { 520 if (stats->histo[i] != 0) { 521 sbuf_printf(s, "%20jd | ", 522 (intmax_t)stats->histo[i]); 523 if (fib[i] != fib[i + 1] - 1) 524 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 525 (intmax_t)fib[i + 1] - 1); 526 else 527 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); 528 } 529 } 530 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); 531 if (stats->histo[i] > 1) 532 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 533 (intmax_t)stats->max); 534 else 535 sbuf_printf(s, "%jd\n", (intmax_t)stats->max); 536 } 537 538 /* 539 * blist_stats() - dump radix tree stats 540 */ 541 void 542 blist_stats(blist_t bl, struct sbuf *s) 543 { 544 struct gap_stats gstats; 545 struct gap_stats *stats = &gstats; 546 daddr_t i, nodes, radix; 547 u_daddr_t diff, mask; 548 int digit; 549 550 init_gap_stats(stats); 551 nodes = 0; 552 i = bl->bl_radix; 553 while (i < bl->bl_radix + bl->bl_blocks) { 554 /* 555 * Find max size subtree starting at i. 556 */ 557 radix = BLIST_BMAP_RADIX; 558 while (((i / radix) & BLIST_META_MASK) == 0) 559 radix *= BLIST_META_RADIX; 560 561 /* 562 * Check for skippable subtrees starting at i. 563 */ 564 while (radix > BLIST_BMAP_RADIX) { 565 if (bl->bl_root[nodes].bm_bitmap == 0) { 566 if (gap_stats_counting(stats)) 567 update_gap_stats(stats, i); 568 break; 569 } 570 571 /* 572 * Skip subtree root. 573 */ 574 nodes++; 575 radix /= BLIST_META_RADIX; 576 } 577 if (radix == BLIST_BMAP_RADIX) { 578 /* 579 * Scan leaf. 580 */ 581 mask = bl->bl_root[nodes].bm_bitmap; 582 diff = mask ^ (mask << 1); 583 if (gap_stats_counting(stats)) 584 diff ^= 1; 585 while (diff != 0) { 586 digit = bitpos(diff); 587 update_gap_stats(stats, i + digit); 588 diff ^= bitrange(digit, 1); 589 } 590 } 591 nodes += radix_to_skip(radix); 592 i += radix; 593 } 594 update_gap_stats(stats, i); 595 dump_gap_stats(stats, s); 596 } 597 598 /************************************************************************ 599 * ALLOCATION SUPPORT FUNCTIONS * 600 ************************************************************************ 601 * 602 * These support functions do all the actual work. They may seem 603 * rather longish, but that's because I've commented them up. The 604 * actual code is straight forward. 605 * 606 */ 607 608 /* 609 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf. 610 * 611 * 'scan' is a leaf node, and its first block is at address 'start'. The 612 * next leaf node could be adjacent, or several nodes away if the least 613 * common ancestor of 'scan' and its neighbor is several levels up. Use 614 * addresses to determine how many meta-nodes lie between the leaves. If 615 * sequence of leaves starting with the next one has enough initial bits 616 * set, clear them and clear the bits in the meta nodes on the path up to 617 * the least common ancestor to mark any subtrees made completely empty. 618 */ 619 static int 620 blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount) 621 { 622 u_daddr_t radix; 623 daddr_t blk; 624 int avail, digit; 625 626 start += BLIST_BMAP_RADIX; 627 for (blk = start; blk - start < maxcount; blk += BLIST_BMAP_RADIX) { 628 /* Skip meta-nodes, as long as they promise more free blocks. */ 629 radix = BLIST_BMAP_RADIX; 630 while (((++scan)->bm_bitmap & 1) == 1 && 631 ((blk / radix) & BLIST_META_MASK) == 0) 632 radix *= BLIST_META_RADIX; 633 if (~scan->bm_bitmap != 0) { 634 /* 635 * Either there is no next leaf with any free blocks, 636 * or we've reached the next leaf and found that some 637 * of its blocks are not free. In the first case, 638 * bitpos() returns zero here. 639 */ 640 avail = blk - start + bitpos(~scan->bm_bitmap); 641 if (avail < count || avail == 0) { 642 /* 643 * There isn't a next leaf with enough free 644 * blocks at its beginning to bother 645 * allocating. 646 */ 647 return (avail); 648 } 649 maxcount = imin(avail, maxcount); 650 if (maxcount % BLIST_BMAP_RADIX == 0) { 651 /* 652 * There was no next leaf. Back scan up to 653 * last leaf. 654 */ 655 --scan; 656 while (radix != BLIST_BMAP_RADIX) { 657 radix /= BLIST_META_RADIX; 658 --scan; 659 } 660 blk -= BLIST_BMAP_RADIX; 661 } 662 } 663 } 664 665 /* 666 * 'scan' is the last leaf that provides blocks. Clear from 1 to 667 * BLIST_BMAP_RADIX bits to represent the allocation of those last 668 * blocks. 669 */ 670 if (maxcount % BLIST_BMAP_RADIX != 0) 671 scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_BMAP_RADIX); 672 else 673 scan->bm_bitmap = 0; 674 675 for (;;) { 676 /* Back up over meta-nodes, clearing bits if necessary. */ 677 blk -= BLIST_BMAP_RADIX; 678 radix = BLIST_BMAP_RADIX; 679 while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0) { 680 if ((scan--)->bm_bitmap == 0) 681 scan->bm_bitmap ^= 1; 682 radix *= BLIST_META_RADIX; 683 } 684 if ((scan--)->bm_bitmap == 0) 685 scan[-digit * radix_to_skip(radix)].bm_bitmap ^= 686 (u_daddr_t)1 << digit; 687 688 if (blk == start) 689 break; 690 /* Clear all the bits of this leaf. */ 691 scan->bm_bitmap = 0; 692 } 693 return (maxcount); 694 } 695 696 /* 697 * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap). 698 * 699 * This function is the core of the allocator. Its execution time is 700 * proportional to log(count), plus height of the tree if the allocation 701 * crosses a leaf boundary. 702 */ 703 static daddr_t 704 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount) 705 { 706 u_daddr_t mask; 707 int bighint, count1, hi, lo, num_shifts; 708 709 count1 = *count - 1; 710 num_shifts = fls(count1); 711 mask = ~scan->bm_bitmap; 712 while ((mask & (mask + 1)) != 0 && num_shifts > 0) { 713 /* 714 * If bit i is 0 in mask, then bits in [i, i + (count1 >> 715 * num_shifts)] are 1 in scan->bm_bitmap. Reduce num_shifts to 716 * 0, while preserving this invariant. The updates to mask 717 * leave fewer bits 0, but each bit that remains 0 represents a 718 * longer string of consecutive 1-bits in scan->bm_bitmap. If 719 * more updates to mask cannot set more bits, because mask is 720 * partitioned with all 1 bits following all 0 bits, the loop 721 * terminates immediately. 722 */ 723 num_shifts--; 724 mask |= mask >> ((count1 >> num_shifts) + 1) / 2; 725 } 726 bighint = count1 >> num_shifts; 727 if (~mask == 0) { 728 /* 729 * Update bighint. There is no allocation bigger than 730 * count1 >> num_shifts starting in this leaf. 731 */ 732 scan->bm_bighint = bighint; 733 return (SWAPBLK_NONE); 734 } 735 736 /* Discard any candidates that appear before blk. */ 737 if ((blk & BLIST_BMAP_MASK) != 0) { 738 if ((~mask & bitrange(0, blk & BLIST_BMAP_MASK)) != 0) { 739 /* Grow bighint in case all discarded bits are set. */ 740 bighint += blk & BLIST_BMAP_MASK; 741 mask |= bitrange(0, blk & BLIST_BMAP_MASK); 742 if (~mask == 0) { 743 scan->bm_bighint = bighint; 744 return (SWAPBLK_NONE); 745 } 746 } 747 blk -= blk & BLIST_BMAP_MASK; 748 } 749 750 /* 751 * The least significant set bit in mask marks the start of the first 752 * available range of sufficient size. Find its position. 753 */ 754 lo = bitpos(~mask); 755 756 /* 757 * Find how much space is available starting at that position. 758 */ 759 if ((mask & (mask + 1)) != 0) { 760 /* Count the 1 bits starting at position lo. */ 761 hi = bitpos(mask & (mask + 1)) + count1; 762 if (maxcount < hi - lo) 763 hi = lo + maxcount; 764 *count = hi - lo; 765 mask = ~bitrange(lo, *count); 766 } else if (maxcount <= BLIST_BMAP_RADIX - lo) { 767 /* All the blocks we can use are available here. */ 768 hi = lo + maxcount; 769 *count = maxcount; 770 mask = ~bitrange(lo, *count); 771 if (hi == BLIST_BMAP_RADIX) 772 scan->bm_bighint = bighint; 773 } else { 774 /* Check next leaf for some of the blocks we want or need. */ 775 count1 = *count - (BLIST_BMAP_RADIX - lo); 776 maxcount -= BLIST_BMAP_RADIX - lo; 777 hi = blst_next_leaf_alloc(scan, blk, count1, maxcount); 778 if (hi < count1) 779 /* 780 * The next leaf cannot supply enough blocks to reach 781 * the minimum required allocation. The hint cannot be 782 * updated, because the same allocation request could 783 * be satisfied later, by this leaf, if the state of 784 * the next leaf changes, and without any changes to 785 * this leaf. 786 */ 787 return (SWAPBLK_NONE); 788 *count = BLIST_BMAP_RADIX - lo + hi; 789 scan->bm_bighint = bighint; 790 } 791 792 /* Clear the allocated bits from this leaf. */ 793 scan->bm_bitmap &= mask; 794 return (blk + lo); 795 } 796 797 /* 798 * blist_meta_alloc() - allocate at a meta in the radix tree. 799 * 800 * Attempt to allocate at a meta node. If we can't, we update 801 * bighint and return a failure. Updating bighint optimize future 802 * calls that hit this node. We have to check for our collapse cases 803 * and we have a few optimizations strewn in as well. 804 */ 805 static daddr_t 806 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, 807 int maxcount, u_daddr_t radix) 808 { 809 daddr_t blk, i, r, skip; 810 u_daddr_t mask; 811 bool scan_from_start; 812 int digit; 813 814 if (radix == BLIST_BMAP_RADIX) 815 return (blst_leaf_alloc(scan, cursor, count, maxcount)); 816 blk = cursor & -radix; 817 scan_from_start = (cursor == blk); 818 radix /= BLIST_META_RADIX; 819 skip = radix_to_skip(radix); 820 mask = scan->bm_bitmap; 821 822 /* Discard any candidates that appear before cursor. */ 823 digit = (cursor / radix) & BLIST_META_MASK; 824 mask &= (u_daddr_t)-1 << digit; 825 if (mask == 0) 826 return (SWAPBLK_NONE); 827 828 /* 829 * If the first try is for a block that includes the cursor, pre-undo 830 * the digit * radix offset in the first call; otherwise, ignore the 831 * cursor entirely. 832 */ 833 if (((mask >> digit) & 1) == 1) 834 cursor -= digit * radix; 835 else 836 cursor = blk; 837 838 /* 839 * Examine the nonempty subtree associated with each bit set in mask. 840 */ 841 do { 842 digit = bitpos(mask); 843 i = 1 + digit * skip; 844 if (*count <= scan[i].bm_bighint) { 845 /* 846 * The allocation might fit beginning in the i'th subtree. 847 */ 848 r = blst_meta_alloc(&scan[i], cursor + digit * radix, 849 count, maxcount, radix); 850 if (r != SWAPBLK_NONE) { 851 if (scan[i].bm_bitmap == 0) 852 scan->bm_bitmap ^= bitrange(digit, 1); 853 return (r); 854 } 855 } 856 cursor = blk; 857 } while ((mask ^= bitrange(digit, 1)) != 0); 858 859 /* 860 * We couldn't allocate count in this subtree. If the whole tree was 861 * scanned, and the last tree node is allocated, update bighint. 862 */ 863 if (scan_from_start && !(digit == BLIST_META_RADIX - 1 && 864 scan[i].bm_bighint == BLIST_MAX_ALLOC)) 865 scan->bm_bighint = *count - 1; 866 867 return (SWAPBLK_NONE); 868 } 869 870 /* 871 * BLST_LEAF_FREE() - free allocated block from leaf bitmap 872 * 873 */ 874 static void 875 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) 876 { 877 u_daddr_t mask; 878 879 /* 880 * free some data in this bitmap 881 * mask=0000111111111110000 882 * \_________/\__/ 883 * count n 884 */ 885 mask = bitrange(blk & BLIST_BMAP_MASK, count); 886 KASSERT((scan->bm_bitmap & mask) == 0, 887 ("freeing free block: %jx, size %d, mask %jx", 888 (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); 889 scan->bm_bitmap |= mask; 890 } 891 892 /* 893 * BLST_META_FREE() - free allocated blocks from radix tree meta info 894 * 895 * This support routine frees a range of blocks from the bitmap. 896 * The range must be entirely enclosed by this radix node. If a 897 * meta node, we break the range down recursively to free blocks 898 * in subnodes (which means that this code can free an arbitrary 899 * range whereas the allocation code cannot allocate an arbitrary 900 * range). 901 */ 902 static void 903 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) 904 { 905 daddr_t blk, endBlk, i, skip; 906 int digit, endDigit; 907 908 /* 909 * We could probably do a better job here. We are required to make 910 * bighint at least as large as the biggest allocable block of data. 911 * If we just shoehorn it, a little extra overhead will be incurred 912 * on the next allocation (but only that one typically). 913 */ 914 scan->bm_bighint = BLIST_MAX_ALLOC; 915 916 if (radix == BLIST_BMAP_RADIX) 917 return (blst_leaf_free(scan, freeBlk, count)); 918 919 endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix); 920 radix /= BLIST_META_RADIX; 921 skip = radix_to_skip(radix); 922 blk = freeBlk & -radix; 923 digit = (blk / radix) & BLIST_META_MASK; 924 endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK); 925 scan->bm_bitmap |= bitrange(digit, endDigit - digit); 926 for (i = 1 + digit * skip; blk < endBlk; i += skip) { 927 blk += radix; 928 count = ummin(blk, endBlk) - freeBlk; 929 blst_meta_free(&scan[i], freeBlk, count, radix); 930 freeBlk = blk; 931 } 932 } 933 934 /* 935 * BLST_COPY() - copy one radix tree to another 936 * 937 * Locates free space in the source tree and frees it in the destination 938 * tree. The space may not already be free in the destination. 939 */ 940 static void 941 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, 942 daddr_t count) 943 { 944 daddr_t endBlk, i, skip; 945 946 /* 947 * Leaf node 948 */ 949 950 if (radix == BLIST_BMAP_RADIX) { 951 u_daddr_t v = scan->bm_bitmap; 952 953 if (v == (u_daddr_t)-1) { 954 blist_free(dest, blk, count); 955 } else if (v != 0) { 956 int i; 957 958 for (i = 0; i < count; ++i) { 959 if (v & ((u_daddr_t)1 << i)) 960 blist_free(dest, blk + i, 1); 961 } 962 } 963 return; 964 } 965 966 /* 967 * Meta node 968 */ 969 970 if (scan->bm_bitmap == 0) { 971 /* 972 * Source all allocated, leave dest allocated 973 */ 974 return; 975 } 976 977 endBlk = blk + count; 978 radix /= BLIST_META_RADIX; 979 skip = radix_to_skip(radix); 980 for (i = 1; blk < endBlk; i += skip) { 981 blk += radix; 982 count = radix; 983 if (blk >= endBlk) 984 count -= blk - endBlk; 985 blst_copy(&scan[i], blk - radix, radix, dest, count); 986 } 987 } 988 989 /* 990 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap 991 * 992 * This routine allocates all blocks in the specified range 993 * regardless of any existing allocations in that range. Returns 994 * the number of blocks allocated by the call. 995 */ 996 static daddr_t 997 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) 998 { 999 daddr_t nblks; 1000 u_daddr_t mask; 1001 1002 mask = bitrange(blk & BLIST_BMAP_MASK, count); 1003 1004 /* Count the number of blocks that we are allocating. */ 1005 nblks = bitcount64(scan->bm_bitmap & mask); 1006 1007 scan->bm_bitmap &= ~mask; 1008 return (nblks); 1009 } 1010 1011 /* 1012 * BLIST_META_FILL() - allocate specific blocks at a meta node 1013 * 1014 * This routine allocates the specified range of blocks, 1015 * regardless of any existing allocations in the range. The 1016 * range must be within the extent of this node. Returns the 1017 * number of blocks allocated by the call. 1018 */ 1019 static daddr_t 1020 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix) 1021 { 1022 daddr_t blk, endBlk, i, nblks, skip; 1023 int digit; 1024 1025 if (radix == BLIST_BMAP_RADIX) 1026 return (blst_leaf_fill(scan, allocBlk, count)); 1027 1028 endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix); 1029 radix /= BLIST_META_RADIX; 1030 skip = radix_to_skip(radix); 1031 blk = allocBlk & -radix; 1032 nblks = 0; 1033 while (blk < endBlk) { 1034 digit = (blk / radix) & BLIST_META_MASK; 1035 i = 1 + digit * skip; 1036 blk += radix; 1037 count = ummin(blk, endBlk) - allocBlk; 1038 nblks += blst_meta_fill(&scan[i], allocBlk, count, radix); 1039 if (scan[i].bm_bitmap == 0) 1040 scan->bm_bitmap &= ~((u_daddr_t)1 << digit); 1041 allocBlk = blk; 1042 } 1043 return (nblks); 1044 } 1045 1046 #ifdef BLIST_DEBUG 1047 1048 static void 1049 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) 1050 { 1051 daddr_t skip; 1052 u_daddr_t mask; 1053 int digit; 1054 1055 if (radix == BLIST_BMAP_RADIX) { 1056 printf( 1057 "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n", 1058 tab, tab, "", 1059 (long long)blk, (long long)radix, 1060 1 + (BLIST_BMAP_RADIX - 1) / 4, 1061 (long long)scan->bm_bitmap, 1062 (long long)scan->bm_bighint 1063 ); 1064 return; 1065 } 1066 1067 printf( 1068 "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n", 1069 tab, tab, "", 1070 (long long)blk, (long long)radix, 1071 (long long)radix, 1072 1 + (BLIST_META_RADIX - 1) / 4, 1073 (long long)scan->bm_bitmap, 1074 (long long)scan->bm_bighint 1075 ); 1076 1077 radix /= BLIST_META_RADIX; 1078 skip = radix_to_skip(radix); 1079 tab += 4; 1080 1081 mask = scan->bm_bitmap; 1082 /* Examine the nonempty subtree associated with each bit set in mask */ 1083 do { 1084 digit = bitpos(mask); 1085 blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, 1086 radix, tab); 1087 } while ((mask ^= bitrange(digit, 1)) != 0); 1088 tab -= 4; 1089 1090 printf( 1091 "%*.*s}\n", 1092 tab, tab, "" 1093 ); 1094 } 1095 1096 #endif 1097 1098 #ifdef BLIST_DEBUG 1099 1100 int 1101 main(int ac, char **av) 1102 { 1103 int size = BLIST_META_RADIX * BLIST_BMAP_RADIX; 1104 int i; 1105 blist_t bl; 1106 struct sbuf *s; 1107 1108 for (i = 1; i < ac; ++i) { 1109 const char *ptr = av[i]; 1110 if (*ptr != '-') { 1111 size = strtol(ptr, NULL, 0); 1112 continue; 1113 } 1114 ptr += 2; 1115 fprintf(stderr, "Bad option: %s\n", ptr - 2); 1116 exit(1); 1117 } 1118 bl = blist_create(size, M_WAITOK); 1119 blist_free(bl, 0, size); 1120 1121 for (;;) { 1122 char buf[1024]; 1123 long long da = 0; 1124 int count = 0, maxcount = 0; 1125 1126 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), 1127 (long long)size, (long long)bl->bl_radix); 1128 fflush(stdout); 1129 if (fgets(buf, sizeof(buf), stdin) == NULL) 1130 break; 1131 switch(buf[0]) { 1132 case 'r': 1133 if (sscanf(buf + 1, "%d", &count) == 1) { 1134 blist_resize(&bl, count, 1, M_WAITOK); 1135 } else { 1136 printf("?\n"); 1137 } 1138 case 'p': 1139 blist_print(bl); 1140 break; 1141 case 's': 1142 s = sbuf_new_auto(); 1143 blist_stats(bl, s); 1144 sbuf_finish(s); 1145 printf("%s", sbuf_data(s)); 1146 sbuf_delete(s); 1147 break; 1148 case 'a': 1149 if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) { 1150 daddr_t blk = blist_alloc(bl, &count, maxcount); 1151 printf(" R=%08llx, c=%08d\n", 1152 (long long)blk, count); 1153 } else { 1154 printf("?\n"); 1155 } 1156 break; 1157 case 'f': 1158 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { 1159 blist_free(bl, da, count); 1160 } else { 1161 printf("?\n"); 1162 } 1163 break; 1164 case 'l': 1165 if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { 1166 printf(" n=%jd\n", 1167 (intmax_t)blist_fill(bl, da, count)); 1168 } else { 1169 printf("?\n"); 1170 } 1171 break; 1172 case '?': 1173 case 'h': 1174 puts( 1175 "p -print\n" 1176 "s -stats\n" 1177 "a %d %d -allocate\n" 1178 "f %x %d -free\n" 1179 "l %x %d -fill\n" 1180 "r %d -resize\n" 1181 "h/? -help\n" 1182 "q -quit" 1183 ); 1184 break; 1185 case 'q': 1186 break; 1187 default: 1188 printf("?\n"); 1189 break; 1190 } 1191 if (buf[0] == 'q') 1192 break; 1193 } 1194 return (0); 1195 } 1196 1197 #endif 1198