/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in
 *	the leaf nodes (BLIST_BMAP_RADIX), and one for the number of
 *	descendants of each of the meta (interior) nodes (BLIST_META_RADIX).
 *	Each subtree is associated with a range of blocks.  The root of any
 *	subtree stores a hint field that defines an upper bound on the size
 *	of the largest allocation that can begin in the associated block
 *	range.  A hint is an upper bound on a potential allocation, but not
 *	necessarily a tight upper bound.
 *
 *	The bitmap field in each node directs the search for available
 *	blocks.  For a leaf node, a bit is set if the corresponding block is
 *	free.  For a meta node, a bit is set if the corresponding subtree
 *	contains a free block somewhere within it.  The search at a meta node
 *	considers only children of that node that represent a range that
 *	includes a free block.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.
 *	The radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking nature of allocations and frees is required by swap
 *	code (vm/swap_pager.c).
 *
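 *	As a concrete illustration (the exact constants live in sys/blist.h;
 *	the common configuration of a 64-bit u_daddr_t, so BLIST_BMAP_RADIX =
 *	64 with BLIST_META_RADIX = 16, is assumed here only for the numbers):
 *	one leaf covers 64 blocks, a meta node over 16 leaves covers
 *	64 * 16 = 1024 blocks, and each further meta level multiplies the
 *	coverage by 16.
 *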
 * LAYOUT: The radix tree is laid out recursively using a linear array.
 *	Each meta node is immediately followed (laid out sequentially in
 *	memory) by BLIST_META_RADIX lower-level nodes.  This is a recursive
 *	structure, but one that can be easily scanned through a very simple
 *	'skip' calculation.  The memory allocation is only large enough to
 *	cover the number of blocks requested at creation time.  Nodes that
 *	represent blocks beyond that limit, nodes that would never be read
 *	or written, are not allocated, so that the last of the
 *	BLIST_META_RADIX lower-level nodes of some nodes may not be
 *	allocated.
 *
 * NOTE: The allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define	malloc(a,b,c)	calloc(a, 1)
#define	free(a,b)	free(a)
#define	ummin(a,b)	((a) < (b) ? (a) : (b))
#define	imin(a,b)	((a) < (b) ? (a) : (b))
#define	KASSERT(a,b)	assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
		    int *count, int maxcount);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
		    int maxcount, u_daddr_t radix);
static void	blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void	blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void	blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t	blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
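
/*
 * Worked example of the skip computation (BLIST_BMAP_RADIX = 64 and
 * BLIST_META_RADIX = 16 assumed for the numbers): a subtree of radix
 * 64 * 16 = 1024 consists of 1 meta node plus 16 leaves, 17 nodes in all,
 * and indeed radix_to_skip(1024) = 1024 / ((64 / 16) * 15) = 17.
 */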

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

	return (((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
}

/*
 * Find the position of the first (lowest) bit set in a u_daddr_t.
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	lo = 0;
	hi = BLIST_BMAP_RADIX;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if (mask & bitrange(0, mid))
			hi = mid;
		else
			lo = mid;
	}
	return (lo);
}

static inline int
bitpos(u_daddr_t mask)
{

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
	case sizeof(int):
		return (ffs(mask) - 1);
#endif
	default:
		return (generic_bitpos(mask));
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	u_daddr_t nodes, radix;

	KASSERT(blocks > 0, ("invalid block count"));

	/*
	 * Calculate the radix and node count used for scanning.
	 */
	nodes = 1;
	radix = BLIST_BMAP_RADIX;
	while (radix <= blocks) {
		nodes += 1 + (blocks - 1) / radix;
		radix *= BLIST_META_RADIX;
	}

	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	    M_ZERO);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{

	free(bl, M_SWAP);
}
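
/*
 * Example usage (a sketch, modeled on the stand-alone debug main() at the
 * bottom of this file): a freshly created blist starts with every block
 * allocated, because M_ZERO leaves all bitmaps clear, so the creator frees
 * the whole range before use.
 *
 *	blist_t bl = blist_create(1024, M_WAITOK);
 *	blist_free(bl, 0, 1024);		   (mark all blocks free)
 *	int count = 16;
 *	daddr_t blk = blist_alloc(bl, &count, 32); (want 16 to 32 blocks)
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, count);
 *	blist_destroy(bl);
 */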

/*
 * blist_alloc() -	reserve space in the block bitmap.  Return the base
 *			of a contiguous region or SWAPBLK_NONE if space could
 *			not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, int *count, int maxcount)
{
	daddr_t blk, cursor;

	KASSERT(*count <= maxcount,
	    ("invalid parameters %d > %d", *count, maxcount));
	KASSERT(*count <= BLIST_MAX_ALLOC,
	    ("minimum allocation too large: %d", *count));

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * stop further iterations.
	 */
	for (cursor = bl->bl_cursor;; cursor = 0) {
		blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_avail -= *count;
			bl->bl_cursor = blk + *count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		}
		if (cursor == 0)
			return (SWAPBLK_NONE);
	}
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	return (bl->bl_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  The specified
 *			range must be entirely allocated.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("freeing invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail += count;
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	daddr_t filled;

	KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
	    ("filling invalid range: blkno %jx, count %d, blocks %jd",
	    (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
	filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
	bl->bl_avail -= filled;
	return (filled);
}
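
/*
 * For example, if blocks [10, 20) are free and blocks [20, 30) are already
 * allocated, then blist_fill(bl, 10, 20) marks all of [10, 30) allocated
 * and returns 10, the number of blocks that were free before the call.
 */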

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST avail = %jd, cursor = %08jx {\n",
	    (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

	if (bl->bl_root->bm_bitmap != 0)
		blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597,
	2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418,
	317811, 514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits'?
 *			     or 'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}
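
/*
 * The histogram buckets gap sizes by Fibonacci ranges: bucket i counts
 * gaps of size s with fib[i] <= s < fib[i + 1].  For example, a gap of
 * 100 blocks lands in the bucket covering sizes 89 to 143.
 */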

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
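
/*
 * The avg/err pair above maintains an exact running average in integer
 * arithmetic: the invariant sum = num * avg + err is preserved across
 * updates (see the 'err' field comment).  For example, after gaps of
 * sizes 3 and 6 (sum 9, num 2), the updates leave avg = 4 and err = 1.
 */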

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t diff, mask;
	int digit;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].bm_bitmap == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
			mask = bl->bl_root[nodes].bm_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				digit = bitpos(diff);
				update_gap_stats(stats, i + digit);
				diff ^= bitrange(digit, 1);
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}
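
/*
 * In blist_stats() above, diff = mask ^ (mask << 1) marks the positions
 * where the bitmap switches between runs of 0s and 1s.  For example
 * (using 8-bit values for brevity), mask = 00111100 gives
 * diff = 01000100: transitions at bits 2 and 6, the two ends of the gap.
 */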

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
 *
 *	'scan' is a leaf node, and its first block is at address 'start'.  The
 *	next leaf node could be adjacent, or several nodes away if the least
 *	common ancestor of 'scan' and its neighbor is several levels up.  Use
 *	addresses to determine how many meta-nodes lie between the leaves.  If
 *	the sequence of leaves starting with the next one has enough initial
 *	bits set, clear them and clear the bits in the meta nodes on the path
 *	up to the least common ancestor to mark any subtrees made completely
 *	empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
{
	u_daddr_t radix;
	daddr_t blk;
	int avail, digit;

	start += BLIST_BMAP_RADIX;
	for (blk = start; blk - start < maxcount; blk += BLIST_BMAP_RADIX) {
		/* Skip meta-nodes, as long as they promise more free blocks. */
		radix = BLIST_BMAP_RADIX;
		while (((++scan)->bm_bitmap & 1) == 1 &&
		    ((blk / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;
		if (~scan->bm_bitmap != 0) {
			/*
			 * Either there is no next leaf with any free blocks,
			 * or we've reached the next leaf and found that some
			 * of its blocks are not free.  In the first case,
			 * bitpos() returns zero here.
			 */
			avail = blk - start + bitpos(~scan->bm_bitmap);
			if (avail < count || avail == 0) {
				/*
				 * There isn't a next leaf with enough free
				 * blocks at its beginning to bother
				 * allocating.
				 */
				return (avail);
			}
			maxcount = imin(avail, maxcount);
			if (maxcount % BLIST_BMAP_RADIX == 0) {
				/*
				 * There was no next leaf.  Back scan up to
				 * last leaf.
				 */
				--scan;
				while (radix != BLIST_BMAP_RADIX) {
					radix /= BLIST_META_RADIX;
					--scan;
				}
				blk -= BLIST_BMAP_RADIX;
			}
		}
	}

	/*
	 * 'scan' is the last leaf that provides blocks.  Clear from 1 to
	 * BLIST_BMAP_RADIX bits to represent the allocation of those last
	 * blocks.
	 */
	if (maxcount % BLIST_BMAP_RADIX != 0)
		scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_BMAP_RADIX);
	else
		scan->bm_bitmap = 0;

	for (;;) {
		/* Back up over meta-nodes, clearing bits if necessary. */
		blk -= BLIST_BMAP_RADIX;
		radix = BLIST_BMAP_RADIX;
		while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0) {
			if ((scan--)->bm_bitmap == 0)
				scan->bm_bitmap ^= 1;
			radix *= BLIST_META_RADIX;
		}
		if ((scan--)->bm_bitmap == 0)
			scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
			    (u_daddr_t)1 << digit;

		if (blk == start)
			break;
		/* Clear all the bits of this leaf. */
		scan->bm_bitmap = 0;
	}
	return (maxcount);
}
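
/*
 * For example (assuming BLIST_BMAP_RADIX = 64): an allocation that finds
 * free space starting at bit 60 of one leaf but needs 8 blocks takes bits
 * 60-63 from that leaf, and blst_leaf_alloc() below asks
 * blst_next_leaf_alloc() for the remaining 4 blocks from the start of the
 * following leaf.
 */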

/*
 * BLST_LEAF_ALLOC() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This function is the core of the allocator.  Its execution time is
 *	proportional to log(count), plus height of the tree if the allocation
 *	crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
{
	u_daddr_t mask;
	int bighint, count1, hi, lo, num_shifts;

	count1 = *count - 1;
	num_shifts = fls(count1);
	mask = ~scan->bm_bitmap;
	while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
		/*
		 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
		 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts
		 * to 0, while preserving this invariant.  The updates to
		 * mask leave fewer bits 0, but each bit that remains 0
		 * represents a longer string of consecutive 1-bits in
		 * scan->bm_bitmap.  If more updates to mask cannot set more
		 * bits, because mask is partitioned with all 1 bits
		 * following all 0 bits, the loop terminates immediately.
		 */
		num_shifts--;
		mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
	}
	bighint = count1 >> num_shifts;
	if (~mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than
		 * count1 >> num_shifts starting in this leaf.
		 */
		scan->bm_bighint = bighint;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	if ((blk & BLIST_BMAP_MASK) != 0) {
		if ((~mask & bitrange(0, blk & BLIST_BMAP_MASK)) != 0) {
			/* Grow bighint in case all discarded bits are set. */
			bighint += blk & BLIST_BMAP_MASK;
			mask |= bitrange(0, blk & BLIST_BMAP_MASK);
			if (~mask == 0) {
				scan->bm_bighint = bighint;
				return (SWAPBLK_NONE);
			}
		}
		blk -= blk & BLIST_BMAP_MASK;
	}

	/*
	 * The least significant clear bit in mask marks the start of the
	 * first available range of sufficient size.  Find its position.
	 */
	lo = bitpos(~mask);

	/*
	 * Find how much space is available starting at that position.
	 */
	if ((mask & (mask + 1)) != 0) {
		/* Count the 1 bits starting at position lo. */
		hi = bitpos(mask & (mask + 1)) + count1;
		if (maxcount < hi - lo)
			hi = lo + maxcount;
		*count = hi - lo;
		mask = ~bitrange(lo, *count);
	} else if (maxcount <= BLIST_BMAP_RADIX - lo) {
		/* All the blocks we can use are available here. */
		hi = lo + maxcount;
		*count = maxcount;
		mask = ~bitrange(lo, *count);
		if (hi == BLIST_BMAP_RADIX)
			scan->bm_bighint = bighint;
	} else {
		/* Check next leaf for some of the blocks we want or need. */
		count1 = *count - (BLIST_BMAP_RADIX - lo);
		maxcount -= BLIST_BMAP_RADIX - lo;
		hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
		if (hi < count1)
			/*
			 * The next leaf cannot supply enough blocks to reach
			 * the minimum required allocation.  The hint cannot
			 * be updated, because the same allocation request
			 * could be satisfied later, by this leaf, if the
			 * state of the next leaf changes, and without any
			 * changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		*count = BLIST_BMAP_RADIX - lo + hi;
		scan->bm_bighint = bighint;
	}

	/* Clear the allocated bits from this leaf. */
	scan->bm_bitmap &= mask;
	return (blk + lo);
}
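
/*
 * Worked example of the mask-folding loop in blst_leaf_alloc() above: for
 * *count = 4 (count1 = 3, num_shifts = fls(3) = 2) the loop performs
 * mask |= mask >> 1, then mask |= mask >> 2.  Afterwards, any 0 bit
 * remaining at position i certifies that bits [i, i + 3] of the original
 * bitmap are all free.
 */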

/*
 * BLST_META_ALLOC() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix)
{
	daddr_t blk, i, r, skip;
	u_daddr_t mask;
	bool scan_from_start;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count, maxcount));
	blk = cursor & -radix;
	scan_from_start = (cursor == blk);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	mask = scan->bm_bitmap;

	/* Discard any candidates that appear before cursor. */
	digit = (cursor / radix) & BLIST_META_MASK;
	mask &= (u_daddr_t)-1 << digit;
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * If the first try is for a block that includes the cursor, pre-undo
	 * the digit * radix offset in the first call; otherwise, ignore the
	 * cursor entirely.
	 */
	if (((mask >> digit) & 1) == 1)
		cursor -= digit * radix;
	else
		cursor = blk;

	/*
	 * Examine the nonempty subtree associated with each bit set in mask.
	 */
	do {
		digit = bitpos(mask);
		i = 1 + digit * skip;
		if (*count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th
			 * subtree.
			 */
			r = blst_meta_alloc(&scan[i], cursor + digit * radix,
			    count, maxcount, radix);
			if (r != SWAPBLK_NONE) {
				if (scan[i].bm_bitmap == 0)
					scan->bm_bitmap ^= bitrange(digit, 1);
				return (r);
			}
		}
		cursor = blk;
	} while ((mask ^= bitrange(digit, 1)) != 0);

	/*
	 * We couldn't allocate count in this subtree.  If the whole tree was
	 * scanned, and the last tree node is allocated, update bighint.
	 */
	if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
	    scan[i].bm_bighint == BLIST_MAX_ALLOC))
		scan->bm_bighint = *count - 1;

	return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	mask = bitrange(blk & BLIST_BMAP_MASK, count);
	KASSERT((scan->bm_bitmap & mask) == 0,
	    ("freeing free block: %jx, size %d, mask %jx",
	    (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
	scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, skip;
	int digit, endDigit;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest allocable block of data.
	 * If we just shoehorn it, a little extra overhead will be incurred
	 * on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_MAX_ALLOC;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));

	endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = freeBlk & -radix;
	digit = (blk / radix) & BLIST_META_MASK;
	endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
	scan->bm_bitmap |= bitrange(digit, endDigit - digit);
	for (i = 1 + digit * skip; blk < endBlk; i += skip) {
		blk += radix;
		count = ummin(blk, endBlk) - freeBlk;
		blst_meta_free(&scan[i], freeBlk, count, radix);
		freeBlk = blk;
	}
}
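
/*
 * For example (a meta node of radix 1024 with BLIST_META_RADIX = 16
 * assumed, so 64-block children): freeing blocks [100, 300) touches
 * children 1 through 4 (covering blocks 64-319).  blst_meta_free() sets
 * bits 1-4 in the parent's bitmap in one bitrange() step, then recurses
 * into each of those children with the portion of [100, 300) it covers.
 */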

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space must not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t endBlk, i, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->bm_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->bm_bitmap == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}

	endBlk = blk + count;
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	for (i = 1; blk < endBlk; i += skip) {
		blk += radix;
		count = radix;
		if (blk >= endBlk)
			count -= blk - endBlk;
		blst_copy(&scan[i], blk - radix, radix, dest, count);
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;

	mask = bitrange(blk & BLIST_BMAP_MASK, count);

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->bm_bitmap & mask);

	scan->bm_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, endBlk, i, nblks, skip;
	int digit;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));

	endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	blk = allocBlk & -radix;
	nblks = 0;
	while (blk < endBlk) {
		digit = (blk / radix) & BLIST_META_MASK;
		i = 1 + digit * skip;
		blk += radix;
		count = ummin(blk, endBlk) - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
		if (scan[i].bm_bitmap == 0)
			scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
		allocBlk = blk;
	}
	return (nblks);
}
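
/*
 * For example, filling a range that spans two children of a meta node
 * splits the work just as blst_meta_free() does: blst_meta_fill()
 * recurses into each child covering part of [allocBlk, allocBlk + count),
 * sums the per-leaf counts of bits that actually changed from free to
 * allocated, and clears the parent's bit for any child that becomes fully
 * allocated.
 */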

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t skip;
	u_daddr_t mask;
	int digit;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    1 + (BLIST_BMAP_RADIX - 1) / 4,
		    (long long)scan->bm_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	printf(
	    "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)radix,
	    1 + (BLIST_META_RADIX - 1) / 4,
	    (long long)scan->bm_bitmap,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	skip = radix_to_skip(radix);
	tab += 4;

	mask = scan->bm_bitmap;
	/* Examine the nonempty subtree associated with each bit set in mask */
	do {
		digit = bitpos(mask);
		blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
		    radix, tab);
	} while ((mask ^= bitrange(digit, 1)) != 0);
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		int count = 0, maxcount = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%d", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
				daddr_t blk = blist_alloc(bl, &count,
				    maxcount);
				printf("    R=%08llx, c=%08d\n",
				    (long long)blk, count);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d %d    -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help\n"
			    "q          -quit"
			);
			break;
		case 'q':
			break;
		default:
			printf("?\n");
			break;
		}
		if (buf[0] == 'q')
			break;
	}
	return (0);
}

#endif