/*-
 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
 *
 * This module implements a general bitmap allocator/deallocator. The
 * allocator eats around 2 bits per 'block'. The module does not
 * try to interpret the meaning of a 'block' other than to return
 * SWAPBLK_NONE on an allocation failure.
 *
 * A radix tree is used to maintain the bitmap. Two radix constants are
 * involved: one for the bitmaps contained in the leaf nodes (typically
 * 64), and one for the meta nodes (typically 16). Both meta and leaf
 * nodes have a hint field. This field gives us a hint as to the largest
 * free contiguous range of blocks under the node. It may contain a
 * value that is too high, but will never contain a value that is too
 * low. When the radix tree is searched, allocation failures in subtrees
 * update the hint.
 *
 * The radix tree also implements two collapsed states for meta nodes:
 * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
 * in either of these two states, all information contained underneath
 * the node is considered stale. These states are used to optimize
 * allocation and freeing operations.
 *
 * The hinting greatly increases code efficiency for allocations while
 * the general radix structure optimizes both allocations and frees. The
 * radix tree should be able to operate well no matter how much
 * fragmentation there is and no matter how large a bitmap is used.
 *
 * The blist code wires all necessary memory at creation time. Neither
 * allocations nor frees require interaction with the memory subsystem.
 * The non-blocking features of the blist code are used in the swap code
 * (vm/swap_pager.c).
 *
 * LAYOUT: The radix tree is laid out recursively using a linear array.
 * Each meta node is immediately followed (laid out sequentially in
 * memory) by BLIST_META_RADIX lower-level nodes. This is a recursive
 * structure but one that can be easily scanned through a very simple
 * 'skip' calculation. In order to support large radixes, portions of
 * the tree may reside outside our memory allocation. We handle this
 * with an early-termination optimization (when bighint is set to -1)
 * on the scan. The memory allocation is only large enough to cover
 * the number of blocks requested at creation time, even if it must be
 * encompassed in a larger root-node radix.
 *
 * NOTE: The allocator cannot currently allocate more than
 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
 * large' if you try. This is an area that could use improvement. The
 * radix is large enough that this restriction does not affect the swap
 * system, though. Currently only the allocation code is affected by
 * this algorithmic unfeature. The freeing code can handle arbitrary
 * ranges.
 *
 * This code can be compiled stand-alone for debugging.
 */
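/*
 * Illustrative layout sketch (editorial addition, not from the original
 * comments): with the typical constants BLIST_BMAP_RADIX = 64 and
 * BLIST_META_RADIX = 16, a subtree covering radix = 16 * 64 = 1024
 * blocks occupies 17 consecutive array slots:
 *
 *	node[0]		meta node for the 1024-block subtree
 *	node[1..16]	16 leaf bitmaps of 64 blocks each
 *
 * That slot count, 17, is the 'skip' distance between consecutive
 * children of the next-higher meta node, which covers 16 * 1024 blocks.
 */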
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
#define CTASSERT(expr)

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
		    daddr_t cursor);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void	blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void	blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void	blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t	blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

CTASSERT(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0);
#define BLIST_META_MASK	(BLIST_META_RADIX - 1)
/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
 * in the 'meta' functions that process subtrees. Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}

/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t nodes, radix;

	/*
	 * Calculate the radix field used for scanning.
	 */
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
	}
	nodes = 1 + blst_radix_init(NULL, radix, blocks);

	bl = malloc(sizeof(struct blist), M_SWAP, flags);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;
	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
	if (bl->bl_root == NULL) {
		free(bl, M_SWAP);
		return (NULL);
	}
	blst_radix_init(bl->bl_root, radix, blocks);

#if defined(BLIST_DEBUG)
	printf(
	    "BLIST representing %lld blocks (%lld MB of swap)"
	    ", requiring %lldK of ram\n",
	    (long long)bl->bl_blocks,
	    (long long)bl->bl_blocks * 4 / 1024,
	    (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}

void
blist_destroy(blist_t bl)
{
	free(bl->bl_root, M_SWAP);
	free(bl, M_SWAP);
}
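/*
 * Usage example (illustrative only, using the stand-alone debug build):
 * a blist is created with every block allocated, so the usual pattern
 * is to free the managed range before drawing from it, just as main()
 * below and the swap pager do:
 *
 *	blist_t bl = blist_create(1024, M_WAITOK);
 *	blist_free(bl, 0, 1024);		// mark all 1024 blocks free
 *	daddr_t blk = blist_alloc(bl, 16);	// SWAPBLK_NONE on failure
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, 16);	// release the run
 *	blist_destroy(bl);
 */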
/*
 * blist_alloc() -	reserve space in the block bitmap. Return the base
 *			of a contiguous region or SWAPBLK_NONE if space could
 *			not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice. An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero. When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap. Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations. Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks. This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one. When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print() -	dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};
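/*
 * Illustrative note (editorial addition): the histogram buckets gap
 * sizes by Fibonacci ranges, so histo[i] counts gaps of size s with
 * fib[i] <= s < fib[i + 1]. For example, a maximal free range of 6
 * blocks lands in histo[3], the "5 to 7" bucket, since fib[3] = 5 and
 * fib[4] - 1 = 7.
 */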
/*
 * gap_stats_counting() -	is the state 'counting 1 bits' or
 *				'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats() -	initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats() -	update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
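/*
 * Worked example (editorial addition): the avg/err updates above
 * maintain a running integer mean with the invariant
 * sum = num * avg + err. For gap sizes 3, 4, 8 observed in that order:
 *
 *	size 3:	err = 3, num = 1, avg += 3/1 -> avg = 3, err = 0
 *	size 4:	err = 1, num = 2, avg += 1/2 -> avg = 3, err = 1
 *	size 8:	err = 6, num = 3, avg += 6/3 -> avg = 5, err = 0
 *
 * After the third update, sum = 15 = 3 * 5 + 0, so avg = 5 is the exact
 * integer mean, computed without ever storing the full sum.
 */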
/*
 * dump_gap_stats() -	print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count | size range\n");
	sbuf_printf(s, "               ----- | ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd | ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats() -	dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf. Each set bit in diff marks a position
			 * where the free/allocated state of adjacent bits
			 * changes, i.e. a gap boundary.
			 */
			mask = bl->bl_root[nodes].u.bmu_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work. They may seem
 *	rather longish, but that's because I've commented them up. The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case. Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
{
	u_daddr_t mask;
	int count1, lo, num_shifts, range1, range_ext;

	if (count == BLIST_BMAP_RADIX) {
		/*
		 * Optimize allocation of BLIST_BMAP_RADIX bits. If this
		 * wasn't a special case, then forming the final value of
		 * 'mask' below would require special handling to avoid an
		 * invalid left shift when count equals the number of bits in
		 * mask.
		 */
		if (~scan->u.bmu_bitmap != 0) {
			scan->bm_bighint = BLIST_BMAP_RADIX - 1;
			return (SWAPBLK_NONE);
		}
		if (cursor != blk)
			return (SWAPBLK_NONE);
		scan->u.bmu_bitmap = 0;
		scan->bm_bighint = 0;
		return (blk);
	}
	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while (mask != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->u.bmu_bitmap. The value of range1 is equal to
		 * count1 >> num_shifts. Grow range and reduce num_shifts to
		 * 0, while preserving these invariants. The updates to mask
		 * leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->u.bmu_bitmap.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		mask &= mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint. There is no allocation bigger than range1
		 * available in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/*
	 * Discard any candidates that appear before the cursor.
	 */
	lo = cursor - blk;
	mask &= ~(u_daddr_t)0 << lo;

	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size. Clear all the bits but that
	 * one, and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	/*
	 * Set in mask exactly the bits being allocated, and clear them from
	 * the set of available bits.
	 */
	mask = (mask << count) - mask;
	scan->u.bmu_bitmap &= ~mask;
	return (blk + lo);
}
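/*
 * Worked example for blst_leaf_alloc() (editorial addition, using an
 * 8-bit bitmap for brevity): take bmu_bitmap = 01111100 (blocks 2-6
 * free) and count = 3, so count1 = 2 and num_shifts = fls(2) = 2:
 *
 *	pass 1:	range_ext = 1, mask = 01111100 & 00111110 = 00111100
 *		(each set bit i now means bits [i, i+1] are free)
 *	pass 2:	range_ext = 1, mask = 00111100 & 00011110 = 00011100
 *		(each set bit i now means bits [i, i+2] are free)
 *
 * The least significant survivor is bit 2, so lo = 2 and the allocation
 * mask (mask << 3) - mask = 00011100 marks exactly blocks 2-4.
 */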
/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node. If we can't, we update
 *	bighint and return a failure. Updating bighint optimizes future
 *	calls that hit this node. We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	blk = cursor & -radix;
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, blk, count, cursor));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks. Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children. An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}
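/*
 * Worked example (editorial addition): with BLIST_BMAP_RADIX = 64 and
 * BLIST_META_RADIX = 16, consider blst_meta_alloc() on a root covering
 * radix = 1024 blocks with cursor = 700. Then blk = 700 & -1024 = 0,
 * skip = 17 and next_skip = 1, and after radix /= 16 each child leaf
 * covers 64 blocks. The scan starts at child = 700 / 64 = 10, i.e. at
 * array index i = 1 + 10 = 11 with blk = 640, so children lying
 * entirely before the cursor are never visited.
 */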
/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	/*
	 * free some data in this bitmap
	 *
	 * e.g.
	 *	0000111111111110000
	 *	    \_________/\__/
	 *	         v      n
	 */
	int n = blk & (BLIST_BMAP_RADIX - 1);
	u_daddr_t mask;

	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	if (scan->u.bmu_bitmap & mask)
		panic("blst_radix_free: freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here. We are required to make
	 * bighint at least as large as the largest contiguous run of free
	 * blocks. If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() -	free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node. If this
 *	is a meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}
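/*
 * Worked example (editorial addition, using a 16-bit word for brevity):
 * in blst_leaf_free(), freeing count = 3 blocks at offset n = 4 builds
 *
 *	(-1 << 4)            = 1111111111110000
 *	(-1 >> (16 - 3 - 4)) = 0000000001111111
 *	mask                 = 0000000001110000
 *
 * so exactly bits 4-6 are set. The same intersection of an upper and a
 * lower all-ones mask is used by blst_leaf_fill() below.
 */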
/*
 * BLST_COPY() -	copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree. The space must not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}


	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range. Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	int n = blk & (BLIST_BMAP_RADIX - 1);
	daddr_t nblks;
	u_daddr_t mask;

	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range. The
 *	range must be within the extent of this node. Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children. An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}
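/*
 * Example of the fill semantics (editorial addition, illustrative
 * only): after
 *
 *	blist_t bl = blist_create(64, M_WAITOK);
 *	blist_free(bl, 0, 64);
 *
 * a first blist_fill(bl, 10, 20) returns 20 (all 20 blocks were free
 * and are now allocated), while repeating the same call returns 0;
 * filling already-allocated blocks is permitted but contributes
 * nothing to the returned count.
 */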
/*
 * BLST_RADIX_INIT() -	initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks. This space may
 *	be considerably less than what a full tree for the calculated radix
 *	would require, due to the large RADIX values we use.
 */
static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
{
	daddr_t i, memindex, next_skip, skip;

	memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return (memindex);
	}

	/*
	 * Meta node. If allocating the entire object we can special
	 * case it. However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; i < skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    radix);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    count);
			count = 0;
		} else {
			/*
			 * Add terminator and break out
			 */
			if (scan)
				scan[i].bm_bighint = (daddr_t)-1;
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return (memindex);
}
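/*
 * Worked example (editorial addition): for blist_create(2048, ...) with
 * the typical constants, the radix grows 64 -> 1024 -> 16384, the
 * smallest radix covering 2048 blocks. A fully populated radix-16384
 * tree has 1 + 16 + 256 = 273 nodes, but blst_radix_init() lays out
 * only the root at index 0, two full 1024-block subtrees at indices
 * 1-17 and 18-34, and a terminator at index 35, so blist_create()
 * allocates just 36 blmeta_t entries.
 */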
#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif
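/*
 * Example session with the stand-alone debug build (editorial addition,
 * illustrative; the binary name and the exact dump depend on how the
 * file is built and on the constants in sys/blist.h):
 *
 *	$ ./blist 1024
 *	1024/1024/1024> a 16
 *	    R=00000000
 *	1008/1024/1024> f 0 8
 *	1016/1024/1024> s
 *	number of maximal free ranges: 2
 *	...
 *
 * The prompt shows free blocks, total blocks, and the root radix; 'h'
 * or '?' lists the commands implemented by main() below.
 */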
#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif