/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via range_tree_add() can do one of three things:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via range_tree_remove() can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the range_tree_walk()
 * or range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a range_tree_ops_t structure
 * to the range_tree_create() function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that range trees with gaps only support removing
 * complete segments.
 */
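
/*
 * Example (illustrative only; the offsets below are arbitrary): a minimal
 * consumer of the interface described above. With no ops vector and no
 * gap, the tree simply tracks which ranges are present, merging and
 * splitting segments as needed:
 *
 *	range_tree_t *rt = range_tree_create(NULL, NULL);
 *
 *	range_tree_add(rt, 0x1000, 0x2000);	   creates [0x1000, 0x3000)
 *	range_tree_add(rt, 0x3000, 0x1000);	   extends it to [0x1000, 0x4000)
 *	range_tree_remove(rt, 0x2000, 0x800);	   splits it, punching a hole
 *
 *	ASSERT(range_tree_contains(rt, 0x1000, 0x1000));
 *	ASSERT(!range_tree_contains(rt, 0x2000, 0x800));
 *
 *	range_tree_vacate(rt, NULL, NULL);	   tree must be empty before
 *	range_tree_destroy(rt);			   it can be destroyed
 */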

kmem_cache_t *range_seg_cache;

/* Generic ops for managing an AVL tree alongside a range tree */
struct range_tree_ops rt_avl_ops = {
	.rtop_create = rt_avl_create,
	.rtop_destroy = rt_avl_destroy,
	.rtop_add = rt_avl_add,
	.rtop_remove = rt_avl_remove,
	.rtop_vacate = rt_avl_vacate,
};

void
range_tree_init(void)
{
	ASSERT(range_seg_cache == NULL);
	range_seg_cache = kmem_cache_create("range_seg_cache",
	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
range_tree_fini(void)
{
	kmem_cache_destroy(range_seg_cache);
	range_seg_cache = NULL;
}

void
range_tree_stat_verify(range_tree_t *rt)
{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
			    i, hist, hist[i], rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}
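
/*
 * The histogram maintained by range_tree_stat_incr() and
 * range_tree_stat_decr() above buckets segments by power-of-two size:
 * bucket i counts the segments whose size falls in [2^i, 2^(i+1)).
 * For example (the size here is chosen purely for illustration), a
 * segment of size 0xC000 has highbit64(0xC000) == 16 and therefore
 * lands in bucket 15, which covers sizes in [2^15, 2^16).
 */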

/*
 * NOTE: caller is responsible for all locking.
 */
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = (const range_seg_t *)x1;
	const range_seg_t *r2 = (const range_seg_t *)x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	/*
	 * Non-overlapping segments are ordered by start offset; any two
	 * overlapping segments compare as equal, so avl_find() on a
	 * search segment returns a segment that overlaps it, if any.
	 */
	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

range_tree_t *
range_tree_create_impl(range_tree_ops_t *ops, void *arg,
    int (*avl_compare) (const void *, const void *), uint64_t gap)
{
	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	rt->rt_ops = ops;
	rt->rt_arg = arg;
	rt->rt_gap = gap;
	rt->rt_avl_compare = avl_compare;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg)
{
	return (range_tree_create_impl(ops, arg, NULL, 0));
}

void
range_tree_destroy(range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
{
	ASSERT3U(rs->rs_fill + delta, !=, 0);
	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	rs->rs_fill += delta;
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (gap == 0 && rs != NULL &&
	    rs->rs_start <= start && rs->rs_end >= end) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size,
		    (longlong_t)rs->rs_start,
		    (longlong_t)rs->rs_end - rs->rs_start);
		return;
	}

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		ASSERT3U(gap, !=, 0);
		if (rs->rs_start <= start && rs->rs_end >= end) {
			range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		avl_remove(&rt->rt_root, rs);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		range_tree_stat_decr(rt, rs);
		rt->rt_space -= rs->rs_end - rs->rs_start;

		fill += rs->rs_fill;
		start = MIN(start, rs->rs_start);
		end = MAX(end, rs->rs_end);
		size = end - start;

		range_tree_add_impl(rt, start, size, fill);

		kmem_cache_free(range_seg_cache, rs);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
	merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - rs_before->rs_end;
	if (merge_after && gap != 0)
		bridge_size += rs_after->rs_start - end;

	if (merge_before && merge_after) {
		avl_remove(&rt->rt_root, rs_before);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		range_tree_stat_decr(rt, rs_before);
		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_fill += rs_before->rs_fill + fill;
		rs_after->rs_start = rs_before->rs_start;
		kmem_cache_free(range_seg_cache, rs_before);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		range_tree_stat_decr(rt, rs_before);

		rs_before->rs_fill += fill;
		rs_before->rs_end = end;
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_fill += fill;
		rs_after->rs_start = start;
		rs = rs_after;
	} else {
		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);

		rs->rs_fill = fill;
		rs->rs_start = start;
		rs->rs_end = end;
		avl_insert(&rt->rt_root, rs, where);
	}

	if (gap != 0)
		ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
	else
		ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	range_tree_add_impl(arg, start, size, size);
}

static void
range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: freeing free segment "
		    "(offset=%llu size=%llu)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill
	 * count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (rs->rs_fill == size) {
				start = rs->rs_start;
				end = rs->rs_end;
				size = end - start;
			} else {
				range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (rs->rs_start != start || rs->rs_end != end) {
			zfs_panic_recover("zfs: freeing partial segment of "
			    "gap tree (offset=%llu size=%llu) of "
			    "(offset=%llu size=%llu)",
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rs->rs_start,
			    (longlong_t)rs->rs_end - rs->rs_start);
			return;
		}
	}

	VERIFY3U(rs->rs_start, <=, start);
	VERIFY3U(rs->rs_end, >=, end);

	left_over = (rs->rs_start != start);
	right_over = (rs->rs_end != end);

	range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
		newseg->rs_start = end;
		newseg->rs_end = rs->rs_end;
		newseg->rs_fill = newseg->rs_end - newseg->rs_start;
		range_tree_stat_incr(rt, newseg);

		rs->rs_end = start;

		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
	} else if (left_over) {
		rs->rs_end = start;
	} else if (right_over) {
		rs->rs_start = end;
	} else {
		avl_remove(&rt->rt_root, rs);
		kmem_cache_free(range_seg_cache, rs);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		rs->rs_fill = rs->rs_end - rs->rs_start;
		range_tree_stat_incr(rt, rs);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (rs->rs_end - rs->rs_start);

	range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	rs->rs_start = newstart;
	rs->rs_end = newstart + newsize;

	range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, NULL));
}

range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs = range_tree_find_impl(rt, start, size);
	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
		return (rs);
	return (NULL);
}

void
range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
{
	range_seg_t *rs = range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (range_tree_find(rt, start, size) != NULL);
}
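
/*
 * Note on lookup semantics (the offsets here are purely illustrative):
 * range_tree_find() and range_tree_contains() only report a hit when the
 * queried range lies entirely within a single segment. With one segment
 * [0x1000, 0x3000) in the tree:
 *
 *	range_tree_contains(rt, 0x1800, 0x800)		B_TRUE
 *	range_tree_contains(rt, 0x2800, 0x1000)		B_FALSE
 *
 * even though the first half of the second range is present. Partial
 * overlaps are handled by range_tree_clear() below, which trims its
 * argument to whatever actually is in the tree.
 */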

/*
 * Ensure that this range is not in the tree, removing any portion of it
 * that is present. Unlike range_tree_remove(), the range does not need to
 * be present (in whole or in part) to begin with.
 */
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;

	if (size == 0)
		return;

	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(rs->rs_start, start);
		uint64_t free_end = MIN(rs->rs_end, start + size);
		range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
	range_tree_t *rt;

	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;
	void *cookie = NULL;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
		if (func != NULL)
			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
		kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

range_seg_t *
range_tree_first(range_tree_t *rt)
{
	return (avl_first(&rt->rt_root));
}

uint64_t
range_tree_space(range_tree_t *rt)
{
	return (rt->rt_space);
}

/* Generic range tree functions for maintaining segments in an AVL tree. */
void
rt_avl_create(range_tree_t *rt, void *arg)
{
	avl_tree_t *tree = arg;

	avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
	    offsetof(range_seg_t, rs_pp_node));
}

void
rt_avl_destroy(range_tree_t *rt, void *arg)
{
	avl_tree_t *tree = arg;

	ASSERT0(avl_numnodes(tree));
	avl_destroy(tree);
}

void
rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	avl_tree_t *tree = arg;
	avl_add(tree, rs);
}

void
rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	avl_tree_t *tree = arg;
	avl_remove(tree, rs);
}

void
rt_avl_vacate(range_tree_t *rt, void *arg)
{
	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	rt_avl_create(rt, arg);
}

boolean_t
range_tree_is_empty(range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (range_tree_space(rt) == 0);
}

uint64_t
range_tree_min(range_tree_t *rt)
{
	range_seg_t *rs = avl_first(&rt->rt_root);
	return (rs != NULL ? rs->rs_start : 0);
}

uint64_t
range_tree_max(range_tree_t *rt)
{
	range_seg_t *rs = avl_last(&rt->rt_root);
	return (rs != NULL ? rs->rs_end : 0);
}

uint64_t
range_tree_span(range_tree_t *rt)
{
	return (range_tree_max(rt) - range_tree_min(rt));
}
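
/*
 * The rt_avl_ops table and rt_avl_* callbacks above let a caller maintain a
 * second, differently ordered AVL tree of the same segments (linked through
 * rs_pp_node). A hypothetical consumer that wants the segments indexed by
 * size as well as by offset might do something like this (sketch only;
 * my_size_compare and my_tree are illustrative names, not part of this
 * file):
 *
 *	static int
 *	my_size_compare(const void *x, const void *y)
 *	{
 *		const range_seg_t *ra = x, *rb = y;
 *		uint64_t sa = ra->rs_end - ra->rs_start;
 *		uint64_t sb = rb->rs_end - rb->rs_start;
 *
 *		if (sa < sb)
 *			return (-1);
 *		if (sa > sb)
 *			return (1);
 *		if (ra->rs_start < rb->rs_start)
 *			return (-1);
 *		if (ra->rs_start > rb->rs_start)
 *			return (1);
 *		return (0);
 *	}
 *
 *	avl_tree_t my_tree;
 *	range_tree_t *rt = range_tree_create_impl(&rt_avl_ops, &my_tree,
 *	    my_size_compare, 0);
 *
 * Every segment added to (or removed from) rt is then mirrored into
 * my_tree, sorted by size with the start offset as a tie-breaker.
 */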