1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/dmu.h> 29 #include <sys/dmu_tx.h> 30 #include <sys/space_map.h> 31 #include <sys/metaslab_impl.h> 32 #include <sys/vdev_impl.h> 33 #include <sys/zio.h> 34 #include <sys/spa_impl.h> 35 36 /* 37 * Allow allocations to switch to gang blocks quickly. We do this to 38 * avoid having to load lots of space_maps in a given txg. There are, 39 * however, some cases where we want to avoid "fast" ganging and instead 40 * we want to do an exhaustive search of all metaslabs on this device. 41 * Currently we don't allow any gang, zil, or dump device related allocations 42 * to "fast" gang. 43 */ 44 #define CAN_FASTGANG(flags) \ 45 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 46 METASLAB_GANG_AVOID))) 47 48 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 49 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 50 #define METASLAB_ACTIVE_MASK \ 51 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 52 53 uint64_t metaslab_aliquot = 512ULL << 10; 54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 55 56 /* 57 * The in-core space map representation is more compact than its on-disk form. 58 * The zfs_condense_pct determines how much more compact the in-core 59 * space_map representation must be before we compact it on-disk. 60 * Values should be greater than or equal to 100. 61 */ 62 int zfs_condense_pct = 200; 63 64 /* 65 * This value defines the number of allowed allocation failures per vdev. 66 * If a device reaches this threshold in a given txg then we consider skipping 67 * allocations on that device. The value of zfs_mg_alloc_failures is computed 68 * in zio_init() unless it has been overridden in /etc/system. 69 */ 70 int zfs_mg_alloc_failures = 0; 71 72 /* 73 * The zfs_mg_noalloc_threshold defines which metaslab groups should 74 * be eligible for allocation. The value is defined as a percentage of 75 * a free space. Metaslab groups that have more free space than 76 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 77 * a metaslab group's free space is less than or equal to the 78 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 79 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 80 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 81 * groups are allowed to accept allocations. Gang blocks are always 82 * eligible to allocate on any metaslab group. 
The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Should we be willing to write data to degraded vdevs?
 */
boolean_t zfs_write_to_degraded = B_FALSE;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable additional weight factor for each metaslab.
 */
boolean_t metaslab_weight_factor_enable = B_FALSE;


/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
191 */ 192 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 193 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 194 195 if ((mg = mc->mc_rotor) == NULL) 196 return (0); 197 198 do { 199 vd = mg->mg_vd; 200 ASSERT(vd->vdev_mg != NULL); 201 ASSERT3P(vd->vdev_top, ==, vd); 202 ASSERT3P(mg->mg_class, ==, mc); 203 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 204 } while ((mg = mg->mg_next) != mc->mc_rotor); 205 206 return (0); 207 } 208 209 void 210 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 211 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 212 { 213 atomic_add_64(&mc->mc_alloc, alloc_delta); 214 atomic_add_64(&mc->mc_deferred, defer_delta); 215 atomic_add_64(&mc->mc_space, space_delta); 216 atomic_add_64(&mc->mc_dspace, dspace_delta); 217 } 218 219 uint64_t 220 metaslab_class_get_alloc(metaslab_class_t *mc) 221 { 222 return (mc->mc_alloc); 223 } 224 225 uint64_t 226 metaslab_class_get_deferred(metaslab_class_t *mc) 227 { 228 return (mc->mc_deferred); 229 } 230 231 uint64_t 232 metaslab_class_get_space(metaslab_class_t *mc) 233 { 234 return (mc->mc_space); 235 } 236 237 uint64_t 238 metaslab_class_get_dspace(metaslab_class_t *mc) 239 { 240 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 241 } 242 243 /* 244 * ========================================================================== 245 * Metaslab groups 246 * ========================================================================== 247 */ 248 static int 249 metaslab_compare(const void *x1, const void *x2) 250 { 251 const metaslab_t *m1 = x1; 252 const metaslab_t *m2 = x2; 253 254 if (m1->ms_weight < m2->ms_weight) 255 return (1); 256 if (m1->ms_weight > m2->ms_weight) 257 return (-1); 258 259 /* 260 * If the weights are identical, use the offset to force uniqueness. 261 */ 262 if (m1->ms_start < m2->ms_start) 263 return (-1); 264 if (m1->ms_start > m2->ms_start) 265 return (1); 266 267 ASSERT3P(m1, ==, m2); 268 269 return (0); 270 } 271 272 /* 273 * Update the allocatable flag and the metaslab group's capacity. 274 * The allocatable flag is set to true if the capacity is below 275 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 276 * from allocatable to non-allocatable or vice versa then the metaslab 277 * group's class is updated to reflect the transition. 278 */ 279 static void 280 metaslab_group_alloc_update(metaslab_group_t *mg) 281 { 282 vdev_t *vd = mg->mg_vd; 283 metaslab_class_t *mc = mg->mg_class; 284 vdev_stat_t *vs = &vd->vdev_stat; 285 boolean_t was_allocatable; 286 287 ASSERT(vd == vd->vdev_top); 288 289 mutex_enter(&mg->mg_lock); 290 was_allocatable = mg->mg_allocatable; 291 292 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 293 (vs->vs_space + 1); 294 295 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); 296 297 /* 298 * The mc_alloc_groups maintains a count of the number of 299 * groups in this metaslab class that are still above the 300 * zfs_mg_noalloc_threshold. This is used by the allocating 301 * threads to determine if they should avoid allocations to 302 * a given group. The allocator will avoid allocations to a group 303 * if that group has reached or is below the zfs_mg_noalloc_threshold 304 * and there are still other groups that are above the threshold. 305 * When a group transitions from allocatable to non-allocatable or 306 * vice versa we update the metaslab class to reflect that change. 
307 * When the mc_alloc_groups value drops to 0 that means that all 308 * groups have reached the zfs_mg_noalloc_threshold making all groups 309 * eligible for allocations. This effectively means that all devices 310 * are balanced again. 311 */ 312 if (was_allocatable && !mg->mg_allocatable) 313 mc->mc_alloc_groups--; 314 else if (!was_allocatable && mg->mg_allocatable) 315 mc->mc_alloc_groups++; 316 mutex_exit(&mg->mg_lock); 317 } 318 319 metaslab_group_t * 320 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 321 { 322 metaslab_group_t *mg; 323 324 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 325 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 326 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 327 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 328 mg->mg_vd = vd; 329 mg->mg_class = mc; 330 mg->mg_activation_count = 0; 331 332 mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct, 333 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 334 335 return (mg); 336 } 337 338 void 339 metaslab_group_destroy(metaslab_group_t *mg) 340 { 341 ASSERT(mg->mg_prev == NULL); 342 ASSERT(mg->mg_next == NULL); 343 /* 344 * We may have gone below zero with the activation count 345 * either because we never activated in the first place or 346 * because we're done, and possibly removing the vdev. 347 */ 348 ASSERT(mg->mg_activation_count <= 0); 349 350 avl_destroy(&mg->mg_metaslab_tree); 351 mutex_destroy(&mg->mg_lock); 352 kmem_free(mg, sizeof (metaslab_group_t)); 353 } 354 355 void 356 metaslab_group_activate(metaslab_group_t *mg) 357 { 358 metaslab_class_t *mc = mg->mg_class; 359 metaslab_group_t *mgprev, *mgnext; 360 361 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 362 363 ASSERT(mc->mc_rotor != mg); 364 ASSERT(mg->mg_prev == NULL); 365 ASSERT(mg->mg_next == NULL); 366 ASSERT(mg->mg_activation_count <= 0); 367 368 if (++mg->mg_activation_count <= 0) 369 return; 370 371 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 372 metaslab_group_alloc_update(mg); 373 374 if ((mgprev = mc->mc_rotor) == NULL) { 375 mg->mg_prev = mg; 376 mg->mg_next = mg; 377 } else { 378 mgnext = mgprev->mg_next; 379 mg->mg_prev = mgprev; 380 mg->mg_next = mgnext; 381 mgprev->mg_next = mg; 382 mgnext->mg_prev = mg; 383 } 384 mc->mc_rotor = mg; 385 } 386 387 void 388 metaslab_group_passivate(metaslab_group_t *mg) 389 { 390 metaslab_class_t *mc = mg->mg_class; 391 metaslab_group_t *mgprev, *mgnext; 392 393 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 394 395 if (--mg->mg_activation_count != 0) { 396 ASSERT(mc->mc_rotor != mg); 397 ASSERT(mg->mg_prev == NULL); 398 ASSERT(mg->mg_next == NULL); 399 ASSERT(mg->mg_activation_count < 0); 400 return; 401 } 402 403 taskq_wait(mg->mg_taskq); 404 405 mgprev = mg->mg_prev; 406 mgnext = mg->mg_next; 407 408 if (mg == mgnext) { 409 mc->mc_rotor = NULL; 410 } else { 411 mc->mc_rotor = mgnext; 412 mgprev->mg_next = mgnext; 413 mgnext->mg_prev = mgprev; 414 } 415 416 mg->mg_prev = NULL; 417 mg->mg_next = NULL; 418 } 419 420 static void 421 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 422 { 423 mutex_enter(&mg->mg_lock); 424 ASSERT(msp->ms_group == NULL); 425 msp->ms_group = mg; 426 msp->ms_weight = 0; 427 avl_add(&mg->mg_metaslab_tree, msp); 428 mutex_exit(&mg->mg_lock); 429 } 430 431 static void 432 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 433 { 434 mutex_enter(&mg->mg_lock); 435 ASSERT(msp->ms_group == mg); 436 avl_remove(&mg->mg_metaslab_tree, msp); 437 
msp->ms_group = NULL; 438 mutex_exit(&mg->mg_lock); 439 } 440 441 static void 442 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 443 { 444 /* 445 * Although in principle the weight can be any value, in 446 * practice we do not use values in the range [1, 510]. 447 */ 448 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); 449 ASSERT(MUTEX_HELD(&msp->ms_lock)); 450 451 mutex_enter(&mg->mg_lock); 452 ASSERT(msp->ms_group == mg); 453 avl_remove(&mg->mg_metaslab_tree, msp); 454 msp->ms_weight = weight; 455 avl_add(&mg->mg_metaslab_tree, msp); 456 mutex_exit(&mg->mg_lock); 457 } 458 459 /* 460 * Determine if a given metaslab group should skip allocations. A metaslab 461 * group should avoid allocations if its used capacity has crossed the 462 * zfs_mg_noalloc_threshold and there is at least one metaslab group 463 * that can still handle allocations. 464 */ 465 static boolean_t 466 metaslab_group_allocatable(metaslab_group_t *mg) 467 { 468 vdev_t *vd = mg->mg_vd; 469 spa_t *spa = vd->vdev_spa; 470 metaslab_class_t *mc = mg->mg_class; 471 472 /* 473 * A metaslab group is considered allocatable if its free capacity 474 * is greater than the set value of zfs_mg_noalloc_threshold, it's 475 * associated with a slog, or there are no other metaslab groups 476 * with free capacity greater than zfs_mg_noalloc_threshold. 477 */ 478 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || 479 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 480 } 481 482 /* 483 * ========================================================================== 484 * Range tree callbacks 485 * ========================================================================== 486 */ 487 488 /* 489 * Comparison function for the private size-ordered tree. Tree is sorted 490 * by size, larger sizes at the end of the tree. 491 */ 492 static int 493 metaslab_rangesize_compare(const void *x1, const void *x2) 494 { 495 const range_seg_t *r1 = x1; 496 const range_seg_t *r2 = x2; 497 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 498 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 499 500 if (rs_size1 < rs_size2) 501 return (-1); 502 if (rs_size1 > rs_size2) 503 return (1); 504 505 if (r1->rs_start < r2->rs_start) 506 return (-1); 507 508 if (r1->rs_start > r2->rs_start) 509 return (1); 510 511 return (0); 512 } 513 514 /* 515 * Create any block allocator specific components. The current allocators 516 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 517 */ 518 static void 519 metaslab_rt_create(range_tree_t *rt, void *arg) 520 { 521 metaslab_t *msp = arg; 522 523 ASSERT3P(rt->rt_arg, ==, msp); 524 ASSERT(msp->ms_tree == NULL); 525 526 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 527 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 528 } 529 530 /* 531 * Destroy the block allocator specific components. 
532 */ 533 static void 534 metaslab_rt_destroy(range_tree_t *rt, void *arg) 535 { 536 metaslab_t *msp = arg; 537 538 ASSERT3P(rt->rt_arg, ==, msp); 539 ASSERT3P(msp->ms_tree, ==, rt); 540 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 541 542 avl_destroy(&msp->ms_size_tree); 543 } 544 545 static void 546 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 547 { 548 metaslab_t *msp = arg; 549 550 ASSERT3P(rt->rt_arg, ==, msp); 551 ASSERT3P(msp->ms_tree, ==, rt); 552 VERIFY(!msp->ms_condensing); 553 avl_add(&msp->ms_size_tree, rs); 554 } 555 556 static void 557 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 558 { 559 metaslab_t *msp = arg; 560 561 ASSERT3P(rt->rt_arg, ==, msp); 562 ASSERT3P(msp->ms_tree, ==, rt); 563 VERIFY(!msp->ms_condensing); 564 avl_remove(&msp->ms_size_tree, rs); 565 } 566 567 static void 568 metaslab_rt_vacate(range_tree_t *rt, void *arg) 569 { 570 metaslab_t *msp = arg; 571 572 ASSERT3P(rt->rt_arg, ==, msp); 573 ASSERT3P(msp->ms_tree, ==, rt); 574 575 /* 576 * Normally one would walk the tree freeing nodes along the way. 577 * Since the nodes are shared with the range trees we can avoid 578 * walking all nodes and just reinitialize the avl tree. The nodes 579 * will be freed by the range tree, so we don't want to free them here. 580 */ 581 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 582 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 583 } 584 585 static range_tree_ops_t metaslab_rt_ops = { 586 metaslab_rt_create, 587 metaslab_rt_destroy, 588 metaslab_rt_add, 589 metaslab_rt_remove, 590 metaslab_rt_vacate 591 }; 592 593 /* 594 * ========================================================================== 595 * Metaslab block operations 596 * ========================================================================== 597 */ 598 599 /* 600 * Return the maximum contiguous segment within the metaslab. 601 */ 602 uint64_t 603 metaslab_block_maxsize(metaslab_t *msp) 604 { 605 avl_tree_t *t = &msp->ms_size_tree; 606 range_seg_t *rs; 607 608 if (t == NULL || (rs = avl_last(t)) == NULL) 609 return (0ULL); 610 611 return (rs->rs_end - rs->rs_start); 612 } 613 614 uint64_t 615 metaslab_block_alloc(metaslab_t *msp, uint64_t size) 616 { 617 uint64_t start; 618 range_tree_t *rt = msp->ms_tree; 619 620 VERIFY(!msp->ms_condensing); 621 622 start = msp->ms_ops->msop_alloc(msp, size); 623 if (start != -1ULL) { 624 vdev_t *vd = msp->ms_group->mg_vd; 625 626 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 627 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 628 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 629 range_tree_remove(rt, start, size); 630 } 631 return (start); 632 } 633 634 /* 635 * ========================================================================== 636 * Common allocator routines 637 * ========================================================================== 638 */ 639 640 /* 641 * This is a helper function that can be used by the allocator to find 642 * a suitable block to allocate. This will search the specified AVL 643 * tree looking for a block that matches the specified criteria. 
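 * The search starts at *cursor; if no segment with enough aligned space is
 * found between the cursor and the end of the tree, the cursor is reset to
 * zero and the search is retried once from the beginning, returning -1ULL
 * when no segment can satisfy the requested size and alignment.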
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL)
		rs = avl_nearest(t, where, AVL_AFTER);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that allocations of other sizes
	 * will not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_tree->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

/* ARGSUSED */
static boolean_t
metaslab_ff_fragmented(metaslab_t *msp)
{
	return (B_TRUE);
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc,
	metaslab_ff_fragmented
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that allocations of other sizes
	 * will not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
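	 * For example, with the default metaslab_df_free_pct of 4, the
	 * switch to best-fit happens once less than 4% of the metaslab's
	 * space remains free, or once no single free segment is at least
	 * metaslab_df_alloc_threshold bytes long.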
748 */ 749 if (max_size < metaslab_df_alloc_threshold || 750 free_pct < metaslab_df_free_pct) { 751 t = &msp->ms_size_tree; 752 *cursor = 0; 753 } 754 755 return (metaslab_block_picker(t, cursor, size, 1ULL)); 756 } 757 758 static boolean_t 759 metaslab_df_fragmented(metaslab_t *msp) 760 { 761 range_tree_t *rt = msp->ms_tree; 762 uint64_t max_size = metaslab_block_maxsize(msp); 763 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 764 765 if (max_size >= metaslab_df_alloc_threshold && 766 free_pct >= metaslab_df_free_pct) 767 return (B_FALSE); 768 769 return (B_TRUE); 770 } 771 772 static metaslab_ops_t metaslab_df_ops = { 773 metaslab_df_alloc, 774 metaslab_df_fragmented 775 }; 776 777 /* 778 * ========================================================================== 779 * Cursor fit block allocator - 780 * Select the largest region in the metaslab, set the cursor to the beginning 781 * of the range and the cursor_end to the end of the range. As allocations 782 * are made advance the cursor. Continue allocating from the cursor until 783 * the range is exhausted and then find a new range. 784 * ========================================================================== 785 */ 786 static uint64_t 787 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 788 { 789 range_tree_t *rt = msp->ms_tree; 790 avl_tree_t *t = &msp->ms_size_tree; 791 uint64_t *cursor = &msp->ms_lbas[0]; 792 uint64_t *cursor_end = &msp->ms_lbas[1]; 793 uint64_t offset = 0; 794 795 ASSERT(MUTEX_HELD(&msp->ms_lock)); 796 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 797 798 ASSERT3U(*cursor_end, >=, *cursor); 799 800 if ((*cursor + size) > *cursor_end) { 801 range_seg_t *rs; 802 803 rs = avl_last(&msp->ms_size_tree); 804 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 805 return (-1ULL); 806 807 *cursor = rs->rs_start; 808 *cursor_end = rs->rs_end; 809 } 810 811 offset = *cursor; 812 *cursor += size; 813 814 return (offset); 815 } 816 817 static boolean_t 818 metaslab_cf_fragmented(metaslab_t *msp) 819 { 820 return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size); 821 } 822 823 static metaslab_ops_t metaslab_cf_ops = { 824 metaslab_cf_alloc, 825 metaslab_cf_fragmented 826 }; 827 828 /* 829 * ========================================================================== 830 * New dynamic fit allocator - 831 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 832 * contiguous blocks. If no region is found then just use the largest segment 833 * that remains. 834 * ========================================================================== 835 */ 836 837 /* 838 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 839 * to request from the allocator. 
840 */ 841 uint64_t metaslab_ndf_clump_shift = 4; 842 843 static uint64_t 844 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 845 { 846 avl_tree_t *t = &msp->ms_tree->rt_root; 847 avl_index_t where; 848 range_seg_t *rs, rsearch; 849 uint64_t hbit = highbit64(size); 850 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 851 uint64_t max_size = metaslab_block_maxsize(msp); 852 853 ASSERT(MUTEX_HELD(&msp->ms_lock)); 854 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 855 856 if (max_size < size) 857 return (-1ULL); 858 859 rsearch.rs_start = *cursor; 860 rsearch.rs_end = *cursor + size; 861 862 rs = avl_find(t, &rsearch, &where); 863 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 864 t = &msp->ms_size_tree; 865 866 rsearch.rs_start = 0; 867 rsearch.rs_end = MIN(max_size, 868 1ULL << (hbit + metaslab_ndf_clump_shift)); 869 rs = avl_find(t, &rsearch, &where); 870 if (rs == NULL) 871 rs = avl_nearest(t, where, AVL_AFTER); 872 ASSERT(rs != NULL); 873 } 874 875 if ((rs->rs_end - rs->rs_start) >= size) { 876 *cursor = rs->rs_start + size; 877 return (rs->rs_start); 878 } 879 return (-1ULL); 880 } 881 882 static boolean_t 883 metaslab_ndf_fragmented(metaslab_t *msp) 884 { 885 return (metaslab_block_maxsize(msp) <= 886 (metaslab_min_alloc_size << metaslab_ndf_clump_shift)); 887 } 888 889 static metaslab_ops_t metaslab_ndf_ops = { 890 metaslab_ndf_alloc, 891 metaslab_ndf_fragmented 892 }; 893 894 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 895 896 /* 897 * ========================================================================== 898 * Metaslabs 899 * ========================================================================== 900 */ 901 902 /* 903 * Wait for any in-progress metaslab loads to complete. 904 */ 905 void 906 metaslab_load_wait(metaslab_t *msp) 907 { 908 ASSERT(MUTEX_HELD(&msp->ms_lock)); 909 910 while (msp->ms_loading) { 911 ASSERT(!msp->ms_loaded); 912 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 913 } 914 } 915 916 int 917 metaslab_load(metaslab_t *msp) 918 { 919 int error = 0; 920 921 ASSERT(MUTEX_HELD(&msp->ms_lock)); 922 ASSERT(!msp->ms_loaded); 923 ASSERT(!msp->ms_loading); 924 925 msp->ms_loading = B_TRUE; 926 927 /* 928 * If the space map has not been allocated yet, then treat 929 * all the space in the metaslab as free and add it to the 930 * ms_tree. 
	 */
	if (msp->ms_sm != NULL)
		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
	else
		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);

	msp->ms_loaded = (error == 0);
	msp->ms_loading = B_FALSE;

	if (msp->ms_loaded) {
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			range_tree_walk(msp->ms_defertree[t],
			    range_tree_remove, msp->ms_tree);
		}
	}
	cv_broadcast(&msp->ms_load_cv);
	return (error);
}

void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_tree, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
}

metaslab_t *
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
	msp->ms_id = id;
	msp->ms_start = id << vd->vdev_ms_shift;
	msp->ms_size = 1ULL << vd->vdev_ms_shift;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for them.
	 */
	if (object != 0) {
		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
		ASSERT(msp->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * alloctree and freetree until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
	metaslab_group_add(mg, msp);

	msp->ms_ops = mg->mg_class->mc_ops;

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space_map object then load its space
	 * map so that we can verify frees.
1008 */ 1009 if (metaslab_debug_load && msp->ms_sm != NULL) { 1010 mutex_enter(&msp->ms_lock); 1011 VERIFY0(metaslab_load(msp)); 1012 mutex_exit(&msp->ms_lock); 1013 } 1014 1015 if (txg != 0) { 1016 vdev_dirty(vd, 0, NULL, txg); 1017 vdev_dirty(vd, VDD_METASLAB, msp, txg); 1018 } 1019 1020 return (msp); 1021 } 1022 1023 void 1024 metaslab_fini(metaslab_t *msp) 1025 { 1026 metaslab_group_t *mg = msp->ms_group; 1027 1028 metaslab_group_remove(mg, msp); 1029 1030 mutex_enter(&msp->ms_lock); 1031 1032 VERIFY(msp->ms_group == NULL); 1033 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1034 0, -msp->ms_size); 1035 space_map_close(msp->ms_sm); 1036 1037 metaslab_unload(msp); 1038 range_tree_destroy(msp->ms_tree); 1039 1040 for (int t = 0; t < TXG_SIZE; t++) { 1041 range_tree_destroy(msp->ms_alloctree[t]); 1042 range_tree_destroy(msp->ms_freetree[t]); 1043 } 1044 1045 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1046 range_tree_destroy(msp->ms_defertree[t]); 1047 } 1048 1049 ASSERT0(msp->ms_deferspace); 1050 1051 mutex_exit(&msp->ms_lock); 1052 cv_destroy(&msp->ms_load_cv); 1053 mutex_destroy(&msp->ms_lock); 1054 1055 kmem_free(msp, sizeof (metaslab_t)); 1056 } 1057 1058 /* 1059 * Apply a weighting factor based on the histogram information for this 1060 * metaslab. The current weighting factor is somewhat arbitrary and requires 1061 * additional investigation. The implementation provides a measure of 1062 * "weighted" free space and gives a higher weighting for larger contiguous 1063 * regions. The weighting factor is determined by counting the number of 1064 * sm_shift sectors that exist in each region represented by the histogram. 1065 * That value is then multiplied by the power of 2 exponent and the sm_shift 1066 * value. 1067 * 1068 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the 1069 * metaslab has an sm_shift value of 9 (512B): 1070 * 1071 * 1) calculate the number of sm_shift sectors in the region: 1072 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384 1073 * 2) multiply by the power of 2 exponent and the sm_shift value: 1074 * 16384 * 21 * 9 = 3096576 1075 * This value will be added to the weighting of the metaslab. 1076 */ 1077 static uint64_t 1078 metaslab_weight_factor(metaslab_t *msp) 1079 { 1080 uint64_t factor = 0; 1081 uint64_t sectors; 1082 int i; 1083 1084 /* 1085 * A null space map means that the entire metaslab is free, 1086 * calculate a weight factor that spans the entire size of the 1087 * metaslab. 1088 */ 1089 if (msp->ms_sm == NULL) { 1090 vdev_t *vd = msp->ms_group->mg_vd; 1091 1092 i = highbit64(msp->ms_size) - 1; 1093 sectors = msp->ms_size >> vd->vdev_ashift; 1094 return (sectors * i * vd->vdev_ashift); 1095 } 1096 1097 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) 1098 return (0); 1099 1100 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) { 1101 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1102 continue; 1103 1104 /* 1105 * Determine the number of sm_shift sectors in the region 1106 * indicated by the histogram. For example, given an 1107 * sm_shift value of 9 (512 bytes) and i = 4 then we know 1108 * that we're looking at an 8K region in the histogram 1109 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the 1110 * number of sm_shift sectors (512 bytes in this example), 1111 * we would take 8192 / 512 = 16. Since the histogram 1112 * is offset by sm_shift we can simply use the value of 1113 * of i to calculate this (i.e. 2^i = 16 where i = 4). 
1114 */ 1115 sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i; 1116 factor += (i + msp->ms_sm->sm_shift) * sectors; 1117 } 1118 return (factor * msp->ms_sm->sm_shift); 1119 } 1120 1121 static uint64_t 1122 metaslab_weight(metaslab_t *msp) 1123 { 1124 metaslab_group_t *mg = msp->ms_group; 1125 vdev_t *vd = mg->mg_vd; 1126 uint64_t weight, space; 1127 1128 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1129 1130 /* 1131 * This vdev is in the process of being removed so there is nothing 1132 * for us to do here. 1133 */ 1134 if (vd->vdev_removing) { 1135 ASSERT0(space_map_allocated(msp->ms_sm)); 1136 ASSERT0(vd->vdev_ms_shift); 1137 return (0); 1138 } 1139 1140 /* 1141 * The baseline weight is the metaslab's free space. 1142 */ 1143 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1144 weight = space; 1145 1146 /* 1147 * Modern disks have uniform bit density and constant angular velocity. 1148 * Therefore, the outer recording zones are faster (higher bandwidth) 1149 * than the inner zones by the ratio of outer to inner track diameter, 1150 * which is typically around 2:1. We account for this by assigning 1151 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1152 * In effect, this means that we'll select the metaslab with the most 1153 * free bandwidth rather than simply the one with the most free space. 1154 */ 1155 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1156 ASSERT(weight >= space && weight <= 2 * space); 1157 1158 msp->ms_factor = metaslab_weight_factor(msp); 1159 if (metaslab_weight_factor_enable) 1160 weight += msp->ms_factor; 1161 1162 if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) { 1163 /* 1164 * If this metaslab is one we're actively using, adjust its 1165 * weight to make it preferable to any inactive metaslab so 1166 * we'll polish it off. 1167 */ 1168 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1169 } 1170 1171 return (weight); 1172 } 1173 1174 static int 1175 metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1176 { 1177 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1178 1179 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1180 metaslab_load_wait(msp); 1181 if (!msp->ms_loaded) { 1182 int error = metaslab_load(msp); 1183 if (error) { 1184 metaslab_group_sort(msp->ms_group, msp, 0); 1185 return (error); 1186 } 1187 } 1188 1189 metaslab_group_sort(msp->ms_group, msp, 1190 msp->ms_weight | activation_weight); 1191 } 1192 ASSERT(msp->ms_loaded); 1193 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1194 1195 return (0); 1196 } 1197 1198 static void 1199 metaslab_passivate(metaslab_t *msp, uint64_t size) 1200 { 1201 /* 1202 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1203 * this metaslab again. In that case, it had better be empty, 1204 * or we would be leaving space on the table. 1205 */ 1206 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1207 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1208 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1209 } 1210 1211 static void 1212 metaslab_preload(void *arg) 1213 { 1214 metaslab_t *msp = arg; 1215 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1216 1217 mutex_enter(&msp->ms_lock); 1218 metaslab_load_wait(msp); 1219 if (!msp->ms_loaded) 1220 (void) metaslab_load(msp); 1221 1222 /* 1223 * Set the ms_access_txg value so that we don't unload it right away. 
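	 * A preloaded metaslab therefore stays loaded for at least
	 * metaslab_unload_delay txgs beyond the currently syncing txg
	 * before metaslab_sync_done() will consider unloading it.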
	 */
	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_preload(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m = 0;

	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
		taskq_wait(mg->mg_taskq);
		return;
	}
	mutex_enter(&mg->mg_lock);

	/*
	 * Prefetch the next potential metaslabs
	 */
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {

		/* If we have reached our preload limit then we're done */
		if (++m > metaslab_preload_limit)
			break;

		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
		    msp, TQ_SLEEP) != NULL);
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Determine if the space map's on-disk footprint is past our tolerance
 * for inefficiency. We would like to use the following criteria to make
 * our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 *    result of writing out the free space range tree.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 *    times the size of the free space range tree representation
 *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered range tree in the metaslab and calculate the
 * size required to write out the largest segment in our free tree. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 */
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	range_seg_t *rs;
	uint64_t size, entries, segsz;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);

	/*
	 * Use the ms_size_tree range tree, which is ordered by size, to
	 * obtain the largest segment in the free tree. If the tree is empty
	 * then we should condense the map.
	 */
	rs = avl_last(&msp->ms_size_tree);
	if (rs == NULL)
		return (B_TRUE);

	/*
	 * Calculate the number of 64-bit entries this segment would
	 * require when written to disk. If this single segment would be
	 * larger on-disk than the entire current on-disk structure, then
	 * clearly condensing will increase the on-disk structure size.
	 */
	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	entries = size / (MIN(size, SM_RUN_MAX));
	segsz = entries * sizeof (uint64_t);

	return (segsz <= space_map_length(msp->ms_sm) &&
	    space_map_length(msp->ms_sm) >= (zfs_condense_pct *
	    sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
}

/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed by
 * the entries of the free range tree.
 */
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
	range_tree_t *condense_tree;
	space_map_t *sm = msp->ms_sm;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(msp->ms_loaded);

	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
	    "smp size %llu, segments %lu", txg, msp->ms_id, msp,
	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));

	/*
	 * Create a range tree that is 100% allocated. We remove segments
	 * that have been freed in this txg, any deferred frees that exist,
	 * and any allocations in the future. Removing segments should be
	 * a relatively inexpensive operation since we expect these trees to
	 * have a small number of nodes.
	 */
	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);

	/*
	 * Remove what's been freed in this txg from the condense_tree.
	 * Since we're in sync_pass 1, we know that all the frees from
	 * this txg are in the freetree.
	 */
	range_tree_walk(freetree, range_tree_remove, condense_tree);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defertree[t],
		    range_tree_remove, condense_tree);
	}

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
		    range_tree_remove, condense_tree);
	}

	/*
	 * We're about to drop the metaslab's lock thus allowing
	 * other consumers to change its content. Set the
	 * metaslab's ms_condensing flag to ensure that
	 * allocations on this metaslab do not occur while we're
	 * in the middle of committing it to disk. This is only critical
	 * for the ms_tree as all other range trees use per txg
	 * views of their content.
	 */
	msp->ms_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	space_map_truncate(sm, tx);
	mutex_enter(&msp->ms_lock);

	/*
	 * While we would ideally like to create a space_map representation
	 * that consists only of allocation records, doing so can be
	 * prohibitively expensive because the in-core free tree can be
	 * large, and therefore computationally expensive to subtract
	 * from the condense_tree. Instead we sync out two trees, a cheap
	 * allocation only tree followed by the in-core free tree. While not
	 * optimal, this is typically close to optimal, and much cheaper to
	 * compute.
	 */
	space_map_write(sm, condense_tree, SM_ALLOC, tx);
	range_tree_vacate(condense_tree, NULL, NULL);
	range_tree_destroy(condense_tree);

	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
	msp->ms_condensing = B_FALSE;
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
	range_tree_t **freed_tree =
	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
	dmu_tx_t *tx;
	uint64_t object = space_map_object(msp->ms_sm);

	ASSERT(!vd->vdev_ishole);

	/*
	 * This metaslab has just been added so there's no work to do now.
	 */
	if (*freetree == NULL) {
		ASSERT3P(alloctree, ==, NULL);
		return;
	}

	ASSERT3P(alloctree, !=, NULL);
	ASSERT3P(*freetree, !=, NULL);
	ASSERT3P(*freed_tree, !=, NULL);

	if (range_tree_space(alloctree) == 0 &&
	    range_tree_space(*freetree) == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_tree. No other thread can
	 * be modifying this txg's alloctree, freetree, freed_tree, or
	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
	 * space_map ASSERTs. We drop it whenever we call into the DMU,
	 * because the DMU can call down to us (e.g. via zio_free()) at
	 * any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (msp->ms_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
		    &msp->ms_lock));
		ASSERT(msp->ms_sm != NULL);
	}

	mutex_enter(&msp->ms_lock);

	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
	    metaslab_should_condense(msp)) {
		metaslab_condense(msp, txg, tx);
	} else {
		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
	}

	range_tree_vacate(alloctree, NULL, NULL);

	if (msp->ms_loaded) {
		/*
		 * When the space map is loaded, we have an accurate
		 * histogram in the range tree. This gives us an opportunity
		 * to bring the space map's histogram up-to-date so we clear
		 * it first before updating it.
		 */
		space_map_histogram_clear(msp->ms_sm);
		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
	} else {
		/*
		 * Since the space map is not loaded we simply update the
		 * existing histogram with what was freed in this txg. This
		 * means that the on-disk histogram may not have an accurate
		 * view of the free space but it's close enough to allow
		 * us to make allocation decisions.
		 */
		space_map_histogram_add(msp->ms_sm, *freetree, tx);
	}

	/*
	 * For sync pass 1, we avoid traversing this txg's free range tree
	 * and instead will just swap the pointers for freetree and
	 * freed_tree. We can safely do this since the freed_tree is
	 * guaranteed to be empty on the initial pass.
1492 */ 1493 if (spa_sync_pass(spa) == 1) { 1494 range_tree_swap(freetree, freed_tree); 1495 } else { 1496 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1497 } 1498 1499 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1500 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1501 1502 mutex_exit(&msp->ms_lock); 1503 1504 if (object != space_map_object(msp->ms_sm)) { 1505 object = space_map_object(msp->ms_sm); 1506 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 1507 msp->ms_id, sizeof (uint64_t), &object, tx); 1508 } 1509 dmu_tx_commit(tx); 1510 } 1511 1512 /* 1513 * Called after a transaction group has completely synced to mark 1514 * all of the metaslab's free space as usable. 1515 */ 1516 void 1517 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 1518 { 1519 metaslab_group_t *mg = msp->ms_group; 1520 vdev_t *vd = mg->mg_vd; 1521 range_tree_t **freed_tree; 1522 range_tree_t **defer_tree; 1523 int64_t alloc_delta, defer_delta; 1524 1525 ASSERT(!vd->vdev_ishole); 1526 1527 mutex_enter(&msp->ms_lock); 1528 1529 /* 1530 * If this metaslab is just becoming available, initialize its 1531 * alloctrees, freetrees, and defertree and add its capacity to 1532 * the vdev. 1533 */ 1534 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 1535 for (int t = 0; t < TXG_SIZE; t++) { 1536 ASSERT(msp->ms_alloctree[t] == NULL); 1537 ASSERT(msp->ms_freetree[t] == NULL); 1538 1539 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 1540 &msp->ms_lock); 1541 msp->ms_freetree[t] = range_tree_create(NULL, msp, 1542 &msp->ms_lock); 1543 } 1544 1545 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1546 ASSERT(msp->ms_defertree[t] == NULL); 1547 1548 msp->ms_defertree[t] = range_tree_create(NULL, msp, 1549 &msp->ms_lock); 1550 } 1551 1552 vdev_space_update(vd, 0, 0, msp->ms_size); 1553 } 1554 1555 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1556 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 1557 1558 alloc_delta = space_map_alloc_delta(msp->ms_sm); 1559 defer_delta = range_tree_space(*freed_tree) - 1560 range_tree_space(*defer_tree); 1561 1562 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 1563 1564 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1565 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1566 1567 /* 1568 * If there's a metaslab_load() in progress, wait for it to complete 1569 * so that we have a consistent view of the in-core space map. 1570 */ 1571 metaslab_load_wait(msp); 1572 1573 /* 1574 * Move the frees from the defer_tree back to the free 1575 * range tree (if it's loaded). Swap the freed_tree and the 1576 * defer_tree -- this is safe to do because we've just emptied out 1577 * the defer_tree. 1578 */ 1579 range_tree_vacate(*defer_tree, 1580 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 1581 range_tree_swap(freed_tree, defer_tree); 1582 1583 space_map_update(msp->ms_sm); 1584 1585 msp->ms_deferspace += defer_delta; 1586 ASSERT3S(msp->ms_deferspace, >=, 0); 1587 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 1588 if (msp->ms_deferspace != 0) { 1589 /* 1590 * Keep syncing this metaslab until all deferred frees 1591 * are back in circulation. 
1592 */ 1593 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1594 } 1595 1596 if (msp->ms_loaded && msp->ms_access_txg < txg) { 1597 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1598 VERIFY0(range_tree_space( 1599 msp->ms_alloctree[(txg + t) & TXG_MASK])); 1600 } 1601 1602 if (!metaslab_debug_unload) 1603 metaslab_unload(msp); 1604 } 1605 1606 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1607 mutex_exit(&msp->ms_lock); 1608 1609 } 1610 1611 void 1612 metaslab_sync_reassess(metaslab_group_t *mg) 1613 { 1614 int64_t failures = mg->mg_alloc_failures; 1615 1616 metaslab_group_alloc_update(mg); 1617 atomic_add_64(&mg->mg_alloc_failures, -failures); 1618 1619 /* 1620 * Preload the next potential metaslabs 1621 */ 1622 metaslab_group_preload(mg); 1623 } 1624 1625 static uint64_t 1626 metaslab_distance(metaslab_t *msp, dva_t *dva) 1627 { 1628 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 1629 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 1630 uint64_t start = msp->ms_id; 1631 1632 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 1633 return (1ULL << 63); 1634 1635 if (offset < start) 1636 return ((start - offset) << ms_shift); 1637 if (offset > start) 1638 return ((offset - start) << ms_shift); 1639 return (0); 1640 } 1641 1642 static uint64_t 1643 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 1644 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) 1645 { 1646 spa_t *spa = mg->mg_vd->vdev_spa; 1647 metaslab_t *msp = NULL; 1648 uint64_t offset = -1ULL; 1649 avl_tree_t *t = &mg->mg_metaslab_tree; 1650 uint64_t activation_weight; 1651 uint64_t target_distance; 1652 int i; 1653 1654 activation_weight = METASLAB_WEIGHT_PRIMARY; 1655 for (i = 0; i < d; i++) { 1656 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 1657 activation_weight = METASLAB_WEIGHT_SECONDARY; 1658 break; 1659 } 1660 } 1661 1662 for (;;) { 1663 boolean_t was_active; 1664 1665 mutex_enter(&mg->mg_lock); 1666 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 1667 if (msp->ms_weight < asize) { 1668 spa_dbgmsg(spa, "%s: failed to meet weight " 1669 "requirement: vdev %llu, txg %llu, mg %p, " 1670 "msp %p, psize %llu, asize %llu, " 1671 "failures %llu, weight %llu", 1672 spa_name(spa), mg->mg_vd->vdev_id, txg, 1673 mg, msp, psize, asize, 1674 mg->mg_alloc_failures, msp->ms_weight); 1675 mutex_exit(&mg->mg_lock); 1676 return (-1ULL); 1677 } 1678 1679 /* 1680 * If the selected metaslab is condensing, skip it. 1681 */ 1682 if (msp->ms_condensing) 1683 continue; 1684 1685 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1686 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 1687 break; 1688 1689 target_distance = min_distance + 1690 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 1691 min_distance >> 1); 1692 1693 for (i = 0; i < d; i++) 1694 if (metaslab_distance(msp, &dva[i]) < 1695 target_distance) 1696 break; 1697 if (i == d) 1698 break; 1699 } 1700 mutex_exit(&mg->mg_lock); 1701 if (msp == NULL) 1702 return (-1ULL); 1703 1704 mutex_enter(&msp->ms_lock); 1705 1706 /* 1707 * If we've already reached the allowable number of failed 1708 * allocation attempts on this metaslab group then we 1709 * consider skipping it. We skip it only if we're allowed 1710 * to "fast" gang, the physical size is larger than 1711 * a gang block, and we're attempting to allocate from 1712 * the primary metaslab. 
1713 */ 1714 if (mg->mg_alloc_failures > zfs_mg_alloc_failures && 1715 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && 1716 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1717 spa_dbgmsg(spa, "%s: skipping metaslab group: " 1718 "vdev %llu, txg %llu, mg %p, msp[%llu] %p, " 1719 "psize %llu, asize %llu, failures %llu", 1720 spa_name(spa), mg->mg_vd->vdev_id, txg, mg, 1721 msp->ms_id, msp, psize, asize, 1722 mg->mg_alloc_failures); 1723 mutex_exit(&msp->ms_lock); 1724 return (-1ULL); 1725 } 1726 1727 /* 1728 * Ensure that the metaslab we have selected is still 1729 * capable of handling our request. It's possible that 1730 * another thread may have changed the weight while we 1731 * were blocked on the metaslab lock. 1732 */ 1733 if (msp->ms_weight < asize || (was_active && 1734 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1735 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1736 mutex_exit(&msp->ms_lock); 1737 continue; 1738 } 1739 1740 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1741 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1742 metaslab_passivate(msp, 1743 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1744 mutex_exit(&msp->ms_lock); 1745 continue; 1746 } 1747 1748 if (metaslab_activate(msp, activation_weight) != 0) { 1749 mutex_exit(&msp->ms_lock); 1750 continue; 1751 } 1752 1753 /* 1754 * If this metaslab is currently condensing then pick again as 1755 * we can't manipulate this metaslab until it's committed 1756 * to disk. 1757 */ 1758 if (msp->ms_condensing) { 1759 mutex_exit(&msp->ms_lock); 1760 continue; 1761 } 1762 1763 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 1764 break; 1765 1766 atomic_inc_64(&mg->mg_alloc_failures); 1767 1768 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 1769 mutex_exit(&msp->ms_lock); 1770 } 1771 1772 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 1773 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1774 1775 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 1776 msp->ms_access_txg = txg + metaslab_unload_delay; 1777 1778 mutex_exit(&msp->ms_lock); 1779 1780 return (offset); 1781 } 1782 1783 /* 1784 * Allocate a block for the specified i/o. 1785 */ 1786 static int 1787 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 1788 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 1789 { 1790 metaslab_group_t *mg, *rotor; 1791 vdev_t *vd; 1792 int dshift = 3; 1793 int all_zero; 1794 int zio_lock = B_FALSE; 1795 boolean_t allocatable; 1796 uint64_t offset = -1ULL; 1797 uint64_t asize; 1798 uint64_t distance; 1799 1800 ASSERT(!DVA_IS_VALID(&dva[d])); 1801 1802 /* 1803 * For testing, make some blocks above a certain size be gang blocks. 1804 */ 1805 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 1806 return (SET_ERROR(ENOSPC)); 1807 1808 /* 1809 * Start at the rotor and loop through all mgs until we find something. 1810 * Note that there's no locking on mc_rotor or mc_aliquot because 1811 * nothing actually breaks if we miss a few updates -- we just won't 1812 * allocate quite as evenly. It all balances out over time. 1813 * 1814 * If we are doing ditto or log blocks, try to spread them across 1815 * consecutive vdevs. If we're forced to reuse a vdev before we've 1816 * allocated all of our ditto blocks, then try and spread them out on 1817 * that vdev as much as possible. If it turns out to not be possible, 1818 * gradually lower our standards until anything becomes acceptable. 
		atomic_inc_64(&mg->mg_alloc_failures);

		metaslab_passivate(msp, metaslab_block_maxsize(msp));
		mutex_exit(&msp->ms_lock);
	}

	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
	msp->ms_access_txg = txg + metaslab_unload_delay;

	mutex_exit(&msp->ms_lock);

	return (offset);
}

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (SET_ERROR(ENOSPC));

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly. It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs. If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible. If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about. Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data. With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header. That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}

		/*
		 * Determine if the selected metaslab group is eligible
		 * for allocations. If we're ganging or have requested
		 * an allocation for the smallest gang block size
		 * then we don't want to avoid allocating to this
		 * metaslab group. If we're in this condition we should
		 * try to allocate from any device possible so that we
		 * don't inadvertently return ENOSPC and suspend the pool
		 * even though space is still available.
		 */
		if (allocatable && CAN_FASTGANG(flags) &&
		    psize > SPA_GANGBLOCKSIZE)
			allocatable = metaslab_group_allocatable(mg);

		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 * unless the user instructs us that it is okay.
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3 &&
		    !(zfs_write_to_degraded && vd->vdev_state ==
		    VDEV_STATE_DEGRADED)) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

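		/*
		 * The minimum distance to keep between this DVA and DVAs
		 * of the same block already allocated on this vdev is a
		 * fraction of the vdev's size: vdev_asize >> dshift, i.e.
		 * 1/8 of the vdev on the first pass (dshift == 3). Once
		 * that drops to a single metaslab or less we stop asking
		 * for any separation; all_zero records whether any vdev
		 * in this pass still imposed a nonzero requirement.
		 */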
		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

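	/*
	 * Every metaslab group has been tried at the current distance
	 * requirement. If any vdev still imposed a nonzero distance,
	 * relax the requirement (dshift++) and go around again. Failing
	 * that, if nothing was allocatable, restart the search once more,
	 * this time holding the SCL_ZIO lock around the vdev_allocatable()
	 * checks, before giving up and returning ENOSPC.
	 */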
	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (SET_ERROR(ENOSPC));
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
		    offset, size);

		VERIFY(!msp->ms_condensing);
		VERIFY3U(offset, >=, msp->ms_start);
		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
		    msp->ms_size);
		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		range_tree_add(msp->ms_tree, offset, size);
	} else {
		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
		    offset, size);
	}

	mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error = 0;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (SET_ERROR(ENXIO));

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	VERIFY(!msp->ms_condensing);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
	range_tree_remove(msp->ms_tree, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error != 0) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}

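/*
 * Debugging aid, enabled via the ZFS_DEBUG_ZIO_FREE flag: verify that none
 * of the block's DVAs are already present in the in-core space map or in
 * any freetree or defertree of the owning metaslab, which would suggest
 * the block is being freed twice.
 */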
void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

		if (msp->ms_loaded)
			range_tree_verify(msp->ms_tree, offset, size);

		for (int j = 0; j < TXG_SIZE; j++)
			range_tree_verify(msp->ms_freetree[j], offset, size);
		for (int j = 0; j < TXG_DEFER_SIZE; j++)
			range_tree_verify(msp->ms_defertree[j], offset, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
}