/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2017, Intel Corporation.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Since we can touch multiple metaslabs (and their respective space maps)
 * with each transaction group, we benefit from having a smaller space map
 * block size since it allows us to issue more I/O operations scattered
 * around the disk.
 */
int zfs_metaslab_sm_blksz = (1 << 12);

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
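
/*
 * For example, with zfs_mg_noalloc_threshold set to 5, a metaslab group
 * whose free capacity drops to 5% or less stops receiving new (non-gang)
 * allocations until every group in the pool is at or below 5%, at which
 * point all groups become eligible again.
 */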

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

/*
 * Maximum number of metaslabs per group that can be disabled
 * simultaneously.
 */
int max_disabled_ms = 3;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);

kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < spa->spa_alloc_count; i++)
		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (zfs_refcount_t));
	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (uint64_t));
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	spa_t *spa = mc->mc_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by efi system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = (const metaslab_t *)x1;
	const metaslab_t *m2 = (const metaslab_t *)x2;

	int sort1 = 0;
	int sort2 = 0;
	if (m1->ms_allocator != -1 && m1->ms_primary)
		sort1 = 1;
	else if (m1->ms_allocator != -1 && !m1->ms_primary)
		sort1 = 2;
	if (m2->ms_allocator != -1 && m2->ms_primary)
		sort2 = 1;
	else if (m2->ms_allocator != -1 && !m2->ms_primary)
		sort2 = 2;

	/*
	 * Sort inactive metaslabs first, then primaries, then secondaries. When
	 * selecting a metaslab to allocate from, an allocator first tries its
	 * primary, then secondary active metaslab. If it doesn't have active
	 * metaslabs, or can't allocate from them, it searches for an inactive
	 * metaslab to activate. If it can't find a suitable one, it will steal
	 * a primary or secondary metaslab from another allocator.
	 */
	if (sort1 < sort2)
		return (-1);
	if (sort1 > sort2)
		return (1);

	int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
	if (likely(cmp))
		return (cmp);

	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);

	return (AVL_CMP(m1->ms_start, m2->ms_start));
}

uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
	return (msp->ms_allocated_space);
}
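
/*
 * The class-wide metric computed by metaslab_class_fragmentation() above is
 * a space-weighted average. For example, a class with one group of 2 TB at
 * 60% fragmentation and another group of 1 TB at 30% fragmentation reports
 * (60 * 2 + 30 * 1) / 3 = 50.
 */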

/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocating = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_condensing);

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an
	 * allocated space map. Calling this in non-syncing context
	 * does not provide a consistent view of the metaslab since
	 * we're performing allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	/*
	 * Even though the smp_alloc field can get negative (e.g.
	 * see vdev_checkpoint_sm), that should never be the case
	 * when it comes to a metaslab's space map.
	 */
	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);

	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);

	/*
	 * Account for future allocations since we would have
	 * already deducted that space from the ms_allocatable.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocating +=
		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
	}

	ASSERT3U(msp->ms_deferspace, ==,
	    range_tree_space(msp->ms_defer[0]) +
	    range_tree_space(msp->ms_defer[1]));

	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
	    msp->ms_deferspace + range_tree_space(msp->ms_freed);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the free capacity is above
 * zfs_mg_noalloc_threshold and the fragmentation metric does not exceed
 * zfs_mg_fragmentation_threshold. If a metaslab group transitions from
 * allocatable to non-allocatable or vice versa then the metaslab group's
 * class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	mg->mg_allocators = allocators;

	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < allocators; i++) {
		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}
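
/*
 * Note that metaslab_group_create() above sizes mg_primaries, mg_secondaries
 * and the per-allocator queue-depth arrays by the caller-supplied allocators
 * count; a newly created group starts out unlinked (mg_activation_count == 0)
 * and is only placed on the class rotor by metaslab_group_activate().
 */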

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
	kmem_free(mg->mg_secondaries, mg->mg_allocators *
	    sizeof (metaslab_t *));
	mutex_destroy(&mg->mg_lock);
	mutex_destroy(&mg->mg_ms_disabled_lock);
	cv_destroy(&mg->mg_ms_disabled_cv);

	for (int i = 0; i < mg->mg_allocators; i++) {
		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}
	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
	    sizeof (zfs_refcount_t));
	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
	    sizeof (uint64_t));

	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);
	for (int i = 0; i < mg->mg_allocators; i++) {
		metaslab_t *msp = mg->mg_primaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
		msp = mg->mg_secondaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];
		ASSERT(msp != NULL);

		/* skip if not active or not a member */
		if (msp->ms_sm == NULL || msp->ms_group != mg)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}
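
/*
 * The space map histogram buckets are offset by the vdev's ashift when
 * rolled up into mg_histogram/mc_histogram above; e.g. with ashift == 9,
 * smp_histogram[0] (the smallest on-disk bucket) is accounted in
 * mg_histogram[9].
 */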

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	ASSERT(MUTEX_HELD(&mg->mg_lock));
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	metaslab_group_sort_impl(mg, msp, weight);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs
 * in this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;
		if (msp->ms_group != mg)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}
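
/*
 * For example, a group whose vdev has 200 metaslabs needs more than 100 of
 * them to report a valid ms_fragmentation before
 * metaslab_group_fragmentation() returns an average; otherwise the group
 * metric stays ZFS_FRAG_INVALID.
 */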

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize, int allocator, int d)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if ((mc != spa_normal_class(spa) &&
	    mc != spa_special_class(spa) &&
	    mc != spa_dedup_class(spa)) ||
	    mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		/*
		 * Relax allocation throttling for ditto blocks. Due to
		 * random imbalances in allocation, throttling tends to push
		 * all copies to the one vdev that looks a bit better at
		 * the moment.
		 */
		qmax = qmax * (4 + d) / 4;

		qdepth = zfs_refcount_count(
		    &mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
			qmax = qmax * (4 + d) / 4;
			qdepth = zfs_refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}
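
/*
 * The ditto-block adjustment above scales the queue-depth limit by
 * (4 + d) / 4: with d == 0 the limit is qmax as-is, d == 1 allows up to
 * 125% of qmax, and d == 2 up to 150%.
 */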

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	int cmp = AVL_CMP(rs_size1, rs_size2);
	if (likely(cmp))
		return (cmp);

	return (AVL_CMP(r1->rs_start, r2->rs_start));
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = start;
	rsearch.rs_end = start + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL) {
		rs = avl_nearest(t, where, AVL_AFTER);
	}

	return (rs);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs = metaslab_block_find(t, *cursor, size);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that allocations of other sizes
	 * will not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_allocatable->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};
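
/*
 * For example, a 24K (0x6000) request above yields align = size & -size =
 * 0x2000, so the allocation is satisfied from the 8K-aligned cursor bucket
 * ms_lbas[highbit64(0x2000) - 1] = ms_lbas[13].
 */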

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that allocations of other sizes
	 * will not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_allocatable_by_size;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made, advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;

		rs = avl_last(&msp->ms_allocatable_by_size);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
			return (-1ULL);

		*cursor = rs->rs_start;
		*cursor_end = rs->rs_end;
	}

	offset = *cursor;
	*cursor += size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *t = &msp->ms_allocatable->rt_root;
	avl_index_t where;
	range_seg_t *rs, rsearch;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_size = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		t = &msp->ms_allocatable_by_size;

		rsearch.rs_start = 0;
		rsearch.rs_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = avl_nearest(t, where, AVL_AFTER);
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) >= size) {
		*cursor = rs->rs_start + size;
		return (rs->rs_start);
	}
	return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

static void
metaslab_aux_histograms_clear(metaslab_t *msp)
{
	/*
	 * Auxiliary histograms are only cleared when resetting them,
	 * which can only happen while the metaslab is loaded.
	 */
	ASSERT(msp->ms_loaded);

	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
}

static void
metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
    range_tree_t *rt)
{
	/*
	 * This is modeled after space_map_histogram_add(), so refer to that
	 * function for implementation details. We want this to work like
	 * the space map histogram, and not the range tree histogram, as we
	 * are essentially constructing a delta that will be later subtracted
	 * from the space map histogram.
	 */
	int idx = 0;
	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		ASSERT3U(i, >=, idx + shift);
		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);

		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
			ASSERT3U(idx + shift, ==, i);
			idx++;
			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
		}
	}
}
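
/*
 * For example, with shift == 9, rt_histogram[9] (512-byte segments) is added
 * to histogram[0] unscaled; once idx is pinned at SPACE_MAP_HISTOGRAM_SIZE - 1,
 * each larger range tree bucket is folded into that last slot scaled by
 * 2^(i - idx - shift), roughly preserving the amount of space it represents.
 */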

/*
 * Called at every sync pass that the metaslab gets synced.
 *
 * The reason is that we want our auxiliary histograms to be updated
 * wherever the metaslab's space map histogram is updated. This way
 * we stay consistent on which parts of the metaslab space map's
 * histogram are currently not available for allocations (e.g. because
 * they are in the defer, freed, and freeing trees).
 */
static void
metaslab_aux_histograms_update(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	ASSERT(sm != NULL);

	/*
	 * This is similar to the metaslab's space map histogram updates
	 * that take place in metaslab_sync(). The only difference is that
	 * we only care about segments that haven't made it into the
	 * ms_allocatable tree yet.
	 */
	if (msp->ms_loaded) {
		metaslab_aux_histograms_clear(msp);

		metaslab_aux_histogram_add(msp->ms_synchist,
		    sm->sm_shift, msp->ms_freed);

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			metaslab_aux_histogram_add(msp->ms_deferhist[t],
			    sm->sm_shift, msp->ms_defer[t]);
		}
	}

	metaslab_aux_histogram_add(msp->ms_synchist,
	    sm->sm_shift, msp->ms_freeing);
}

/*
 * Called every time we are done syncing (writing to) the metaslab,
 * i.e. at the end of each sync pass.
 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
 */
static void
metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;

	if (sm == NULL) {
		/*
		 * We came here from metaslab_init() when creating/opening a
		 * pool, looking at a metaslab that hasn't had any allocations
		 * yet.
		 */
		return;
	}

	/*
	 * This is similar to the actions that we take for the ms_freed
	 * and ms_defer trees in metaslab_sync_done().
	 */
	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
	if (defer_allowed) {
		bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
		    sizeof (msp->ms_synchist));
	} else {
		bzero(msp->ms_deferhist[hist_index],
		    sizeof (msp->ms_deferhist[hist_index]));
	}
	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
}
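
/*
 * The hist_index above cycles with the syncing txg; since there are
 * TXG_DEFER_SIZE deferred-free histograms (two, matching the
 * ms_defer[0]/ms_defer[1] trees checked in metaslab_verify_space()),
 * consecutive txgs alternate between ms_deferhist[0] and ms_deferhist[1].
 */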

/*
 * Ensure that the metaslab's weight and fragmentation are consistent
 * with the contents of the histogram (either the range tree's histogram
 * or the space map's, depending on whether the metaslab is loaded).
 */
static void
metaslab_verify_weight_and_frag(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/* see comment in metaslab_verify_unflushed_changes() */
	if (msp->ms_group == NULL)
		return;

	/*
	 * Devices being removed always return a weight of 0 and leave
	 * fragmentation and ms_max_size as is - there is nothing for
	 * us to verify here.
	 */
	vdev_t *vd = msp->ms_group->mg_vd;
	if (vd->vdev_removing)
		return;

	/*
	 * If the metaslab is dirty it probably means that we've done
	 * some allocations or frees that have changed our histograms
	 * and thus the weight.
	 */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&vd->vdev_ms_list, msp, t))
			return;
	}

	/*
	 * This verification checks that our in-memory state is consistent
	 * with what's on disk. If the pool is read-only then there aren't
	 * any changes and we just have the initially-loaded state.
	 */
	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
		return;

	/* some extra verification for in-core tree if you can */
	if (msp->ms_loaded) {
		range_tree_stat_verify(msp->ms_allocatable);
		VERIFY(space_map_histogram_verify(msp->ms_sm,
		    msp->ms_allocatable));
	}

	uint64_t weight = msp->ms_weight;
	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
	uint64_t frag = msp->ms_fragmentation;
	uint64_t max_segsize = msp->ms_max_size;

	msp->ms_weight = 0;
	msp->ms_fragmentation = 0;
	msp->ms_max_size = 0;

	/*
	 * This function is used for verification purposes. Regardless of
	 * whether metaslab_weight() thinks this metaslab should be active or
	 * not, we want to ensure that the actual weight (and therefore the
	 * value of ms_weight) would be the same if it was to be recalculated
	 * at this point.
	 */
	msp->ms_weight = metaslab_weight(msp) | was_active;

	VERIFY3U(max_segsize, ==, msp->ms_max_size);

	/*
	 * If the weight type changed then there is no point in doing
	 * verification. Revert fields to their original values.
	 */
	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
		msp->ms_fragmentation = frag;
		msp->ms_weight = weight;
		return;
	}

	VERIFY3U(msp->ms_fragmentation, ==, frag);
	VERIFY3U(msp->ms_weight, ==, weight);
}

/*
 * Wait for any in-progress metaslab loads to complete.
 */
static void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

static int
metaslab_load_impl(metaslab_t *msp)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loading);
	ASSERT(!msp->ms_condensing);

	/*
	 * We temporarily drop the lock to unblock other operations while we
	 * are reading the space map. Therefore, metaslab_sync() and
	 * metaslab_sync_done() can run at the same time as we do.
	 *
	 * metaslab_sync() can append to the space map while we are loading.
	 * Therefore we load only entries that existed when we started the
	 * load. Additionally, metaslab_sync_done() has to wait for the load
	 * to complete because there are potential races like metaslab_load()
	 * loading parts of the space map that are currently being appended
	 * by metaslab_sync(). If we didn't, the ms_allocatable would have
	 * entries that metaslab_sync_done() would try to re-add later.
	 *
	 * That's why before dropping the lock we remember the synced length
	 * of the metaslab and read up to that point of the space map,
	 * ignoring entries appended by metaslab_sync() that happen after we
	 * drop the lock.
	 */
	uint64_t length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	if (msp->ms_sm != NULL) {
		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
		    SM_FREE, length);
	} else {
		/*
		 * The space map has not been allocated yet, so treat
		 * all the space in the metaslab as free and add it to the
		 * ms_allocatable tree.
		 */
		range_tree_add(msp->ms_allocatable,
		    msp->ms_start, msp->ms_size);
	}

	/*
	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
	 * changing the ms_sm and the metaslab's range trees while we are
	 * about to use them and populate the ms_allocatable. The ms_lock
	 * is insufficient for this because metaslab_sync() doesn't hold
	 * the ms_lock while writing the ms_checkpointing tree to disk.
	 */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);
	ASSERT(!msp->ms_condensing);

	if (error != 0) {
		mutex_exit(&msp->ms_sync_lock);
		return (error);
	}

	ASSERT3P(msp->ms_group, !=, NULL);
	msp->ms_loaded = B_TRUE;

	/*
	 * The ms_allocatable contains the segments that exist in the
	 * ms_defer trees [see ms_synced_length]. Thus we need to remove
	 * them from ms_allocatable as they will be added again in
	 * metaslab_sync_done().
	 */
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defer[t],
		    range_tree_remove, msp->ms_allocatable);
	}

	/*
	 * Call metaslab_recalculate_weight_and_sort() now that the
	 * metaslab is loaded so we get the metaslab's real weight.
	 *
	 * Unless this metaslab was created with older software and
	 * has not yet been converted to use segment-based weight, we
	 * expect the new weight to be better or equal to the weight
	 * that the metaslab had while it was not loaded. This is
	 * because the old weight does not take into account the
	 * consolidation of adjacent segments between TXGs. [see
	 * comment for ms_synchist and ms_deferhist[] for more info]
	 */
	uint64_t weight = msp->ms_weight;
	metaslab_recalculate_weight_and_sort(msp);
	if (!WEIGHT_IS_SPACEBASED(weight))
		ASSERT3U(weight, <=, msp->ms_weight);
	msp->ms_max_size = metaslab_block_maxsize(msp);

	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	metaslab_verify_space(msp, spa_syncing_txg(spa));
	mutex_exit(&msp->ms_sync_lock);

	return (0);
}

int
metaslab_load(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * There may be another thread loading the same metaslab, if that's
	 * the case just wait until the other thread is done and return.
	 */
	metaslab_load_wait(msp);
	if (msp->ms_loaded)
		return (0);
	VERIFY(!msp->ms_loading);
	ASSERT(!msp->ms_condensing);

	msp->ms_loading = B_TRUE;
	int error = metaslab_load_impl(msp);
	msp->ms_loading = B_FALSE;
	cv_broadcast(&msp->ms_load_cv);

	return (error);
}
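
/*
 * Callers of metaslab_load() must hold ms_lock across the call; the typical
 * pattern (used by metaslab_init() below when metaslab_debug_load is set) is:
 *
 *	mutex_enter(&msp->ms_lock);
 *	VERIFY0(metaslab_load(msp));
 *	mutex_exit(&msp->ms_lock);
 */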
This way, the weight reflects the information 1751 * available in-core, whether it is loaded or not. 1752 * 1753 * If ms_group == NULL, it means that we came here from metaslab_fini(), 1754 * at which point it doesn't make sense for us to do the recalculation 1755 * and the sorting. 1756 */ 1757 if (msp->ms_group != NULL) 1758 metaslab_recalculate_weight_and_sort(msp); 1759 } 1760 1761 static void 1762 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 1763 int64_t defer_delta, int64_t space_delta) 1764 { 1765 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 1766 1767 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 1768 ASSERT(vd->vdev_ms_count != 0); 1769 1770 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 1771 vdev_deflated_space(vd, space_delta)); 1772 } 1773 1774 int 1775 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1776 metaslab_t **msp) 1777 { 1778 vdev_t *vd = mg->mg_vd; 1779 spa_t *spa = vd->vdev_spa; 1780 objset_t *mos = spa->spa_meta_objset; 1781 metaslab_t *ms; 1782 int error; 1783 1784 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1785 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1786 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1787 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1788 1789 ms->ms_id = id; 1790 ms->ms_start = id << vd->vdev_ms_shift; 1791 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1792 ms->ms_allocator = -1; 1793 ms->ms_new = B_TRUE; 1794 1795 /* 1796 * We only open space map objects that already exist. All others 1797 * will be opened when we finally allocate an object for them. 1798 * 1799 * Note: 1800 * When called from vdev_expand(), we can't call into the DMU as 1801 * we are holding the spa_config_lock as a writer and we would 1802 * deadlock [see relevant comment in vdev_metaslab_init()]. In 1803 * that case, the object parameter is zero though, so we won't 1804 * call into the DMU. 1805 */ 1806 if (object != 0) { 1807 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1808 ms->ms_size, vd->vdev_ashift); 1809 1810 if (error != 0) { 1811 kmem_free(ms, sizeof (metaslab_t)); 1812 return (error); 1813 } 1814 1815 ASSERT(ms->ms_sm != NULL); 1816 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); 1817 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 1818 } 1819 1820 /* 1821 * We create the ms_allocatable here, but we don't create the 1822 * other range trees until metaslab_sync_done(). This serves 1823 * two purposes: it allows metaslab_sync_done() to detect the 1824 * addition of new space; and for debugging, it ensures that 1825 * we'd data fault on any attempt to use this metaslab before 1826 * it's ready. 1827 */ 1828 ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, 1829 &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); 1830 1831 ms->ms_trim = range_tree_create(NULL, NULL); 1832 1833 metaslab_group_add(mg, ms); 1834 metaslab_set_fragmentation(ms); 1835 1836 /* 1837 * If we're opening an existing pool (txg == 0) or creating 1838 * a new one (txg == TXG_INITIAL), all space is available now. 1839 * If we're adding space to an existing pool, the new space 1840 * does not become available until after this txg has synced. 1841 * The metaslab's weight will also be initialized when we sync 1842 * out this txg. This ensures that we don't attempt to allocate 1843 * from it before we have initialized it completely.
1844 */ 1845 if (txg <= TXG_INITIAL) { 1846 metaslab_sync_done(ms, 0); 1847 metaslab_space_update(vd, mg->mg_class, 1848 metaslab_allocated_space(ms), 0, 0); 1849 } 1850 1851 /* 1852 * If metaslab_debug_load is set and we're initializing a metaslab 1853 * that has an allocated space map object then load the space map 1854 * so that we can verify frees. 1855 */ 1856 if (metaslab_debug_load && ms->ms_sm != NULL) { 1857 mutex_enter(&ms->ms_lock); 1858 VERIFY0(metaslab_load(ms)); 1859 mutex_exit(&ms->ms_lock); 1860 } 1861 1862 if (txg != 0) { 1863 vdev_dirty(vd, 0, NULL, txg); 1864 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1865 } 1866 1867 *msp = ms; 1868 1869 return (0); 1870 } 1871 1872 void 1873 metaslab_fini(metaslab_t *msp) 1874 { 1875 metaslab_group_t *mg = msp->ms_group; 1876 vdev_t *vd = mg->mg_vd; 1877 1878 metaslab_group_remove(mg, msp); 1879 1880 mutex_enter(&msp->ms_lock); 1881 VERIFY(msp->ms_group == NULL); 1882 metaslab_space_update(vd, mg->mg_class, 1883 -metaslab_allocated_space(msp), 0, -msp->ms_size); 1884 1885 space_map_close(msp->ms_sm); 1886 1887 metaslab_unload(msp); 1888 1889 range_tree_destroy(msp->ms_allocatable); 1890 range_tree_destroy(msp->ms_freeing); 1891 range_tree_destroy(msp->ms_freed); 1892 1893 for (int t = 0; t < TXG_SIZE; t++) { 1894 range_tree_destroy(msp->ms_allocating[t]); 1895 } 1896 1897 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1898 range_tree_destroy(msp->ms_defer[t]); 1899 } 1900 ASSERT0(msp->ms_deferspace); 1901 1902 range_tree_destroy(msp->ms_checkpointing); 1903 1904 for (int t = 0; t < TXG_SIZE; t++) 1905 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 1906 1907 range_tree_vacate(msp->ms_trim, NULL, NULL); 1908 range_tree_destroy(msp->ms_trim); 1909 1910 mutex_exit(&msp->ms_lock); 1911 cv_destroy(&msp->ms_load_cv); 1912 mutex_destroy(&msp->ms_lock); 1913 mutex_destroy(&msp->ms_sync_lock); 1914 ASSERT3U(msp->ms_allocator, ==, -1); 1915 1916 kmem_free(msp, sizeof (metaslab_t)); 1917 } 1918 1919 #define FRAGMENTATION_TABLE_SIZE 17 1920 1921 /* 1922 * This table defines a segment size based fragmentation metric that will 1923 * allow each metaslab to derive its own fragmentation value. This is done 1924 * by calculating the space in each bucket of the spacemap histogram and 1925 * multiplying that by the fragmentation metric in this table. Doing 1926 * this for all buckets and dividing it by the total amount of free 1927 * space in this metaslab (i.e. the total free space in all buckets) gives 1928 * us the fragmentation metric. This means that a high fragmentation metric 1929 * equates to most of the free space being comprised of small segments. 1930 * Conversely, if the metric is low, then most of the free space is in 1931 * large segments. A 10% change in fragmentation equates to approximately 1932 * double the number of segments. 1933 * 1934 * This table defines 0% fragmented space using 16MB segments. Testing has 1935 * shown that segments that are greater than or equal to 16MB do not suffer 1936 * from drastic performance problems. Using this value, we derive the rest 1937 * of the table. Since the fragmentation value is never stored on disk, it 1938 * is possible to change these calculations in the future. 
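 *
 * As a purely illustrative sketch (hypothetical numbers, not defaults): with
 * ashift 9, space map histogram bucket i describes segments of roughly
 * 2^(i + 9) bytes, so each bucket contributes
 *
 *	space = smp_histogram[i] << (i + shift);
 *	fragmentation += space * zfs_frag_table[idx];
 *
 * and the sum is divided by the total free space. A metaslab holding 1GB of
 * free space in 128K segments (table value 50) and another 1GB in 8M
 * segments (table value 5) would report a fragmentation of about
 * (50 + 5) / 2 = 27.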
1939 */ 1940 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1941 100, /* 512B */ 1942 100, /* 1K */ 1943 98, /* 2K */ 1944 95, /* 4K */ 1945 90, /* 8K */ 1946 80, /* 16K */ 1947 70, /* 32K */ 1948 60, /* 64K */ 1949 50, /* 128K */ 1950 40, /* 256K */ 1951 30, /* 512K */ 1952 20, /* 1M */ 1953 15, /* 2M */ 1954 10, /* 4M */ 1955 5, /* 8M */ 1956 0 /* 16M */ 1957 }; 1958 1959 /* 1960 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 1961 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 1962 * been upgraded and does not support this metric. Otherwise, the return 1963 * value should be in the range [0, 100]. 1964 */ 1965 static void 1966 metaslab_set_fragmentation(metaslab_t *msp) 1967 { 1968 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1969 uint64_t fragmentation = 0; 1970 uint64_t total = 0; 1971 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1972 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1973 1974 if (!feature_enabled) { 1975 msp->ms_fragmentation = ZFS_FRAG_INVALID; 1976 return; 1977 } 1978 1979 /* 1980 * A null space map means that the entire metaslab is free 1981 * and thus is not fragmented. 1982 */ 1983 if (msp->ms_sm == NULL) { 1984 msp->ms_fragmentation = 0; 1985 return; 1986 } 1987 1988 /* 1989 * If this metaslab's space map has not been upgraded, flag it 1990 * so that we upgrade next time we encounter it. 1991 */ 1992 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1993 uint64_t txg = spa_syncing_txg(spa); 1994 vdev_t *vd = msp->ms_group->mg_vd; 1995 1996 /* 1997 * If we've reached the final dirty txg, then we must 1998 * be shutting down the pool. We don't want to dirty 1999 * any data past this point so skip setting the condense 2000 * flag. We can retry this action the next time the pool 2001 * is imported. 2002 */ 2003 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2004 msp->ms_condense_wanted = B_TRUE; 2005 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2006 zfs_dbgmsg("txg %llu, requesting force condense: " 2007 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 2008 vd->vdev_id); 2009 } 2010 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2011 return; 2012 } 2013 2014 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2015 uint64_t space = 0; 2016 uint8_t shift = msp->ms_sm->sm_shift; 2017 2018 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2019 FRAGMENTATION_TABLE_SIZE - 1); 2020 2021 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2022 continue; 2023 2024 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2025 total += space; 2026 2027 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2028 fragmentation += space * zfs_frag_table[idx]; 2029 } 2030 2031 if (total > 0) 2032 fragmentation /= total; 2033 ASSERT3U(fragmentation, <=, 100); 2034 2035 msp->ms_fragmentation = fragmentation; 2036 } 2037 2038 /* 2039 * Compute a weight -- a selection preference value -- for the given metaslab. 2040 * This is based on the amount of free space, the level of fragmentation, 2041 * the LBA range, and whether the metaslab is loaded. 2042 */ 2043 static uint64_t 2044 metaslab_space_weight(metaslab_t *msp) 2045 { 2046 metaslab_group_t *mg = msp->ms_group; 2047 vdev_t *vd = mg->mg_vd; 2048 uint64_t weight, space; 2049 2050 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2051 ASSERT(!vd->vdev_removing); 2052 2053 /* 2054 * The baseline weight is the metaslab's free space. 
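 * As a rough, illustrative example (hypothetical numbers): a 16GB metaslab
 * with 4GB allocated starts from space = 12GB; at 30% fragmentation the
 * scaling below reduces that to 12GB * 71 / 100, or about 8.5GB, and if
 * this is the outermost metaslab (ms_id 0) of a rotational vdev, the LBA
 * bias below doubles it to roughly 17GB before the activation bits are
 * folded in.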
2055 */ 2056 space = msp->ms_size - metaslab_allocated_space(msp); 2057 2058 if (metaslab_fragmentation_factor_enabled && 2059 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 2060 /* 2061 * Use the fragmentation information to inversely scale 2062 * down the baseline weight. We need to ensure that we 2063 * don't exclude this metaslab completely when it's 100% 2064 * fragmented. To avoid this, we reduce the fragmentation value 2065 * by 1. 2066 */ 2067 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 2068 2069 /* 2070 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 2071 * this metaslab again. The fragmentation metric may have 2072 * decreased the space to something smaller than 2073 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 2074 * so that we can consume any remaining space. 2075 */ 2076 if (space > 0 && space < SPA_MINBLOCKSIZE) 2077 space = SPA_MINBLOCKSIZE; 2078 } 2079 weight = space; 2080 2081 /* 2082 * Modern disks have uniform bit density and constant angular velocity. 2083 * Therefore, the outer recording zones are faster (higher bandwidth) 2084 * than the inner zones by the ratio of outer to inner track diameter, 2085 * which is typically around 2:1. We account for this by assigning 2086 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 2087 * In effect, this means that we'll select the metaslab with the most 2088 * free bandwidth rather than simply the one with the most free space. 2089 */ 2090 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 2091 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 2092 ASSERT(weight >= space && weight <= 2 * space); 2093 } 2094 2095 /* 2096 * If this metaslab is one we're actively using, adjust its 2097 * weight to make it preferable to any inactive metaslab so 2098 * we'll polish it off. If the fragmentation on this metaslab 2099 * has exceeded our threshold, then don't mark it active. 2100 */ 2101 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 2102 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 2103 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 2104 } 2105 2106 WEIGHT_SET_SPACEBASED(weight); 2107 return (weight); 2108 } 2109 2110 /* 2111 * Return the weight of the specified metaslab, according to the segment-based 2112 * weighting algorithm. The metaslab must be loaded. This function can 2113 * be called within a sync pass since it relies only on the metaslab's 2114 * range tree which is always accurate when the metaslab is loaded. 2115 */ 2116 static uint64_t 2117 metaslab_weight_from_range_tree(metaslab_t *msp) 2118 { 2119 uint64_t weight = 0; 2120 uint32_t segments = 0; 2121 2122 ASSERT(msp->ms_loaded); 2123 2124 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 2125 i--) { 2126 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 2127 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2128 2129 segments <<= 1; 2130 segments += msp->ms_allocatable->rt_histogram[i]; 2131 2132 /* 2133 * The range tree provides more precision than the space map 2134 * and must be downgraded so that all values fit within the 2135 * space map's histogram. This allows us to compare loaded 2136 * vs. unloaded metaslabs to determine which metaslab is 2137 * considered "best".
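 * For example (illustrative only): if the largest free segments fall in
 * the 2^20 (1M) bucket and there are two of them, the loop stops at
 * i = 20 and encodes count = 2, index = 20, which is directly comparable
 * to the weight an unloaded metaslab would get from a space map histogram
 * whose top non-empty bucket covers the same segment size.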
2138 */ 2139 if (i > max_idx) 2140 continue; 2141 2142 if (segments != 0) { 2143 WEIGHT_SET_COUNT(weight, segments); 2144 WEIGHT_SET_INDEX(weight, i); 2145 WEIGHT_SET_ACTIVE(weight, 0); 2146 break; 2147 } 2148 } 2149 return (weight); 2150 } 2151 2152 /* 2153 * Calculate the weight based on the on-disk histogram. This should only 2154 * be called after a sync pass has completely finished since the on-disk 2155 * information is updated in metaslab_sync(). 2156 */ 2157 static uint64_t 2158 metaslab_weight_from_spacemap(metaslab_t *msp) 2159 { 2160 space_map_t *sm = msp->ms_sm; 2161 ASSERT(!msp->ms_loaded); 2162 ASSERT(sm != NULL); 2163 ASSERT3U(space_map_object(sm), !=, 0); 2164 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2165 2166 /* 2167 * Create a joint histogram from all the segments that have made 2168 * it to the metaslab's space map histogram, that are not yet 2169 * available for allocation because they are still in the freeing 2170 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 2171 * these segments from the space map's histogram to get a more 2172 * accurate weight. 2173 */ 2174 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 2175 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 2176 deferspace_histogram[i] += msp->ms_synchist[i]; 2177 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2178 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2179 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 2180 } 2181 } 2182 2183 uint64_t weight = 0; 2184 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 2185 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 2186 deferspace_histogram[i]); 2187 uint64_t count = 2188 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 2189 if (count != 0) { 2190 WEIGHT_SET_COUNT(weight, count); 2191 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 2192 WEIGHT_SET_ACTIVE(weight, 0); 2193 break; 2194 } 2195 } 2196 return (weight); 2197 } 2198 2199 /* 2200 * Compute a segment-based weight for the specified metaslab. The weight 2201 * is determined by highest bucket in the histogram. The information 2202 * for the highest bucket is encoded into the weight value. 2203 */ 2204 static uint64_t 2205 metaslab_segment_weight(metaslab_t *msp) 2206 { 2207 metaslab_group_t *mg = msp->ms_group; 2208 uint64_t weight = 0; 2209 uint8_t shift = mg->mg_vd->vdev_ashift; 2210 2211 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2212 2213 /* 2214 * The metaslab is completely free. 2215 */ 2216 if (metaslab_allocated_space(msp) == 0) { 2217 int idx = highbit64(msp->ms_size) - 1; 2218 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2219 2220 if (idx < max_idx) { 2221 WEIGHT_SET_COUNT(weight, 1ULL); 2222 WEIGHT_SET_INDEX(weight, idx); 2223 } else { 2224 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 2225 WEIGHT_SET_INDEX(weight, max_idx); 2226 } 2227 WEIGHT_SET_ACTIVE(weight, 0); 2228 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 2229 2230 return (weight); 2231 } 2232 2233 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2234 2235 /* 2236 * If the metaslab is fully allocated then just make the weight 0. 2237 */ 2238 if (metaslab_allocated_space(msp) == msp->ms_size) 2239 return (0); 2240 /* 2241 * If the metaslab is already loaded, then use the range tree to 2242 * determine the weight. Otherwise, we rely on the space map information 2243 * to generate the weight. 
2244 */ 2245 if (msp->ms_loaded) { 2246 weight = metaslab_weight_from_range_tree(msp); 2247 } else { 2248 weight = metaslab_weight_from_spacemap(msp); 2249 } 2250 2251 /* 2252 * If the metaslab was active the last time we calculated its weight 2253 * then keep it active. We want to consume the entire region that 2254 * is associated with this weight. 2255 */ 2256 if (msp->ms_activation_weight != 0 && weight != 0) 2257 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 2258 return (weight); 2259 } 2260 2261 /* 2262 * Determine if we should attempt to allocate from this metaslab. If the 2263 * metaslab has a maximum size then we can quickly determine if the desired 2264 * allocation size can be satisfied. Otherwise, if we're using segment-based 2265 * weighting then we can determine the maximum allocation that this metaslab 2266 * can accommodate based on the index encoded in the weight. If we're using 2267 * space-based weights then rely on the entire weight (excluding the weight 2268 * type bit). 2269 */ 2270 boolean_t 2271 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 2272 { 2273 boolean_t should_allocate; 2274 2275 if (msp->ms_max_size != 0) 2276 return (msp->ms_max_size >= asize); 2277 2278 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2279 /* 2280 * The metaslab segment weight indicates segments in the 2281 * range [2^i, 2^(i+1)), where i is the index in the weight. 2282 * Since the asize might be in the middle of the range, we 2283 * should attempt the allocation if asize < 2^(i+1). 2284 */ 2285 should_allocate = (asize < 2286 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2287 } else { 2288 should_allocate = (asize <= 2289 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2290 } 2291 return (should_allocate); 2292 } 2293 2294 static uint64_t 2295 metaslab_weight(metaslab_t *msp) 2296 { 2297 vdev_t *vd = msp->ms_group->mg_vd; 2298 spa_t *spa = vd->vdev_spa; 2299 uint64_t weight; 2300 2301 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2302 2303 /* 2304 * If this vdev is in the process of being removed, there is nothing 2305 * for us to do here. 2306 */ 2307 if (vd->vdev_removing) 2308 return (0); 2309 2310 metaslab_set_fragmentation(msp); 2311 2312 /* 2313 * Update the maximum size if the metaslab is loaded. This will 2314 * ensure that we get an accurate maximum size if newly freed space 2315 * has been added back into the free tree. 2316 */ 2317 if (msp->ms_loaded) 2318 msp->ms_max_size = metaslab_block_maxsize(msp); 2319 else 2320 ASSERT0(msp->ms_max_size); 2321 2322 /* 2323 * Segment-based weighting requires space map histogram support. 2324 */ 2325 if (zfs_metaslab_segment_weight_enabled && 2326 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2327 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2328 sizeof (space_map_phys_t))) { 2329 weight = metaslab_segment_weight(msp); 2330 } else { 2331 weight = metaslab_space_weight(msp); 2332 } 2333 return (weight); 2334 } 2335 2336 void 2337 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 2338 { 2339 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 2340 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2341 metaslab_group_sort(msp->ms_group, msp, 2342 metaslab_weight(msp) | was_active); 2343 } 2344 2345 static int 2346 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2347 int allocator, uint64_t activation_weight) 2348 { 2349 /* 2350 * If we're activating for the claim code, we don't want to actually 2351 * set the metaslab up for a specific allocator. 
2352 */ 2353 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2354 return (0); 2355 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2356 mg->mg_primaries : mg->mg_secondaries); 2357 2358 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2359 mutex_enter(&mg->mg_lock); 2360 if (arr[allocator] != NULL) { 2361 mutex_exit(&mg->mg_lock); 2362 return (EEXIST); 2363 } 2364 2365 arr[allocator] = msp; 2366 ASSERT3S(msp->ms_allocator, ==, -1); 2367 msp->ms_allocator = allocator; 2368 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2369 mutex_exit(&mg->mg_lock); 2370 2371 return (0); 2372 } 2373 2374 static int 2375 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2376 { 2377 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2378 2379 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2380 int error = metaslab_load(msp); 2381 if (error != 0) { 2382 metaslab_group_sort(msp->ms_group, msp, 0); 2383 return (error); 2384 } 2385 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2386 /* 2387 * The metaslab was activated for another allocator 2388 * while we were waiting, we should reselect. 2389 */ 2390 return (EBUSY); 2391 } 2392 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2393 allocator, activation_weight)) != 0) { 2394 return (error); 2395 } 2396 2397 msp->ms_activation_weight = msp->ms_weight; 2398 metaslab_group_sort(msp->ms_group, msp, 2399 msp->ms_weight | activation_weight); 2400 } 2401 ASSERT(msp->ms_loaded); 2402 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2403 2404 return (0); 2405 } 2406 2407 static void 2408 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2409 uint64_t weight) 2410 { 2411 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2412 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2413 metaslab_group_sort(mg, msp, weight); 2414 return; 2415 } 2416 2417 mutex_enter(&mg->mg_lock); 2418 ASSERT3P(msp->ms_group, ==, mg); 2419 if (msp->ms_primary) { 2420 ASSERT3U(0, <=, msp->ms_allocator); 2421 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2422 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2423 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2424 mg->mg_primaries[msp->ms_allocator] = NULL; 2425 } else { 2426 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2427 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2428 mg->mg_secondaries[msp->ms_allocator] = NULL; 2429 } 2430 msp->ms_allocator = -1; 2431 metaslab_group_sort_impl(mg, msp, weight); 2432 mutex_exit(&mg->mg_lock); 2433 } 2434 2435 static void 2436 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2437 { 2438 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2439 2440 /* 2441 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2442 * this metaslab again. In that case, it had better be empty, 2443 * or we would be leaving space on the table. 2444 */ 2445 ASSERT(size >= SPA_MINBLOCKSIZE || 2446 range_tree_is_empty(msp->ms_allocatable)); 2447 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2448 2449 msp->ms_activation_weight = 0; 2450 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2451 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2452 } 2453 2454 /* 2455 * Segment-based metaslabs are activated once and remain active until 2456 * we either fail an allocation attempt (similar to space-based metaslabs) 2457 * or have exhausted the free space in zfs_metaslab_switch_threshold 2458 * buckets since the metaslab was activated. 
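 * (For instance, with a hypothetical zfs_metaslab_switch_threshold of 2,
 * a metaslab activated while its largest free segments were in the 2^23
 * (8M) bucket would be passivated once its highest non-empty bucket drops
 * to 2^21 (2M) or below.)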
This function checks to see 2459 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the 2460 * metaslab and passivates it proactively. This will allow us to select a 2461 * metaslab with a larger contiguous region, if any remains within this 2462 * metaslab group. If we're in sync pass > 1, then we continue using this 2463 * metaslab so that we don't dirty more blocks and cause more sync passes. 2464 */ 2465 void 2466 metaslab_segment_may_passivate(metaslab_t *msp) 2467 { 2468 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2469 2470 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2471 return; 2472 2473 /* 2474 * Since we are in the middle of a sync pass, the most accurate 2475 * information that is accessible to us is the in-core range tree 2476 * histogram; calculate the new weight based on that information. 2477 */ 2478 uint64_t weight = metaslab_weight_from_range_tree(msp); 2479 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2480 int current_idx = WEIGHT_GET_INDEX(weight); 2481 2482 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2483 metaslab_passivate(msp, weight); 2484 } 2485 2486 static void 2487 metaslab_preload(void *arg) 2488 { 2489 metaslab_t *msp = arg; 2490 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2491 2492 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2493 2494 mutex_enter(&msp->ms_lock); 2495 (void) metaslab_load(msp); 2496 msp->ms_selected_txg = spa_syncing_txg(spa); 2497 mutex_exit(&msp->ms_lock); 2498 } 2499 2500 static void 2501 metaslab_group_preload(metaslab_group_t *mg) 2502 { 2503 spa_t *spa = mg->mg_vd->vdev_spa; 2504 metaslab_t *msp; 2505 avl_tree_t *t = &mg->mg_metaslab_tree; 2506 int m = 0; 2507 2508 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2509 taskq_wait(mg->mg_taskq); 2510 return; 2511 } 2512 2513 mutex_enter(&mg->mg_lock); 2514 2515 /* 2516 * Load the next potential metaslabs 2517 */ 2518 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2519 ASSERT3P(msp->ms_group, ==, mg); 2520 2521 /* 2522 * We preload only the maximum number of metaslabs specified 2523 * by metaslab_preload_limit. If a metaslab is being forced 2524 * to condense then we preload it too. This will ensure 2525 * that force condensing happens in the next txg. 2526 */ 2527 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2528 continue; 2529 } 2530 2531 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2532 msp, TQ_SLEEP) != TASKQID_INVALID); 2533 } 2534 mutex_exit(&mg->mg_lock); 2535 } 2536 2537 /* 2538 * Determine if the space map's on-disk footprint is past our tolerance 2539 * for inefficiency. We would like to use the following criteria to make 2540 * our decision: 2541 * 2542 * 1. The size of the space map object should not dramatically increase as a 2543 * result of writing out the free space range tree. 2544 * 2545 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2546 * times the size of the free space range tree representation 2547 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2548 * 2549 * 3. The on-disk size of the space map should actually decrease. 2550 * 2551 * Unfortunately, we cannot compute the on-disk size of the space map in this 2552 * context because we cannot accurately compute the effects of compression, etc. 
2553 * Instead, we apply the heuristic described in the block comment for 2554 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2555 * is greater than a threshold number of blocks. 2556 */ 2557 static boolean_t 2558 metaslab_should_condense(metaslab_t *msp) 2559 { 2560 space_map_t *sm = msp->ms_sm; 2561 vdev_t *vd = msp->ms_group->mg_vd; 2562 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2563 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2564 2565 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2566 ASSERT(msp->ms_loaded); 2567 2568 /* 2569 * Allocations and frees in early passes are generally more space 2570 * efficient (in terms of blocks described in space map entries) 2571 * than the ones in later passes (e.g. we don't compress after 2572 * sync pass 5) and condensing a metaslab multiple times in a txg 2573 * could degrade performance. 2574 * 2575 * Thus we prefer condensing each metaslab at most once every txg at 2576 * the earliest sync pass possible. If a metaslab is eligible for 2577 * condensing again after being considered for condensing within the 2578 * same txg, it will hopefully be dirty in the next txg where it will 2579 * be condensed at an earlier pass. 2580 */ 2581 if (msp->ms_condense_checked_txg == current_txg) 2582 return (B_FALSE); 2583 msp->ms_condense_checked_txg = current_txg; 2584 2585 /* 2586 * We always condense metaslabs that are empty and metaslabs for 2587 * which a condense request has been made. 2588 */ 2589 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2590 msp->ms_condense_wanted) 2591 return (B_TRUE); 2592 2593 uint64_t object_size = space_map_length(msp->ms_sm); 2594 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2595 msp->ms_allocatable, SM_NO_VDEVID); 2596 2597 dmu_object_info_t doi; 2598 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2599 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2600 2601 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2602 object_size > zfs_metaslab_condense_block_threshold * record_size); 2603 } 2604 2605 /* 2606 * Condense the on-disk space map representation to its minimized form. 2607 * The minimized form consists of a small number of allocations followed by 2608 * the entries of the free range tree. 2609 */ 2610 static void 2611 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2612 { 2613 range_tree_t *condense_tree; 2614 space_map_t *sm = msp->ms_sm; 2615 2616 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2617 ASSERT(msp->ms_loaded); 2618 2619 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2620 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2621 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2622 msp->ms_group->mg_vd->vdev_spa->spa_name, 2623 space_map_length(msp->ms_sm), 2624 avl_numnodes(&msp->ms_allocatable->rt_root), 2625 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2626 2627 msp->ms_condense_wanted = B_FALSE; 2628 2629 /* 2630 * Create a range tree that is 100% allocated. We remove segments 2631 * that have been freed in this txg, any deferred frees that exist, 2632 * and any allocations in the future. Removing segments should be 2633 * a relatively inexpensive operation since we expect these trees to 2634 * have a small number of nodes.
2635 */ 2636 condense_tree = range_tree_create(NULL, NULL); 2637 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2638 2639 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2640 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2641 2642 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2643 range_tree_walk(msp->ms_defer[t], 2644 range_tree_remove, condense_tree); 2645 } 2646 2647 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2648 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2649 range_tree_remove, condense_tree); 2650 } 2651 2652 /* 2653 * We're about to drop the metaslab's lock, thus allowing 2654 * other consumers to change its contents. Set the 2655 * metaslab's ms_condensing flag to ensure that 2656 * allocations on this metaslab do not occur while we're 2657 * in the middle of committing it to disk. This is only critical 2658 * for ms_allocatable as all other range trees use per txg 2659 * views of their content. 2660 */ 2661 msp->ms_condensing = B_TRUE; 2662 2663 mutex_exit(&msp->ms_lock); 2664 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2665 2666 /* 2667 * While we would ideally like to create a space map representation 2668 * that consists only of allocation records, doing so can be 2669 * prohibitively expensive because the in-core free tree can be 2670 * large, and therefore computationally expensive to subtract 2671 * from the condense_tree. Instead we sync out two trees, a cheap 2672 * allocation only tree followed by the in-core free tree. While not 2673 * optimal, this is typically close to optimal, and much cheaper to 2674 * compute. 2675 */ 2676 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2677 range_tree_vacate(condense_tree, NULL, NULL); 2678 range_tree_destroy(condense_tree); 2679 2680 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2681 mutex_enter(&msp->ms_lock); 2682 msp->ms_condensing = B_FALSE; 2683 } 2684 2685 /* 2686 * Write a metaslab to disk in the context of the specified transaction group. 2687 */ 2688 void 2689 metaslab_sync(metaslab_t *msp, uint64_t txg) 2690 { 2691 metaslab_group_t *mg = msp->ms_group; 2692 vdev_t *vd = mg->mg_vd; 2693 spa_t *spa = vd->vdev_spa; 2694 objset_t *mos = spa_meta_objset(spa); 2695 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2696 dmu_tx_t *tx; 2697 uint64_t object = space_map_object(msp->ms_sm); 2698 2699 ASSERT(!vd->vdev_ishole); 2700 2701 /* 2702 * This metaslab has just been added so there's no work to do now. 2703 */ 2704 if (msp->ms_freeing == NULL) { 2705 ASSERT3P(alloctree, ==, NULL); 2706 return; 2707 } 2708 2709 ASSERT3P(alloctree, !=, NULL); 2710 ASSERT3P(msp->ms_freeing, !=, NULL); 2711 ASSERT3P(msp->ms_freed, !=, NULL); 2712 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2713 ASSERT3P(msp->ms_trim, !=, NULL); 2714 2715 /* 2716 * Normally, we don't want to process a metaslab if there are no 2717 * allocations or frees to perform. However, if the metaslab is being 2718 * forced to condense and it's loaded, we need to let it through. 2719 */ 2720 if (range_tree_is_empty(alloctree) && 2721 range_tree_is_empty(msp->ms_freeing) && 2722 range_tree_is_empty(msp->ms_checkpointing) && 2723 !(msp->ms_loaded && msp->ms_condense_wanted)) 2724 return; 2725 2726 2727 VERIFY(txg <= spa_final_dirty_txg(spa)); 2728 2729 /* 2730 * The only state that can actually be changing concurrently 2731 * with metaslab_sync() is the metaslab's ms_allocatable.
No 2732 * other thread can be modifying this txg's alloc, freeing, 2733 * freed, or space_map_phys_t. We drop ms_lock whenever we 2734 * could call into the DMU, because the DMU can call down to 2735 * us (e.g. via zio_free()) at any time. 2736 * 2737 * The spa_vdev_remove_thread() can be reading metaslab state 2738 * concurrently, and it is locked out by the ms_sync_lock. 2739 * Note that the ms_lock is insufficient for this, because it 2740 * is dropped by space_map_write(). 2741 */ 2742 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2743 2744 if (msp->ms_sm == NULL) { 2745 uint64_t new_object; 2746 2747 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2748 VERIFY3U(new_object, !=, 0); 2749 2750 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2751 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2752 2753 ASSERT(msp->ms_sm != NULL); 2754 ASSERT0(metaslab_allocated_space(msp)); 2755 } 2756 2757 if (!range_tree_is_empty(msp->ms_checkpointing) && 2758 vd->vdev_checkpoint_sm == NULL) { 2759 ASSERT(spa_has_checkpoint(spa)); 2760 2761 uint64_t new_object = space_map_alloc(mos, 2762 vdev_standard_sm_blksz, tx); 2763 VERIFY3U(new_object, !=, 0); 2764 2765 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2766 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2767 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2768 2769 /* 2770 * We save the space map object as an entry in vdev_top_zap 2771 * so it can be retrieved when the pool is reopened after an 2772 * export or through zdb. 2773 */ 2774 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2775 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2776 sizeof (new_object), 1, &new_object, tx)); 2777 } 2778 2779 mutex_enter(&msp->ms_sync_lock); 2780 mutex_enter(&msp->ms_lock); 2781 2782 /* 2783 * Note: metaslab_condense() clears the space map's histogram. 2784 * Therefore we must verify and remove this histogram before 2785 * condensing. 2786 */ 2787 metaslab_group_histogram_verify(mg); 2788 metaslab_class_histogram_verify(mg->mg_class); 2789 metaslab_group_histogram_remove(mg, msp); 2790 2791 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2792 metaslab_condense(msp, txg, tx); 2793 } else { 2794 mutex_exit(&msp->ms_lock); 2795 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2796 SM_NO_VDEVID, tx); 2797 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2798 SM_NO_VDEVID, tx); 2799 mutex_enter(&msp->ms_lock); 2800 } 2801 2802 msp->ms_allocated_space += range_tree_space(alloctree); 2803 ASSERT3U(msp->ms_allocated_space, >=, 2804 range_tree_space(msp->ms_freeing)); 2805 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 2806 2807 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2808 ASSERT(spa_has_checkpoint(spa)); 2809 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2810 2811 /* 2812 * Since we are doing writes to disk and the ms_checkpointing 2813 * tree won't be changing during that time, we drop the 2814 * ms_lock while writing to the checkpoint space map. 
2815 */ 2816 mutex_exit(&msp->ms_lock); 2817 space_map_write(vd->vdev_checkpoint_sm, 2818 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2819 mutex_enter(&msp->ms_lock); 2820 2821 spa->spa_checkpoint_info.sci_dspace += 2822 range_tree_space(msp->ms_checkpointing); 2823 vd->vdev_stat.vs_checkpoint_space += 2824 range_tree_space(msp->ms_checkpointing); 2825 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2826 -space_map_allocated(vd->vdev_checkpoint_sm)); 2827 2828 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2829 } 2830 2831 if (msp->ms_loaded) { 2832 /* 2833 * When the space map is loaded, we have an accurate 2834 * histogram in the range tree. This gives us an opportunity 2835 * to bring the space map's histogram up-to-date so we clear 2836 * it first before updating it. 2837 */ 2838 space_map_histogram_clear(msp->ms_sm); 2839 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2840 2841 /* 2842 * Since we've cleared the histogram we need to add back 2843 * any free space that has already been processed, plus 2844 * any deferred space. This allows the on-disk histogram 2845 * to accurately reflect all free space even if some space 2846 * is not yet available for allocation (i.e. deferred). 2847 */ 2848 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2849 2850 /* 2851 * Add back any deferred free space that has not been 2852 * added back into the in-core free tree yet. This will 2853 * ensure that we don't end up with a space map histogram 2854 * that is completely empty unless the metaslab is fully 2855 * allocated. 2856 */ 2857 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2858 space_map_histogram_add(msp->ms_sm, 2859 msp->ms_defer[t], tx); 2860 } 2861 } 2862 2863 /* 2864 * Always add the free space from this sync pass to the space 2865 * map histogram. We want to make sure that the on-disk histogram 2866 * accounts for all free space. If the space map is not loaded, 2867 * then we will lose some accuracy but will correct it the next 2868 * time we load the space map. 2869 */ 2870 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2871 metaslab_aux_histograms_update(msp); 2872 2873 metaslab_group_histogram_add(mg, msp); 2874 metaslab_group_histogram_verify(mg); 2875 metaslab_class_histogram_verify(mg->mg_class); 2876 2877 /* 2878 * For sync pass 1, we avoid traversing this txg's free range tree 2879 * and instead will just swap the pointers for freeing and freed. 2880 * We can safely do this since the freed_tree is guaranteed to be 2881 * empty on the initial pass. 
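 * On later passes, ms_freed already holds the frees from pass 1, so we
 * merge this pass's frees into it instead of swapping.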
2882 */ 2883 if (spa_sync_pass(spa) == 1) { 2884 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2885 ASSERT0(msp->ms_allocated_this_txg); 2886 } else { 2887 range_tree_vacate(msp->ms_freeing, 2888 range_tree_add, msp->ms_freed); 2889 } 2890 msp->ms_allocated_this_txg += range_tree_space(alloctree); 2891 range_tree_vacate(alloctree, NULL, NULL); 2892 2893 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2894 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2895 & TXG_MASK])); 2896 ASSERT0(range_tree_space(msp->ms_freeing)); 2897 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2898 2899 mutex_exit(&msp->ms_lock); 2900 2901 if (object != space_map_object(msp->ms_sm)) { 2902 object = space_map_object(msp->ms_sm); 2903 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2904 msp->ms_id, sizeof (uint64_t), &object, tx); 2905 } 2906 mutex_exit(&msp->ms_sync_lock); 2907 dmu_tx_commit(tx); 2908 } 2909 2910 /* 2911 * Called after a transaction group has completely synced to mark 2912 * all of the metaslab's free space as usable. 2913 */ 2914 void 2915 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2916 { 2917 metaslab_group_t *mg = msp->ms_group; 2918 vdev_t *vd = mg->mg_vd; 2919 spa_t *spa = vd->vdev_spa; 2920 range_tree_t **defer_tree; 2921 int64_t alloc_delta, defer_delta; 2922 boolean_t defer_allowed = B_TRUE; 2923 2924 ASSERT(!vd->vdev_ishole); 2925 2926 mutex_enter(&msp->ms_lock); 2927 2928 /* 2929 * If this metaslab is just becoming available, initialize its 2930 * range trees and add its capacity to the vdev. 2931 */ 2932 if (msp->ms_freed == NULL) { 2933 for (int t = 0; t < TXG_SIZE; t++) { 2934 ASSERT(msp->ms_allocating[t] == NULL); 2935 2936 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2937 } 2938 2939 ASSERT3P(msp->ms_freeing, ==, NULL); 2940 msp->ms_freeing = range_tree_create(NULL, NULL); 2941 2942 ASSERT3P(msp->ms_freed, ==, NULL); 2943 msp->ms_freed = range_tree_create(NULL, NULL); 2944 2945 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2946 ASSERT(msp->ms_defer[t] == NULL); 2947 2948 msp->ms_defer[t] = range_tree_create(NULL, NULL); 2949 } 2950 2951 ASSERT3P(msp->ms_checkpointing, ==, NULL); 2952 msp->ms_checkpointing = range_tree_create(NULL, NULL); 2953 2954 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 2955 } 2956 ASSERT0(range_tree_space(msp->ms_freeing)); 2957 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2958 2959 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2960 2961 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2962 metaslab_class_get_alloc(spa_normal_class(spa)); 2963 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2964 defer_allowed = B_FALSE; 2965 } 2966 2967 defer_delta = 0; 2968 alloc_delta = msp->ms_allocated_this_txg - 2969 range_tree_space(msp->ms_freed); 2970 if (defer_allowed) { 2971 defer_delta = range_tree_space(msp->ms_freed) - 2972 range_tree_space(*defer_tree); 2973 } else { 2974 defer_delta -= range_tree_space(*defer_tree); 2975 } 2976 2977 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 2978 defer_delta, 0); 2979 2980 /* 2981 * If there's a metaslab_load() in progress, wait for it to complete 2982 * so that we have a consistent view of the in-core space map. 2983 */ 2984 metaslab_load_wait(msp); 2985 2986 /* 2987 * When auto-trimming is enabled, free ranges which are added to 2988 * ms_allocatable are also added to ms_trim.
The ms_trim tree is 2989 * periodically consumed by the vdev_autotrim_thread() which issues 2990 * trims for all ranges and then vacates the tree. The ms_trim tree 2991 * can be discarded at any time with the sole consequence of recent 2992 * frees not being trimmed. 2993 */ 2994 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { 2995 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); 2996 if (!defer_allowed) { 2997 range_tree_walk(msp->ms_freed, range_tree_add, 2998 msp->ms_trim); 2999 } 3000 } else { 3001 range_tree_vacate(msp->ms_trim, NULL, NULL); 3002 } 3003 3004 /* 3005 * Move the frees from the defer_tree back to the free 3006 * range tree (if it's loaded). Swap the freed_tree and 3007 * the defer_tree -- this is safe to do because we've 3008 * just emptied out the defer_tree. 3009 */ 3010 range_tree_vacate(*defer_tree, 3011 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 3012 if (defer_allowed) { 3013 range_tree_swap(&msp->ms_freed, defer_tree); 3014 } else { 3015 range_tree_vacate(msp->ms_freed, 3016 msp->ms_loaded ? range_tree_add : NULL, 3017 msp->ms_allocatable); 3018 } 3019 3020 msp->ms_synced_length = space_map_length(msp->ms_sm); 3021 3022 msp->ms_deferspace += defer_delta; 3023 ASSERT3S(msp->ms_deferspace, >=, 0); 3024 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 3025 if (msp->ms_deferspace != 0) { 3026 /* 3027 * Keep syncing this metaslab until all deferred frees 3028 * are back in circulation. 3029 */ 3030 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 3031 } 3032 metaslab_aux_histograms_update_done(msp, defer_allowed); 3033 3034 if (msp->ms_new) { 3035 msp->ms_new = B_FALSE; 3036 mutex_enter(&mg->mg_lock); 3037 mg->mg_ms_ready++; 3038 mutex_exit(&mg->mg_lock); 3039 } 3040 3041 /* 3042 * Re-sort metaslab within its group now that we've adjusted 3043 * its allocatable space. 3044 */ 3045 metaslab_recalculate_weight_and_sort(msp); 3046 3047 /* 3048 * If the metaslab is loaded and we've not tried to load or allocate 3049 * from it in 'metaslab_unload_delay' txgs, then unload it. 3050 */ 3051 if (msp->ms_loaded && 3052 msp->ms_disabled == 0 && 3053 msp->ms_selected_txg + metaslab_unload_delay < txg) { 3054 3055 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 3056 VERIFY0(range_tree_space( 3057 msp->ms_allocating[(txg + t) & TXG_MASK])); 3058 } 3059 if (msp->ms_allocator != -1) { 3060 metaslab_passivate(msp, msp->ms_weight & 3061 ~METASLAB_ACTIVE_MASK); 3062 } 3063 3064 if (!metaslab_debug_unload) 3065 metaslab_unload(msp); 3066 } 3067 3068 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3069 ASSERT0(range_tree_space(msp->ms_freeing)); 3070 ASSERT0(range_tree_space(msp->ms_freed)); 3071 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3072 3073 msp->ms_allocated_this_txg = 0; 3074 mutex_exit(&msp->ms_lock); 3075 } 3076 3077 void 3078 metaslab_sync_reassess(metaslab_group_t *mg) 3079 { 3080 spa_t *spa = mg->mg_class->mc_spa; 3081 3082 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3083 metaslab_group_alloc_update(mg); 3084 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 3085 3086 /* 3087 * Preload the next potential metaslabs but only on active 3088 * metaslab groups. We can get into a state where the metaslab 3089 * is no longer active since we dirty metaslabs as we remove a 3090 * device, thus potentially making the metaslab group eligible 3091 * for preloading.
3092 */ 3093 if (mg->mg_activation_count > 0) { 3094 metaslab_group_preload(mg); 3095 } 3096 spa_config_exit(spa, SCL_ALLOC, FTAG); 3097 } 3098 3099 /* 3100 * When writing a ditto block (i.e. more than one DVA for a given BP) on 3101 * the same vdev as an existing DVA of this BP, try to allocate it 3102 * on a different metaslab than the existing DVAs (i.e. a unique metaslab). 3103 */ 3104 static boolean_t 3105 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 3106 { 3107 uint64_t dva_ms_id; 3108 3109 if (DVA_GET_ASIZE(dva) == 0) 3110 return (B_TRUE); 3111 3112 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 3113 return (B_TRUE); 3114 3115 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 3116 3117 return (msp->ms_id != dva_ms_id); 3118 } 3119 3120 /* 3121 * ========================================================================== 3122 * Metaslab allocation tracing facility 3123 * ========================================================================== 3124 */ 3125 kstat_t *metaslab_trace_ksp; 3126 kstat_named_t metaslab_trace_over_limit; 3127 3128 void 3129 metaslab_alloc_trace_init(void) 3130 { 3131 ASSERT(metaslab_alloc_trace_cache == NULL); 3132 metaslab_alloc_trace_cache = kmem_cache_create( 3133 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 3134 0, NULL, NULL, NULL, NULL, NULL, 0); 3135 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 3136 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 3137 if (metaslab_trace_ksp != NULL) { 3138 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 3139 kstat_named_init(&metaslab_trace_over_limit, 3140 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 3141 kstat_install(metaslab_trace_ksp); 3142 } 3143 } 3144 3145 void 3146 metaslab_alloc_trace_fini(void) 3147 { 3148 if (metaslab_trace_ksp != NULL) { 3149 kstat_delete(metaslab_trace_ksp); 3150 metaslab_trace_ksp = NULL; 3151 } 3152 kmem_cache_destroy(metaslab_alloc_trace_cache); 3153 metaslab_alloc_trace_cache = NULL; 3154 } 3155 3156 /* 3157 * Add an allocation trace element to the allocation tracing list. 3158 */ 3159 static void 3160 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 3161 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 3162 int allocator) 3163 { 3164 if (!metaslab_trace_enabled) 3165 return; 3166 3167 /* 3168 * When the tracing list reaches its maximum, we remove 3169 * the second element in the list before adding a new one. 3170 * By removing the second element we preserve the original 3171 * entry as a clue to what allocation steps have already been 3172 * performed.
3173 */ 3174 if (zal->zal_size == metaslab_trace_max_entries) { 3175 metaslab_alloc_trace_t *mat_next; 3176 #ifdef DEBUG 3177 panic("too many entries in allocation list"); 3178 #endif 3179 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 3180 zal->zal_size--; 3181 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 3182 list_remove(&zal->zal_list, mat_next); 3183 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 3184 } 3185 3186 metaslab_alloc_trace_t *mat = 3187 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 3188 list_link_init(&mat->mat_list_node); 3189 mat->mat_mg = mg; 3190 mat->mat_msp = msp; 3191 mat->mat_size = psize; 3192 mat->mat_dva_id = dva_id; 3193 mat->mat_offset = offset; 3194 mat->mat_weight = 0; 3195 mat->mat_allocator = allocator; 3196 3197 if (msp != NULL) 3198 mat->mat_weight = msp->ms_weight; 3199 3200 /* 3201 * The list is part of the zio so locking is not required. Only 3202 * a single thread will perform allocations for a given zio. 3203 */ 3204 list_insert_tail(&zal->zal_list, mat); 3205 zal->zal_size++; 3206 3207 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 3208 } 3209 3210 void 3211 metaslab_trace_init(zio_alloc_list_t *zal) 3212 { 3213 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 3214 offsetof(metaslab_alloc_trace_t, mat_list_node)); 3215 zal->zal_size = 0; 3216 } 3217 3218 void 3219 metaslab_trace_fini(zio_alloc_list_t *zal) 3220 { 3221 metaslab_alloc_trace_t *mat; 3222 3223 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 3224 kmem_cache_free(metaslab_alloc_trace_cache, mat); 3225 list_destroy(&zal->zal_list); 3226 zal->zal_size = 0; 3227 } 3228 3229 /* 3230 * ========================================================================== 3231 * Metaslab block operations 3232 * ========================================================================== 3233 */ 3234 3235 static void 3236 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 3237 int allocator) 3238 { 3239 if (!(flags & METASLAB_ASYNC_ALLOC) || 3240 (flags & METASLAB_DONT_THROTTLE)) 3241 return; 3242 3243 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3244 if (!mg->mg_class->mc_alloc_throttle_enabled) 3245 return; 3246 3247 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 3248 } 3249 3250 static void 3251 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 3252 { 3253 uint64_t max = mg->mg_max_alloc_queue_depth; 3254 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3255 while (cur < max) { 3256 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 3257 cur, cur + 1) == cur) { 3258 atomic_inc_64( 3259 &mg->mg_class->mc_alloc_max_slots[allocator]); 3260 return; 3261 } 3262 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3263 } 3264 } 3265 3266 void 3267 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 3268 int allocator, boolean_t io_complete) 3269 { 3270 if (!(flags & METASLAB_ASYNC_ALLOC) || 3271 (flags & METASLAB_DONT_THROTTLE)) 3272 return; 3273 3274 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3275 if (!mg->mg_class->mc_alloc_throttle_enabled) 3276 return; 3277 3278 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 3279 if (io_complete) 3280 metaslab_group_increment_qdepth(mg, allocator); 3281 } 3282 3283 void 3284 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 3285 int allocator) 3286 { 3287 #ifdef ZFS_DEBUG 3288 const dva_t *dva = bp->blk_dva; 
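/* Verify that no allocation-throttle reference is still held for this tag on any of the block's DVAs. */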
3289 int ndvas = BP_GET_NDVAS(bp); 3290 3291 for (int d = 0; d < ndvas; d++) { 3292 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 3293 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3294 VERIFY(zfs_refcount_not_held( 3295 &mg->mg_alloc_queue_depth[allocator], tag)); 3296 } 3297 #endif 3298 } 3299 3300 static uint64_t 3301 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 3302 { 3303 uint64_t start; 3304 range_tree_t *rt = msp->ms_allocatable; 3305 metaslab_class_t *mc = msp->ms_group->mg_class; 3306 3307 VERIFY(!msp->ms_condensing); 3308 VERIFY0(msp->ms_disabled); 3309 3310 start = mc->mc_ops->msop_alloc(msp, size); 3311 if (start != -1ULL) { 3312 metaslab_group_t *mg = msp->ms_group; 3313 vdev_t *vd = mg->mg_vd; 3314 3315 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 3316 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3317 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 3318 range_tree_remove(rt, start, size); 3319 range_tree_clear(msp->ms_trim, start, size); 3320 3321 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3322 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 3323 3324 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 3325 3326 /* Track the last successful allocation */ 3327 msp->ms_alloc_txg = txg; 3328 metaslab_verify_space(msp, txg); 3329 } 3330 3331 /* 3332 * Now that we've attempted the allocation, we need to update the 3333 * metaslab's maximum block size since it may have changed. 3334 */ 3335 msp->ms_max_size = metaslab_block_maxsize(msp); 3336 return (start); 3337 } 3338 3339 /* 3340 * Find the metaslab with the highest weight that is less than what we've 3341 * already tried. In the common case, this means that we will examine each 3342 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3343 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3344 * activated by another thread, and we fail to allocate from the metaslab we 3345 * have selected, we may not try the newly-activated metaslab, and instead 3346 * activate another metaslab. This is not optimal, but generally does not cause 3347 * any problems (a possible exception being if every metaslab is completely full 3348 * except for the newly-activated metaslab, which we fail to examine). 3349 */ 3350 static metaslab_t * 3351 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3352 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 3353 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3354 { 3355 avl_index_t idx; 3356 avl_tree_t *t = &mg->mg_metaslab_tree; 3357 metaslab_t *msp = avl_find(t, search, &idx); 3358 if (msp == NULL) 3359 msp = avl_nearest(t, idx, AVL_AFTER); 3360 3361 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3362 int i; 3363 if (!metaslab_should_allocate(msp, asize)) { 3364 metaslab_trace_add(zal, mg, msp, asize, d, 3365 TRACE_TOO_SMALL, allocator); 3366 continue; 3367 } 3368 3369 /* 3370 * If the selected metaslab is condensing or disabled, 3371 * skip it. 3372 */ 3373 if (msp->ms_condensing || msp->ms_disabled > 0) 3374 continue; 3375 3376 *was_active = msp->ms_allocator != -1; 3377 /* 3378 * If we're activating as primary, this is our first allocation 3379 * from this disk, so we don't need to check how close we are. 3380 * If the metaslab under consideration was already active, 3381 * we're getting desperate enough to steal another allocator's 3382 * metaslab, so we still don't care about distances.
3383 */ 3384 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3385 break; 3386 3387 for (i = 0; i < d; i++) { 3388 if (want_unique && 3389 !metaslab_is_unique(msp, &dva[i])) 3390 break; /* try another metaslab */ 3391 } 3392 if (i == d) 3393 break; 3394 } 3395 3396 if (msp != NULL) { 3397 search->ms_weight = msp->ms_weight; 3398 search->ms_start = msp->ms_start + 1; 3399 search->ms_allocator = msp->ms_allocator; 3400 search->ms_primary = msp->ms_primary; 3401 } 3402 return (msp); 3403 } 3404 3405 /* ARGSUSED */ 3406 static uint64_t 3407 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3408 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3409 int d, int allocator) 3410 { 3411 metaslab_t *msp = NULL; 3412 uint64_t offset = -1ULL; 3413 uint64_t activation_weight; 3414 3415 activation_weight = METASLAB_WEIGHT_PRIMARY; 3416 for (int i = 0; i < d; i++) { 3417 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3418 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3419 activation_weight = METASLAB_WEIGHT_SECONDARY; 3420 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3421 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3422 activation_weight = METASLAB_WEIGHT_CLAIM; 3423 break; 3424 } 3425 } 3426 3427 /* 3428 * If we don't have enough metaslabs active to fill the entire array, we 3429 * just use the 0th slot. 3430 */ 3431 if (mg->mg_ms_ready < mg->mg_allocators * 3) 3432 allocator = 0; 3433 3434 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3435 3436 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3437 search->ms_weight = UINT64_MAX; 3438 search->ms_start = 0; 3439 /* 3440 * At the end of the metaslab tree are the already-active metaslabs, 3441 * first the primaries, then the secondaries. When we resume searching 3442 * through the tree, we need to consider ms_allocator and ms_primary so 3443 * we start in the location right after where we left off, and don't 3444 * accidentally loop forever considering the same metaslabs. 3445 */ 3446 search->ms_allocator = -1; 3447 search->ms_primary = B_TRUE; 3448 for (;;) { 3449 boolean_t was_active = B_FALSE; 3450 3451 mutex_enter(&mg->mg_lock); 3452 3453 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3454 mg->mg_primaries[allocator] != NULL) { 3455 msp = mg->mg_primaries[allocator]; 3456 was_active = B_TRUE; 3457 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3458 mg->mg_secondaries[allocator] != NULL) { 3459 msp = mg->mg_secondaries[allocator]; 3460 was_active = B_TRUE; 3461 } else { 3462 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3463 want_unique, asize, allocator, zal, search, 3464 &was_active); 3465 } 3466 3467 mutex_exit(&mg->mg_lock); 3468 if (msp == NULL) { 3469 kmem_free(search, sizeof (*search)); 3470 return (-1ULL); 3471 } 3472 3473 mutex_enter(&msp->ms_lock); 3474 /* 3475 * Ensure that the metaslab we have selected is still 3476 * capable of handling our request. It's possible that 3477 * another thread may have changed the weight while we 3478 * were blocked on the metaslab lock. We check the 3479 * active status first to see if we need to reselect 3480 * a new metaslab. 3481 */ 3482 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3483 mutex_exit(&msp->ms_lock); 3484 continue; 3485 } 3486 3487 /* 3488 * If the metaslab is freshly activated for an allocator that 3489 * isn't the one we're allocating from, or if it's a primary and 3490 * we're seeking a secondary (or vice versa), we go back and 3491 * select a new metaslab. 
3492		 */
3493		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
3494		    (msp->ms_allocator != -1) &&
3495		    (msp->ms_allocator != allocator || ((activation_weight ==
3496		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
3497			mutex_exit(&msp->ms_lock);
3498			continue;
3499		}
3500
3501		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
3502		    activation_weight != METASLAB_WEIGHT_CLAIM) {
3503			metaslab_passivate(msp, msp->ms_weight &
3504			    ~METASLAB_WEIGHT_CLAIM);
3505			mutex_exit(&msp->ms_lock);
3506			continue;
3507		}
3508
3509		if (metaslab_activate(msp, allocator, activation_weight) != 0) {
3510			mutex_exit(&msp->ms_lock);
3511			continue;
3512		}
3513
3514		msp->ms_selected_txg = txg;
3515
3516		/*
3517		 * Now that we have the lock, recheck to see if we should
3518		 * continue to use this metaslab for this allocation. The
3519		 * metaslab is now loaded, so metaslab_should_allocate() can
3520		 * accurately determine if the allocation attempt should
3521		 * proceed.
3522		 */
3523		if (!metaslab_should_allocate(msp, asize)) {
3524			/* Passivate this metaslab and select a new one. */
3525			metaslab_trace_add(zal, mg, msp, asize, d,
3526			    TRACE_TOO_SMALL, allocator);
3527			goto next;
3528		}
3529
3530		/*
3531		 * If this metaslab is currently condensing then pick again, as
3532		 * we can't manipulate this metaslab until it's committed
3533		 * to disk. If this metaslab is disabled (e.g. it is being
3534		 * initialized or trimmed), we shouldn't allocate from it since
3535		 * the allocated region might be overwritten after allocation.
3536		 */
3537		if (msp->ms_condensing) {
3538			metaslab_trace_add(zal, mg, msp, asize, d,
3539			    TRACE_CONDENSING, allocator);
3540			metaslab_passivate(msp, msp->ms_weight &
3541			    ~METASLAB_ACTIVE_MASK);
3542			mutex_exit(&msp->ms_lock);
3543			continue;
3544		} else if (msp->ms_disabled > 0) {
3545			metaslab_trace_add(zal, mg, msp, asize, d,
3546			    TRACE_DISABLED, allocator);
3547			metaslab_passivate(msp, msp->ms_weight &
3548			    ~METASLAB_ACTIVE_MASK);
3549			mutex_exit(&msp->ms_lock);
3550			continue;
3551		}
3552
3553		offset = metaslab_block_alloc(msp, asize, txg);
3554		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
3555
3556		if (offset != -1ULL) {
3557			/* Proactively passivate the metaslab, if needed */
3558			metaslab_segment_may_passivate(msp);
3559			break;
3560		}
3561	next:
3562		ASSERT(msp->ms_loaded);
3563
3564		/*
3565		 * We were unable to allocate from this metaslab so determine
3566		 * a new weight for this metaslab. Now that we have loaded
3567		 * the metaslab we can provide a better hint to the metaslab
3568		 * selector.
3569		 *
3570		 * For space-based metaslabs, we use the maximum block size.
3571		 * This information is only available when the metaslab
3572		 * is loaded and is more accurate than the generic free
3573		 * space weight that was calculated by metaslab_weight().
3574		 * This information allows us to quickly compare the maximum
3575		 * available allocation in the metaslab to the allocation
3576		 * size being requested.
3577		 *
3578		 * For segment-based metaslabs, determine the new weight
3579		 * based on the highest bucket in the range tree. We
3580		 * explicitly use the loaded segment weight (i.e. the range
3581		 * tree histogram) since it contains the space that is
3582		 * currently available for allocation and is accurate
3583		 * even within a sync pass.
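		 *
		 * As a purely illustrative example (hypothetical numbers): if
		 * this request was for 128K but the largest contiguous free
		 * segment left in the metaslab is 64K, the space-based path
		 * below passivates the metaslab with a weight encoding 64K,
		 * so metaslab_should_allocate() will reject it for further
		 * 128K requests until more space is freed.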
3584 */ 3585 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3586 uint64_t weight = metaslab_block_maxsize(msp); 3587 WEIGHT_SET_SPACEBASED(weight); 3588 metaslab_passivate(msp, weight); 3589 } else { 3590 metaslab_passivate(msp, 3591 metaslab_weight_from_range_tree(msp)); 3592 } 3593 3594 /* 3595 * We have just failed an allocation attempt, check 3596 * that metaslab_should_allocate() agrees. Otherwise, 3597 * we may end up in an infinite loop retrying the same 3598 * metaslab. 3599 */ 3600 ASSERT(!metaslab_should_allocate(msp, asize)); 3601 3602 mutex_exit(&msp->ms_lock); 3603 } 3604 mutex_exit(&msp->ms_lock); 3605 kmem_free(search, sizeof (*search)); 3606 return (offset); 3607 } 3608 3609 static uint64_t 3610 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3611 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3612 int d, int allocator) 3613 { 3614 uint64_t offset; 3615 ASSERT(mg->mg_initialized); 3616 3617 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 3618 dva, d, allocator); 3619 3620 mutex_enter(&mg->mg_lock); 3621 if (offset == -1ULL) { 3622 mg->mg_failed_allocations++; 3623 metaslab_trace_add(zal, mg, NULL, asize, d, 3624 TRACE_GROUP_FAILURE, allocator); 3625 if (asize == SPA_GANGBLOCKSIZE) { 3626 /* 3627 * This metaslab group was unable to allocate 3628 * the minimum gang block size so it must be out of 3629 * space. We must notify the allocation throttle 3630 * to start skipping allocation attempts to this 3631 * metaslab group until more space becomes available. 3632 * Note: this failure cannot be caused by the 3633 * allocation throttle since the allocation throttle 3634 * is only responsible for skipping devices and 3635 * not failing block allocations. 3636 */ 3637 mg->mg_no_free_space = B_TRUE; 3638 } 3639 } 3640 mg->mg_allocations++; 3641 mutex_exit(&mg->mg_lock); 3642 return (offset); 3643 } 3644 3645 /* 3646 * Allocate a block for the specified i/o. 3647 */ 3648 int 3649 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3650 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3651 zio_alloc_list_t *zal, int allocator) 3652 { 3653 metaslab_group_t *mg, *rotor; 3654 vdev_t *vd; 3655 boolean_t try_hard = B_FALSE; 3656 3657 ASSERT(!DVA_IS_VALID(&dva[d])); 3658 3659 /* 3660 * For testing, make some blocks above a certain size be gang blocks. 3661 * This will also test spilling from special to normal. 3662 */ 3663 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3664 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3665 allocator); 3666 return (SET_ERROR(ENOSPC)); 3667 } 3668 3669 /* 3670 * Start at the rotor and loop through all mgs until we find something. 3671 * Note that there's no locking on mc_rotor or mc_aliquot because 3672 * nothing actually breaks if we miss a few updates -- we just won't 3673 * allocate quite as evenly. It all balances out over time. 3674 * 3675 * If we are doing ditto or log blocks, try to spread them across 3676 * consecutive vdevs. If we're forced to reuse a vdev before we've 3677 * allocated all of our ditto blocks, then try and spread them out on 3678 * that vdev as much as possible. If it turns out to not be possible, 3679 * gradually lower our standards until anything becomes acceptable. 3680 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3681 * gives us hope of containing our fault domains to something we're 3682 * able to reason about. 
Otherwise, any two top-level vdev failures
3683	 * will guarantee the loss of data. With consecutive allocation,
3684	 * only two adjacent top-level vdev failures will result in data loss.
3685	 *
3686	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
3687	 * ourselves on the same vdev as our gang block header. That
3688	 * way, we can hope for locality in vdev_cache, plus it makes our
3689	 * fault domains something tractable.
3690	 */
3691	if (hintdva) {
3692		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

3693
3694		/*
3695		 * It's possible the vdev we're using as the hint no
3696		 * longer exists or its mg has been closed (e.g. by
3697		 * device removal). Consult the rotor when
3698		 * all else fails.
3699		 */
3700		if (vd != NULL && vd->vdev_mg != NULL) {
3701			mg = vd->vdev_mg;
3702
3703			if (flags & METASLAB_HINTBP_AVOID &&
3704			    mg->mg_next != NULL)
3705				mg = mg->mg_next;
3706		} else {
3707			mg = mc->mc_rotor;
3708		}
3709	} else if (d != 0) {
3710		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3711		mg = vd->vdev_mg->mg_next;
3712	} else {
3713		ASSERT(mc->mc_rotor != NULL);
3714		mg = mc->mc_rotor;
3715	}
3716
3717	/*
3718	 * If the hint put us into the wrong metaslab class, or into a
3719	 * metaslab group that has been passivated, just follow the rotor.
3720	 */
3721	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3722		mg = mc->mc_rotor;
3723
3724	rotor = mg;
3725 top:
3726	do {
3727		boolean_t allocatable;
3728
3729		ASSERT(mg->mg_activation_count == 1);
3730		vd = mg->mg_vd;
3731
3732		/*
3733		 * Don't allocate from faulted devices.
3734		 */
3735		if (try_hard) {
3736			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3737			allocatable = vdev_allocatable(vd);
3738			spa_config_exit(spa, SCL_ZIO, FTAG);
3739		} else {
3740			allocatable = vdev_allocatable(vd);
3741		}
3742
3743		/*
3744		 * Determine if the selected metaslab group is eligible
3745		 * for allocations. If we're ganging then don't allow
3746		 * this metaslab group to skip allocations since that would
3747		 * inadvertently return ENOSPC and suspend the pool
3748		 * even though space is still available.
3749		 */
3750		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3751			allocatable = metaslab_group_allocatable(mg, rotor,
3752			    psize, allocator, d);
3753		}
3754
3755		if (!allocatable) {
3756			metaslab_trace_add(zal, mg, NULL, psize, d,
3757			    TRACE_NOT_ALLOCATABLE, allocator);
3758			goto next;
3759		}
3760
3761		ASSERT(mg->mg_initialized);
3762
3763		/*
3764		 * Avoid writing single-copy data to a failing,
3765		 * non-redundant vdev, unless we've already tried all
3766		 * other vdevs.
3767		 */
3768		if ((vd->vdev_stat.vs_write_errors > 0 ||
3769		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
3770		    d == 0 && !try_hard && vd->vdev_children == 0) {
3771			metaslab_trace_add(zal, mg, NULL, psize, d,
3772			    TRACE_VDEV_ERROR, allocator);
3773			goto next;
3774		}
3775
3776		ASSERT(mg->mg_class == mc);
3777
3778		uint64_t asize = vdev_psize_to_asize(vd, psize);
3779		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3780
3781		/*
3782		 * If we don't need to try hard, then require that the
3783		 * block be on a different metaslab from any other DVAs
3784		 * in this BP (unique=true). If we are trying hard, then
3785		 * allow any metaslab to be used (unique=false).
3786 */ 3787 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3788 !try_hard, dva, d, allocator); 3789 3790 if (offset != -1ULL) { 3791 /* 3792 * If we've just selected this metaslab group, 3793 * figure out whether the corresponding vdev is 3794 * over- or under-used relative to the pool, 3795 * and set an allocation bias to even it out. 3796 */ 3797 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3798 vdev_stat_t *vs = &vd->vdev_stat; 3799 int64_t vu, cu; 3800 3801 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3802 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3803 3804 /* 3805 * Calculate how much more or less we should 3806 * try to allocate from this device during 3807 * this iteration around the rotor. 3808 * For example, if a device is 80% full 3809 * and the pool is 20% full then we should 3810 * reduce allocations by 60% on this device. 3811 * 3812 * mg_bias = (20 - 80) * 512K / 100 = -307K 3813 * 3814 * This reduces allocations by 307K for this 3815 * iteration. 3816 */ 3817 mg->mg_bias = ((cu - vu) * 3818 (int64_t)mg->mg_aliquot) / 100; 3819 } else if (!metaslab_bias_enabled) { 3820 mg->mg_bias = 0; 3821 } 3822 3823 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3824 mg->mg_aliquot + mg->mg_bias) { 3825 mc->mc_rotor = mg->mg_next; 3826 mc->mc_aliquot = 0; 3827 } 3828 3829 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3830 DVA_SET_OFFSET(&dva[d], offset); 3831 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3832 DVA_SET_ASIZE(&dva[d], asize); 3833 3834 return (0); 3835 } 3836 next: 3837 mc->mc_rotor = mg->mg_next; 3838 mc->mc_aliquot = 0; 3839 } while ((mg = mg->mg_next) != rotor); 3840 3841 /* 3842 * If we haven't tried hard, do so now. 3843 */ 3844 if (!try_hard) { 3845 try_hard = B_TRUE; 3846 goto top; 3847 } 3848 3849 bzero(&dva[d], sizeof (dva_t)); 3850 3851 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 3852 return (SET_ERROR(ENOSPC)); 3853 } 3854 3855 void 3856 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3857 boolean_t checkpoint) 3858 { 3859 metaslab_t *msp; 3860 spa_t *spa = vd->vdev_spa; 3861 3862 ASSERT(vdev_is_concrete(vd)); 3863 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3864 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3865 3866 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3867 3868 VERIFY(!msp->ms_condensing); 3869 VERIFY3U(offset, >=, msp->ms_start); 3870 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 3871 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3872 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 3873 3874 metaslab_check_free_impl(vd, offset, asize); 3875 3876 mutex_enter(&msp->ms_lock); 3877 if (range_tree_is_empty(msp->ms_freeing) && 3878 range_tree_is_empty(msp->ms_checkpointing)) { 3879 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 3880 } 3881 3882 if (checkpoint) { 3883 ASSERT(spa_has_checkpoint(spa)); 3884 range_tree_add(msp->ms_checkpointing, offset, asize); 3885 } else { 3886 range_tree_add(msp->ms_freeing, offset, asize); 3887 } 3888 mutex_exit(&msp->ms_lock); 3889 } 3890 3891 /* ARGSUSED */ 3892 void 3893 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3894 uint64_t size, void *arg) 3895 { 3896 boolean_t *checkpoint = arg; 3897 3898 ASSERT3P(checkpoint, !=, NULL); 3899 3900 if (vd->vdev_ops->vdev_op_remap != NULL) 3901 vdev_indirect_mark_obsolete(vd, offset, size); 3902 else 3903 metaslab_free_impl(vd, offset, size, *checkpoint); 3904 } 3905 3906 static void 3907 metaslab_free_impl(vdev_t 
*vd, uint64_t offset, uint64_t size, 3908 boolean_t checkpoint) 3909 { 3910 spa_t *spa = vd->vdev_spa; 3911 3912 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3913 3914 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 3915 return; 3916 3917 if (spa->spa_vdev_removal != NULL && 3918 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 3919 vdev_is_concrete(vd)) { 3920 /* 3921 * Note: we check if the vdev is concrete because when 3922 * we complete the removal, we first change the vdev to be 3923 * an indirect vdev (in open context), and then (in syncing 3924 * context) clear spa_vdev_removal. 3925 */ 3926 free_from_removing_vdev(vd, offset, size); 3927 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 3928 vdev_indirect_mark_obsolete(vd, offset, size); 3929 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3930 metaslab_free_impl_cb, &checkpoint); 3931 } else { 3932 metaslab_free_concrete(vd, offset, size, checkpoint); 3933 } 3934 } 3935 3936 typedef struct remap_blkptr_cb_arg { 3937 blkptr_t *rbca_bp; 3938 spa_remap_cb_t rbca_cb; 3939 vdev_t *rbca_remap_vd; 3940 uint64_t rbca_remap_offset; 3941 void *rbca_cb_arg; 3942 } remap_blkptr_cb_arg_t; 3943 3944 void 3945 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3946 uint64_t size, void *arg) 3947 { 3948 remap_blkptr_cb_arg_t *rbca = arg; 3949 blkptr_t *bp = rbca->rbca_bp; 3950 3951 /* We can not remap split blocks. */ 3952 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 3953 return; 3954 ASSERT0(inner_offset); 3955 3956 if (rbca->rbca_cb != NULL) { 3957 /* 3958 * At this point we know that we are not handling split 3959 * blocks and we invoke the callback on the previous 3960 * vdev which must be indirect. 3961 */ 3962 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 3963 3964 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 3965 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 3966 3967 /* set up remap_blkptr_cb_arg for the next call */ 3968 rbca->rbca_remap_vd = vd; 3969 rbca->rbca_remap_offset = offset; 3970 } 3971 3972 /* 3973 * The phys birth time is that of dva[0]. This ensures that we know 3974 * when each dva was written, so that resilver can determine which 3975 * blocks need to be scrubbed (i.e. those written during the time 3976 * the vdev was offline). It also ensures that the key used in 3977 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 3978 * we didn't change the phys_birth, a lookup in the ARC for a 3979 * remapped BP could find the data that was previously stored at 3980 * this vdev + offset. 3981 */ 3982 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 3983 DVA_GET_VDEV(&bp->blk_dva[0])); 3984 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 3985 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 3986 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 3987 3988 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 3989 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 3990 } 3991 3992 /* 3993 * If the block pointer contains any indirect DVAs, modify them to refer to 3994 * concrete DVAs. Note that this will sometimes not be possible, leaving 3995 * the indirect DVA in place. This happens if the indirect DVA spans multiple 3996 * segments in the mapping (i.e. it is a "split block"). 3997 * 3998 * If the BP was remapped, calls the callback on the original dva (note the 3999 * callback can be called multiple times if the original indirect DVA refers 4000 * to another indirect DVA, etc). 4001 * 4002 * Returns TRUE if the BP was remapped. 
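 *
 * A minimal usage sketch (count_remaps and the counter are hypothetical and
 * for illustration only; the callback shape matches how remap_blkptr_cb()
 * invokes rbca_cb above):
 *
 *	static void
 *	count_remaps(uint64_t vdev_id, uint64_t offset, uint64_t size,
 *	    void *arg)
 *	{
 *		(*(uint64_t *)arg)++;
 *	}
 *
 *	uint64_t nremaps = 0;
 *	if (spa_remap_blkptr(spa, bp, count_remaps, &nremaps))
 *		zfs_dbgmsg("remapped %llu levels", (u_longlong_t)nremaps);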
4003 */ 4004 boolean_t 4005 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 4006 { 4007 remap_blkptr_cb_arg_t rbca; 4008 4009 if (!zfs_remap_blkptr_enable) 4010 return (B_FALSE); 4011 4012 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 4013 return (B_FALSE); 4014 4015 /* 4016 * Dedup BP's can not be remapped, because ddt_phys_select() depends 4017 * on DVA[0] being the same in the BP as in the DDT (dedup table). 4018 */ 4019 if (BP_GET_DEDUP(bp)) 4020 return (B_FALSE); 4021 4022 /* 4023 * Gang blocks can not be remapped, because 4024 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 4025 * the BP used to read the gang block header (GBH) being the same 4026 * as the DVA[0] that we allocated for the GBH. 4027 */ 4028 if (BP_IS_GANG(bp)) 4029 return (B_FALSE); 4030 4031 /* 4032 * Embedded BP's have no DVA to remap. 4033 */ 4034 if (BP_GET_NDVAS(bp) < 1) 4035 return (B_FALSE); 4036 4037 /* 4038 * Note: we only remap dva[0]. If we remapped other dvas, we 4039 * would no longer know what their phys birth txg is. 4040 */ 4041 dva_t *dva = &bp->blk_dva[0]; 4042 4043 uint64_t offset = DVA_GET_OFFSET(dva); 4044 uint64_t size = DVA_GET_ASIZE(dva); 4045 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 4046 4047 if (vd->vdev_ops->vdev_op_remap == NULL) 4048 return (B_FALSE); 4049 4050 rbca.rbca_bp = bp; 4051 rbca.rbca_cb = callback; 4052 rbca.rbca_remap_vd = vd; 4053 rbca.rbca_remap_offset = offset; 4054 rbca.rbca_cb_arg = arg; 4055 4056 /* 4057 * remap_blkptr_cb() will be called in order for each level of 4058 * indirection, until a concrete vdev is reached or a split block is 4059 * encountered. old_vd and old_offset are updated within the callback 4060 * as we go from the one indirect vdev to the next one (either concrete 4061 * or indirect again) in that order. 4062 */ 4063 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 4064 4065 /* Check if the DVA wasn't remapped because it is a split block */ 4066 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 4067 return (B_FALSE); 4068 4069 return (B_TRUE); 4070 } 4071 4072 /* 4073 * Undo the allocation of a DVA which happened in the given transaction group. 
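 * Unlike metaslab_free_dva(), which routes a free through ms_freeing (or
 * ms_checkpointing) for processing in syncing context, this path assumes the
 * allocation is being undone in the txg that performed it, before the
 * spacemap changes sync out, and simply moves the range from
 * ms_allocating[txg] back into ms_allocatable.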
4074 */ 4075 void 4076 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4077 { 4078 metaslab_t *msp; 4079 vdev_t *vd; 4080 uint64_t vdev = DVA_GET_VDEV(dva); 4081 uint64_t offset = DVA_GET_OFFSET(dva); 4082 uint64_t size = DVA_GET_ASIZE(dva); 4083 4084 ASSERT(DVA_IS_VALID(dva)); 4085 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4086 4087 if (txg > spa_freeze_txg(spa)) 4088 return; 4089 4090 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 4091 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 4092 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 4093 (u_longlong_t)vdev, (u_longlong_t)offset); 4094 ASSERT(0); 4095 return; 4096 } 4097 4098 ASSERT(!vd->vdev_removing); 4099 ASSERT(vdev_is_concrete(vd)); 4100 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 4101 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 4102 4103 if (DVA_GET_GANG(dva)) 4104 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4105 4106 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4107 4108 mutex_enter(&msp->ms_lock); 4109 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 4110 offset, size); 4111 4112 VERIFY(!msp->ms_condensing); 4113 VERIFY3U(offset, >=, msp->ms_start); 4114 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 4115 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 4116 msp->ms_size); 4117 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4118 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4119 range_tree_add(msp->ms_allocatable, offset, size); 4120 mutex_exit(&msp->ms_lock); 4121 } 4122 4123 /* 4124 * Free the block represented by the given DVA. 4125 */ 4126 void 4127 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 4128 { 4129 uint64_t vdev = DVA_GET_VDEV(dva); 4130 uint64_t offset = DVA_GET_OFFSET(dva); 4131 uint64_t size = DVA_GET_ASIZE(dva); 4132 vdev_t *vd = vdev_lookup_top(spa, vdev); 4133 4134 ASSERT(DVA_IS_VALID(dva)); 4135 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4136 4137 if (DVA_GET_GANG(dva)) { 4138 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4139 } 4140 4141 metaslab_free_impl(vd, offset, size, checkpoint); 4142 } 4143 4144 /* 4145 * Reserve some allocation slots. The reservation system must be called 4146 * before we call into the allocator. If there aren't any available slots 4147 * then the I/O will be throttled until an I/O completes and its slots are 4148 * freed up. The function returns true if it was successful in placing 4149 * the reservation. 4150 */ 4151 boolean_t 4152 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 4153 zio_t *zio, int flags) 4154 { 4155 uint64_t available_slots = 0; 4156 boolean_t slot_reserved = B_FALSE; 4157 uint64_t max = mc->mc_alloc_max_slots[allocator]; 4158 4159 ASSERT(mc->mc_alloc_throttle_enabled); 4160 mutex_enter(&mc->mc_lock); 4161 4162 uint64_t reserved_slots = 4163 zfs_refcount_count(&mc->mc_alloc_slots[allocator]); 4164 if (reserved_slots < max) 4165 available_slots = max - reserved_slots; 4166 4167 if (slots <= available_slots || GANG_ALLOCATION(flags) || 4168 flags & METASLAB_MUST_RESERVE) { 4169 /* 4170 * We reserve the slots individually so that we can unreserve 4171 * them individually when an I/O completes. 
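		 *
		 * For example (hypothetical numbers): with
		 * mc_alloc_max_slots[allocator] == 64 and 60 slots already
		 * reserved, a request for 3 slots is granted (3 <= 4
		 * available), while a request for 8 slots is granted only if
		 * the I/O is a gang allocation or has METASLAB_MUST_RESERVE
		 * set.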
4172 */ 4173 for (int d = 0; d < slots; d++) { 4174 reserved_slots = 4175 zfs_refcount_add(&mc->mc_alloc_slots[allocator], 4176 zio); 4177 } 4178 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 4179 slot_reserved = B_TRUE; 4180 } 4181 4182 mutex_exit(&mc->mc_lock); 4183 return (slot_reserved); 4184 } 4185 4186 void 4187 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 4188 int allocator, zio_t *zio) 4189 { 4190 ASSERT(mc->mc_alloc_throttle_enabled); 4191 mutex_enter(&mc->mc_lock); 4192 for (int d = 0; d < slots; d++) { 4193 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], 4194 zio); 4195 } 4196 mutex_exit(&mc->mc_lock); 4197 } 4198 4199 static int 4200 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 4201 uint64_t txg) 4202 { 4203 metaslab_t *msp; 4204 spa_t *spa = vd->vdev_spa; 4205 int error = 0; 4206 4207 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 4208 return (ENXIO); 4209 4210 ASSERT3P(vd->vdev_ms, !=, NULL); 4211 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4212 4213 mutex_enter(&msp->ms_lock); 4214 4215 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 4216 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 4217 /* 4218 * No need to fail in that case; someone else has activated the 4219 * metaslab, but that doesn't preclude us from using it. 4220 */ 4221 if (error == EBUSY) 4222 error = 0; 4223 4224 if (error == 0 && 4225 !range_tree_contains(msp->ms_allocatable, offset, size)) 4226 error = SET_ERROR(ENOENT); 4227 4228 if (error || txg == 0) { /* txg == 0 indicates dry run */ 4229 mutex_exit(&msp->ms_lock); 4230 return (error); 4231 } 4232 4233 VERIFY(!msp->ms_condensing); 4234 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4235 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4236 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 4237 msp->ms_size); 4238 range_tree_remove(msp->ms_allocatable, offset, size); 4239 range_tree_clear(msp->ms_trim, offset, size); 4240 4241 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 4242 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4243 vdev_dirty(vd, VDD_METASLAB, msp, txg); 4244 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 4245 offset, size); 4246 } 4247 4248 mutex_exit(&msp->ms_lock); 4249 4250 return (0); 4251 } 4252 4253 typedef struct metaslab_claim_cb_arg_t { 4254 uint64_t mcca_txg; 4255 int mcca_error; 4256 } metaslab_claim_cb_arg_t; 4257 4258 /* ARGSUSED */ 4259 static void 4260 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4261 uint64_t size, void *arg) 4262 { 4263 metaslab_claim_cb_arg_t *mcca_arg = arg; 4264 4265 if (mcca_arg->mcca_error == 0) { 4266 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 4267 size, mcca_arg->mcca_txg); 4268 } 4269 } 4270 4271 int 4272 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 4273 { 4274 if (vd->vdev_ops->vdev_op_remap != NULL) { 4275 metaslab_claim_cb_arg_t arg; 4276 4277 /* 4278 * Only zdb(1M) can claim on indirect vdevs. This is used 4279 * to detect leaks of mapped space (that are not accounted 4280 * for in the obsolete counts, spacemap, or bpobj). 
4281 */ 4282 ASSERT(!spa_writeable(vd->vdev_spa)); 4283 arg.mcca_error = 0; 4284 arg.mcca_txg = txg; 4285 4286 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4287 metaslab_claim_impl_cb, &arg); 4288 4289 if (arg.mcca_error == 0) { 4290 arg.mcca_error = metaslab_claim_concrete(vd, 4291 offset, size, txg); 4292 } 4293 return (arg.mcca_error); 4294 } else { 4295 return (metaslab_claim_concrete(vd, offset, size, txg)); 4296 } 4297 } 4298 4299 /* 4300 * Intent log support: upon opening the pool after a crash, notify the SPA 4301 * of blocks that the intent log has allocated for immediate write, but 4302 * which are still considered free by the SPA because the last transaction 4303 * group didn't commit yet. 4304 */ 4305 static int 4306 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4307 { 4308 uint64_t vdev = DVA_GET_VDEV(dva); 4309 uint64_t offset = DVA_GET_OFFSET(dva); 4310 uint64_t size = DVA_GET_ASIZE(dva); 4311 vdev_t *vd; 4312 4313 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 4314 return (SET_ERROR(ENXIO)); 4315 } 4316 4317 ASSERT(DVA_IS_VALID(dva)); 4318 4319 if (DVA_GET_GANG(dva)) 4320 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4321 4322 return (metaslab_claim_impl(vd, offset, size, txg)); 4323 } 4324 4325 int 4326 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 4327 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 4328 zio_alloc_list_t *zal, zio_t *zio, int allocator) 4329 { 4330 dva_t *dva = bp->blk_dva; 4331 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; 4332 int error = 0; 4333 4334 ASSERT(bp->blk_birth == 0); 4335 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 4336 4337 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4338 4339 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 4340 spa_config_exit(spa, SCL_ALLOC, FTAG); 4341 return (SET_ERROR(ENOSPC)); 4342 } 4343 4344 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 4345 ASSERT(BP_GET_NDVAS(bp) == 0); 4346 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 4347 ASSERT3P(zal, !=, NULL); 4348 4349 for (int d = 0; d < ndvas; d++) { 4350 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 4351 txg, flags, zal, allocator); 4352 if (error != 0) { 4353 for (d--; d >= 0; d--) { 4354 metaslab_unalloc_dva(spa, &dva[d], txg); 4355 metaslab_group_alloc_decrement(spa, 4356 DVA_GET_VDEV(&dva[d]), zio, flags, 4357 allocator, B_FALSE); 4358 bzero(&dva[d], sizeof (dva_t)); 4359 } 4360 spa_config_exit(spa, SCL_ALLOC, FTAG); 4361 return (error); 4362 } else { 4363 /* 4364 * Update the metaslab group's queue depth 4365 * based on the newly allocated dva. 4366 */ 4367 metaslab_group_alloc_increment(spa, 4368 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 4369 } 4370 4371 } 4372 ASSERT(error == 0); 4373 ASSERT(BP_GET_NDVAS(bp) == ndvas); 4374 4375 spa_config_exit(spa, SCL_ALLOC, FTAG); 4376 4377 BP_SET_BIRTH(bp, txg, txg); 4378 4379 return (0); 4380 } 4381 4382 void 4383 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 4384 { 4385 const dva_t *dva = bp->blk_dva; 4386 int ndvas = BP_GET_NDVAS(bp); 4387 4388 ASSERT(!BP_IS_HOLE(bp)); 4389 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 4390 4391 /* 4392 * If we have a checkpoint for the pool we need to make sure that 4393 * the blocks that we free that are part of the checkpoint won't be 4394 * reused until the checkpoint is discarded or we revert to it. 
4395 * 4396 * The checkpoint flag is passed down the metaslab_free code path 4397 * and is set whenever we want to add a block to the checkpoint's 4398 * accounting. That is, we "checkpoint" blocks that existed at the 4399 * time the checkpoint was created and are therefore referenced by 4400 * the checkpointed uberblock. 4401 * 4402 * Note that, we don't checkpoint any blocks if the current 4403 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 4404 * normally as they will be referenced by the checkpointed uberblock. 4405 */ 4406 boolean_t checkpoint = B_FALSE; 4407 if (bp->blk_birth <= spa->spa_checkpoint_txg && 4408 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 4409 /* 4410 * At this point, if the block is part of the checkpoint 4411 * there is no way it was created in the current txg. 4412 */ 4413 ASSERT(!now); 4414 ASSERT3U(spa_syncing_txg(spa), ==, txg); 4415 checkpoint = B_TRUE; 4416 } 4417 4418 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 4419 4420 for (int d = 0; d < ndvas; d++) { 4421 if (now) { 4422 metaslab_unalloc_dva(spa, &dva[d], txg); 4423 } else { 4424 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 4425 metaslab_free_dva(spa, &dva[d], checkpoint); 4426 } 4427 } 4428 4429 spa_config_exit(spa, SCL_FREE, FTAG); 4430 } 4431 4432 int 4433 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 4434 { 4435 const dva_t *dva = bp->blk_dva; 4436 int ndvas = BP_GET_NDVAS(bp); 4437 int error = 0; 4438 4439 ASSERT(!BP_IS_HOLE(bp)); 4440 4441 if (txg != 0) { 4442 /* 4443 * First do a dry run to make sure all DVAs are claimable, 4444 * so we don't have to unwind from partial failures below. 4445 */ 4446 if ((error = metaslab_claim(spa, bp, 0)) != 0) 4447 return (error); 4448 } 4449 4450 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4451 4452 for (int d = 0; d < ndvas; d++) { 4453 error = metaslab_claim_dva(spa, &dva[d], txg); 4454 if (error != 0) 4455 break; 4456 } 4457 4458 spa_config_exit(spa, SCL_ALLOC, FTAG); 4459 4460 ASSERT(error == 0 || txg == 0); 4461 4462 return (error); 4463 } 4464 4465 /* ARGSUSED */ 4466 static void 4467 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 4468 uint64_t size, void *arg) 4469 { 4470 if (vd->vdev_ops == &vdev_indirect_ops) 4471 return; 4472 4473 metaslab_check_free_impl(vd, offset, size); 4474 } 4475 4476 static void 4477 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 4478 { 4479 metaslab_t *msp; 4480 spa_t *spa = vd->vdev_spa; 4481 4482 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4483 return; 4484 4485 if (vd->vdev_ops->vdev_op_remap != NULL) { 4486 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4487 metaslab_check_free_impl_cb, NULL); 4488 return; 4489 } 4490 4491 ASSERT(vdev_is_concrete(vd)); 4492 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 4493 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4494 4495 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4496 4497 mutex_enter(&msp->ms_lock); 4498 if (msp->ms_loaded) { 4499 range_tree_verify_not_present(msp->ms_allocatable, 4500 offset, size); 4501 } 4502 4503 range_tree_verify_not_present(msp->ms_trim, offset, size); 4504 range_tree_verify_not_present(msp->ms_freeing, offset, size); 4505 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 4506 range_tree_verify_not_present(msp->ms_freed, offset, size); 4507 for (int j = 0; j < TXG_DEFER_SIZE; j++) 4508 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 4509 mutex_exit(&msp->ms_lock); 4510 } 4511 4512 void 4513 
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4514	{
4515		if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4516			return;
4517
4518		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4519		for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4520			uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4521			vdev_t *vd = vdev_lookup_top(spa, vdev);
4522			uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4523			uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4524
4525			if (DVA_GET_GANG(&bp->blk_dva[i]))
4526				size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4527
4528			ASSERT3P(vd, !=, NULL);
4529
4530			metaslab_check_free_impl(vd, offset, size);
4531		}
4532		spa_config_exit(spa, SCL_VDEV, FTAG);
4533	}
4534
4535	static void
4536	metaslab_group_disable_wait(metaslab_group_t *mg)
4537	{
4538		ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
4539		while (mg->mg_disabled_updating) {
4540			cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
4541		}
4542	}
4543
4544	static void
4545	metaslab_group_disabled_increment(metaslab_group_t *mg)
4546	{
4547		ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
4548		ASSERT(mg->mg_disabled_updating);
4549
4550		while (mg->mg_ms_disabled >= max_disabled_ms) {
4551			cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
4552		}
4553		mg->mg_ms_disabled++;
4554		ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
4555	}
4556
4557	/*
4558	 * Mark the metaslab as disabled to prevent any allocations on this metaslab.
4559	 * We must also track how many metaslabs are currently disabled within a
4560	 * metaslab group and limit them to prevent allocation failures from
4561	 * occurring because all metaslabs are disabled.
4562	 */
4563	void
4564	metaslab_disable(metaslab_t *msp)
4565	{
4566		ASSERT(!MUTEX_HELD(&msp->ms_lock));
4567		metaslab_group_t *mg = msp->ms_group;
4568
4569		mutex_enter(&mg->mg_ms_disabled_lock);
4570
4571		/*
4572		 * To keep an accurate count of how many threads have disabled
4573		 * a specific metaslab group, we only allow one thread to mark
4574		 * the metaslab group at a time. This ensures that the value of
4575		 * ms_disabled will be accurate when we decide to mark a metaslab
4576		 * group as disabled. To do this we force all other threads
4577		 * to wait until the metaslab group's mg_disabled_updating flag
4578		 * is no longer set.
4579		 */
4580		metaslab_group_disable_wait(mg);
4581		mg->mg_disabled_updating = B_TRUE;
4582		if (msp->ms_disabled == 0) {
4583			metaslab_group_disabled_increment(mg);
4584		}
4585		mutex_enter(&msp->ms_lock);
4586		msp->ms_disabled++;
4587		mutex_exit(&msp->ms_lock);
4588
4589		mg->mg_disabled_updating = B_FALSE;
4590		cv_broadcast(&mg->mg_ms_disabled_cv);
4591		mutex_exit(&mg->mg_ms_disabled_lock);
4592	}
4593
4594	void
4595	metaslab_enable(metaslab_t *msp, boolean_t sync)
4596	{
4597		metaslab_group_t *mg = msp->ms_group;
4598		spa_t *spa = mg->mg_vd->vdev_spa;
4599
4600		/*
4601		 * Wait for the outstanding IO to be synced to prevent newly
4602		 * allocated blocks from being overwritten. This is used by
4603		 * initialize and TRIM, which are modifying unallocated space.
4604		 */
4605		if (sync)
4606			txg_wait_synced(spa_get_dsl(spa), 0);
4607
4608		mutex_enter(&mg->mg_ms_disabled_lock);
4609		mutex_enter(&msp->ms_lock);
4610		if (--msp->ms_disabled == 0) {
4611			mg->mg_ms_disabled--;
4612			cv_broadcast(&mg->mg_ms_disabled_cv);
4613		}
4614		mutex_exit(&msp->ms_lock);
4615		mutex_exit(&mg->mg_ms_disabled_lock);
4616	}
4617
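/*
 * Illustrative sketch only (not a verbatim excerpt from the initialize or
 * TRIM code): callers that modify unallocated space are expected to bracket
 * their work roughly as follows:
 *
 *	metaslab_disable(msp);
 *	... issue writes or discards to ranges that are currently free ...
 *	metaslab_enable(msp, B_TRUE);
 *
 * Passing sync == B_TRUE makes metaslab_enable() wait for the associated txg
 * to sync before the metaslab accepts allocations again, so blocks allocated
 * afterwards cannot be overwritten by I/O that was issued while the metaslab
 * was disabled.
 */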