1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright (c) 2017, Intel Corporation. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/space_map.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zfeature.h> 38 #include <sys/vdev_indirect_mapping.h> 39 #include <sys/zap.h> 40 41 #define GANG_ALLOCATION(flags) \ 42 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 43 44 uint64_t metaslab_aliquot = 512ULL << 10; 45 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 46 47 /* 48 * Since we can touch multiple metaslabs (and their respective space maps) 49 * with each transaction group, we benefit from having a smaller space map 50 * block size since it allows us to issue more I/O operations scattered 51 * around the disk. 52 */ 53 int zfs_metaslab_sm_blksz = (1 << 12); 54 55 /* 56 * The in-core space map representation is more compact than its on-disk form. 57 * The zfs_condense_pct determines how much more compact the in-core 58 * space map representation must be before we compact it on-disk. 59 * Values should be greater than or equal to 100. 60 */ 61 int zfs_condense_pct = 200; 62 63 /* 64 * Condensing a metaslab is not guaranteed to actually reduce the amount of 65 * space used on disk. In particular, a space map uses data in increments of 66 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 67 * same number of blocks after condensing. Since the goal of condensing is to 68 * reduce the number of IOPs required to read the space map, we only want to 69 * condense when we can be sure we will reduce the number of blocks used by the 70 * space map. Unfortunately, we cannot precisely compute whether or not this is 71 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 72 * we apply the following heuristic: do not condense a spacemap unless the 73 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 74 * blocks. 75 */ 76 int zfs_metaslab_condense_block_threshold = 4; 77 78 /* 79 * The zfs_mg_noalloc_threshold defines which metaslab groups should 80 * be eligible for allocation. The value is defined as a percentage of 81 * free space. Metaslab groups that have more free space than 82 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 83 * a metaslab group's free space is less than or equal to the 84 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 85 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 86 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 87 * groups are allowed to accept allocations. Gang blocks are always 88 * eligible to allocate on any metaslab group. The default value of 0 means 89 * no metaslab group will be excluded based on this criterion. 90 */ 91 int zfs_mg_noalloc_threshold = 0; 92 93 /* 94 * Metaslab groups are considered eligible for allocations if their 95 * fragmenation metric (measured as a percentage) is less than or equal to 96 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 97 * then it will be skipped unless all metaslab groups within the metaslab 98 * class have also crossed this threshold. 99 */ 100 int zfs_mg_fragmentation_threshold = 85; 101 102 /* 103 * Allow metaslabs to keep their active state as long as their fragmentation 104 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 105 * active metaslab that exceeds this threshold will no longer keep its active 106 * status allowing better metaslabs to be selected. 107 */ 108 int zfs_metaslab_fragmentation_threshold = 70; 109 110 /* 111 * When set will load all metaslabs when pool is first opened. 112 */ 113 int metaslab_debug_load = 0; 114 115 /* 116 * When set will prevent metaslabs from being unloaded. 117 */ 118 int metaslab_debug_unload = 0; 119 120 /* 121 * Minimum size which forces the dynamic allocator to change 122 * it's allocation strategy. Once the space map cannot satisfy 123 * an allocation of this size then it switches to using more 124 * aggressive strategy (i.e search by size rather than offset). 125 */ 126 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 127 128 /* 129 * The minimum free space, in percent, which must be available 130 * in a space map to continue allocations in a first-fit fashion. 131 * Once the space map's free space drops below this level we dynamically 132 * switch to using best-fit allocations. 133 */ 134 int metaslab_df_free_pct = 4; 135 136 /* 137 * A metaslab is considered "free" if it contains a contiguous 138 * segment which is greater than metaslab_min_alloc_size. 139 */ 140 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 141 142 /* 143 * Percentage of all cpus that can be used by the metaslab taskq. 144 */ 145 int metaslab_load_pct = 50; 146 147 /* 148 * Determines how many txgs a metaslab may remain loaded without having any 149 * allocations from it. As long as a metaslab continues to be used we will 150 * keep it loaded. 151 */ 152 int metaslab_unload_delay = TXG_SIZE * 2; 153 154 /* 155 * Max number of metaslabs per group to preload. 156 */ 157 int metaslab_preload_limit = SPA_DVAS_PER_BP; 158 159 /* 160 * Enable/disable preloading of metaslab. 161 */ 162 boolean_t metaslab_preload_enabled = B_TRUE; 163 164 /* 165 * Enable/disable fragmentation weighting on metaslabs. 166 */ 167 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 168 169 /* 170 * Enable/disable lba weighting (i.e. outer tracks are given preference). 171 */ 172 boolean_t metaslab_lba_weighting_enabled = B_TRUE; 173 174 /* 175 * Enable/disable metaslab group biasing. 176 */ 177 boolean_t metaslab_bias_enabled = B_TRUE; 178 179 /* 180 * Enable/disable remapping of indirect DVAs to their concrete vdevs. 181 */ 182 boolean_t zfs_remap_blkptr_enable = B_TRUE; 183 184 /* 185 * Enable/disable segment-based metaslab selection. 186 */ 187 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; 188 189 /* 190 * When using segment-based metaslab selection, we will continue 191 * allocating from the active metaslab until we have exhausted 192 * zfs_metaslab_switch_threshold of its buckets. 193 */ 194 int zfs_metaslab_switch_threshold = 2; 195 196 /* 197 * Internal switch to enable/disable the metaslab allocation tracing 198 * facility. 199 */ 200 boolean_t metaslab_trace_enabled = B_TRUE; 201 202 /* 203 * Maximum entries that the metaslab allocation tracing facility will keep 204 * in a given list when running in non-debug mode. We limit the number 205 * of entries in non-debug mode to prevent us from using up too much memory. 206 * The limit should be sufficiently large that we don't expect any allocation 207 * to every exceed this value. In debug mode, the system will panic if this 208 * limit is ever reached allowing for further investigation. 209 */ 210 uint64_t metaslab_trace_max_entries = 5000; 211 212 static uint64_t metaslab_weight(metaslab_t *); 213 static void metaslab_set_fragmentation(metaslab_t *); 214 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 215 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 216 static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 217 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); 218 219 kmem_cache_t *metaslab_alloc_trace_cache; 220 221 /* 222 * ========================================================================== 223 * Metaslab classes 224 * ========================================================================== 225 */ 226 metaslab_class_t * 227 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 228 { 229 metaslab_class_t *mc; 230 231 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 232 233 mc->mc_spa = spa; 234 mc->mc_rotor = NULL; 235 mc->mc_ops = ops; 236 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 237 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * 238 sizeof (zfs_refcount_t), KM_SLEEP); 239 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * 240 sizeof (uint64_t), KM_SLEEP); 241 for (int i = 0; i < spa->spa_alloc_count; i++) 242 zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); 243 244 return (mc); 245 } 246 247 void 248 metaslab_class_destroy(metaslab_class_t *mc) 249 { 250 ASSERT(mc->mc_rotor == NULL); 251 ASSERT(mc->mc_alloc == 0); 252 ASSERT(mc->mc_deferred == 0); 253 ASSERT(mc->mc_space == 0); 254 ASSERT(mc->mc_dspace == 0); 255 256 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) 257 zfs_refcount_destroy(&mc->mc_alloc_slots[i]); 258 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * 259 sizeof (zfs_refcount_t)); 260 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * 261 sizeof (uint64_t)); 262 mutex_destroy(&mc->mc_lock); 263 kmem_free(mc, sizeof (metaslab_class_t)); 264 } 265 266 int 267 metaslab_class_validate(metaslab_class_t *mc) 268 { 269 metaslab_group_t *mg; 270 vdev_t *vd; 271 272 /* 273 * Must hold one of the spa_config locks. 274 */ 275 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 276 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 277 278 if ((mg = mc->mc_rotor) == NULL) 279 return (0); 280 281 do { 282 vd = mg->mg_vd; 283 ASSERT(vd->vdev_mg != NULL); 284 ASSERT3P(vd->vdev_top, ==, vd); 285 ASSERT3P(mg->mg_class, ==, mc); 286 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 287 } while ((mg = mg->mg_next) != mc->mc_rotor); 288 289 return (0); 290 } 291 292 static void 293 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 294 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 295 { 296 atomic_add_64(&mc->mc_alloc, alloc_delta); 297 atomic_add_64(&mc->mc_deferred, defer_delta); 298 atomic_add_64(&mc->mc_space, space_delta); 299 atomic_add_64(&mc->mc_dspace, dspace_delta); 300 } 301 302 uint64_t 303 metaslab_class_get_alloc(metaslab_class_t *mc) 304 { 305 return (mc->mc_alloc); 306 } 307 308 uint64_t 309 metaslab_class_get_deferred(metaslab_class_t *mc) 310 { 311 return (mc->mc_deferred); 312 } 313 314 uint64_t 315 metaslab_class_get_space(metaslab_class_t *mc) 316 { 317 return (mc->mc_space); 318 } 319 320 uint64_t 321 metaslab_class_get_dspace(metaslab_class_t *mc) 322 { 323 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 324 } 325 326 void 327 metaslab_class_histogram_verify(metaslab_class_t *mc) 328 { 329 spa_t *spa = mc->mc_spa; 330 vdev_t *rvd = spa->spa_root_vdev; 331 uint64_t *mc_hist; 332 int i; 333 334 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 335 return; 336 337 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 338 KM_SLEEP); 339 340 for (int c = 0; c < rvd->vdev_children; c++) { 341 vdev_t *tvd = rvd->vdev_child[c]; 342 metaslab_group_t *mg = tvd->vdev_mg; 343 344 /* 345 * Skip any holes, uninitialized top-levels, or 346 * vdevs that are not in this metalab class. 347 */ 348 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 349 mg->mg_class != mc) { 350 continue; 351 } 352 353 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 354 mc_hist[i] += mg->mg_histogram[i]; 355 } 356 357 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 358 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 359 360 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 361 } 362 363 /* 364 * Calculate the metaslab class's fragmentation metric. The metric 365 * is weighted based on the space contribution of each metaslab group. 366 * The return value will be a number between 0 and 100 (inclusive), or 367 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 368 * zfs_frag_table for more information about the metric. 369 */ 370 uint64_t 371 metaslab_class_fragmentation(metaslab_class_t *mc) 372 { 373 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 374 uint64_t fragmentation = 0; 375 376 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 377 378 for (int c = 0; c < rvd->vdev_children; c++) { 379 vdev_t *tvd = rvd->vdev_child[c]; 380 metaslab_group_t *mg = tvd->vdev_mg; 381 382 /* 383 * Skip any holes, uninitialized top-levels, 384 * or vdevs that are not in this metalab class. 385 */ 386 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 387 mg->mg_class != mc) { 388 continue; 389 } 390 391 /* 392 * If a metaslab group does not contain a fragmentation 393 * metric then just bail out. 394 */ 395 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 396 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 397 return (ZFS_FRAG_INVALID); 398 } 399 400 /* 401 * Determine how much this metaslab_group is contributing 402 * to the overall pool fragmentation metric. 403 */ 404 fragmentation += mg->mg_fragmentation * 405 metaslab_group_get_space(mg); 406 } 407 fragmentation /= metaslab_class_get_space(mc); 408 409 ASSERT3U(fragmentation, <=, 100); 410 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 411 return (fragmentation); 412 } 413 414 /* 415 * Calculate the amount of expandable space that is available in 416 * this metaslab class. If a device is expanded then its expandable 417 * space will be the amount of allocatable space that is currently not 418 * part of this metaslab class. 419 */ 420 uint64_t 421 metaslab_class_expandable_space(metaslab_class_t *mc) 422 { 423 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 424 uint64_t space = 0; 425 426 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 427 for (int c = 0; c < rvd->vdev_children; c++) { 428 uint64_t tspace; 429 vdev_t *tvd = rvd->vdev_child[c]; 430 metaslab_group_t *mg = tvd->vdev_mg; 431 432 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 433 mg->mg_class != mc) { 434 continue; 435 } 436 437 /* 438 * Calculate if we have enough space to add additional 439 * metaslabs. We report the expandable space in terms 440 * of the metaslab size since that's the unit of expansion. 441 * Adjust by efi system partition size. 442 */ 443 tspace = tvd->vdev_max_asize - tvd->vdev_asize; 444 if (tspace > mc->mc_spa->spa_bootsize) { 445 tspace -= mc->mc_spa->spa_bootsize; 446 } 447 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); 448 } 449 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 450 return (space); 451 } 452 453 static int 454 metaslab_compare(const void *x1, const void *x2) 455 { 456 const metaslab_t *m1 = x1; 457 const metaslab_t *m2 = x2; 458 459 int sort1 = 0; 460 int sort2 = 0; 461 if (m1->ms_allocator != -1 && m1->ms_primary) 462 sort1 = 1; 463 else if (m1->ms_allocator != -1 && !m1->ms_primary) 464 sort1 = 2; 465 if (m2->ms_allocator != -1 && m2->ms_primary) 466 sort2 = 1; 467 else if (m2->ms_allocator != -1 && !m2->ms_primary) 468 sort2 = 2; 469 470 /* 471 * Sort inactive metaslabs first, then primaries, then secondaries. When 472 * selecting a metaslab to allocate from, an allocator first tries its 473 * primary, then secondary active metaslab. If it doesn't have active 474 * metaslabs, or can't allocate from them, it searches for an inactive 475 * metaslab to activate. If it can't find a suitable one, it will steal 476 * a primary or secondary metaslab from another allocator. 477 */ 478 if (sort1 < sort2) 479 return (-1); 480 if (sort1 > sort2) 481 return (1); 482 483 if (m1->ms_weight < m2->ms_weight) 484 return (1); 485 if (m1->ms_weight > m2->ms_weight) 486 return (-1); 487 488 /* 489 * If the weights are identical, use the offset to force uniqueness. 490 */ 491 if (m1->ms_start < m2->ms_start) 492 return (-1); 493 if (m1->ms_start > m2->ms_start) 494 return (1); 495 496 ASSERT3P(m1, ==, m2); 497 498 return (0); 499 } 500 501 uint64_t 502 metaslab_allocated_space(metaslab_t *msp) 503 { 504 return (msp->ms_allocated_space); 505 } 506 507 /* 508 * Verify that the space accounting on disk matches the in-core range_trees. 509 */ 510 static void 511 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 512 { 513 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 514 uint64_t allocating = 0; 515 uint64_t sm_free_space, msp_free_space; 516 517 ASSERT(MUTEX_HELD(&msp->ms_lock)); 518 ASSERT(!msp->ms_condensing); 519 520 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 521 return; 522 523 /* 524 * We can only verify the metaslab space when we're called 525 * from syncing context with a loaded metaslab that has an 526 * allocated space map. Calling this in non-syncing context 527 * does not provide a consistent view of the metaslab since 528 * we're performing allocations in the future. 529 */ 530 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 531 !msp->ms_loaded) 532 return; 533 534 /* 535 * Even though the smp_alloc field can get negative (e.g. 536 * see vdev_checkpoint_sm), that should never be the case 537 * when it come's to a metaslab's space map. 538 */ 539 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); 540 541 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); 542 543 /* 544 * Account for future allocations since we would have 545 * already deducted that space from the ms_allocatable. 546 */ 547 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 548 allocating += 549 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 550 } 551 552 ASSERT3U(msp->ms_deferspace, ==, 553 range_tree_space(msp->ms_defer[0]) + 554 range_tree_space(msp->ms_defer[1])); 555 556 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + 557 msp->ms_deferspace + range_tree_space(msp->ms_freed); 558 559 VERIFY3U(sm_free_space, ==, msp_free_space); 560 } 561 562 /* 563 * ========================================================================== 564 * Metaslab groups 565 * ========================================================================== 566 */ 567 /* 568 * Update the allocatable flag and the metaslab group's capacity. 569 * The allocatable flag is set to true if the capacity is below 570 * the zfs_mg_noalloc_threshold or has a fragmentation value that is 571 * greater than zfs_mg_fragmentation_threshold. If a metaslab group 572 * transitions from allocatable to non-allocatable or vice versa then the 573 * metaslab group's class is updated to reflect the transition. 574 */ 575 static void 576 metaslab_group_alloc_update(metaslab_group_t *mg) 577 { 578 vdev_t *vd = mg->mg_vd; 579 metaslab_class_t *mc = mg->mg_class; 580 vdev_stat_t *vs = &vd->vdev_stat; 581 boolean_t was_allocatable; 582 boolean_t was_initialized; 583 584 ASSERT(vd == vd->vdev_top); 585 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, 586 SCL_ALLOC); 587 588 mutex_enter(&mg->mg_lock); 589 was_allocatable = mg->mg_allocatable; 590 was_initialized = mg->mg_initialized; 591 592 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 593 (vs->vs_space + 1); 594 595 mutex_enter(&mc->mc_lock); 596 597 /* 598 * If the metaslab group was just added then it won't 599 * have any space until we finish syncing out this txg. 600 * At that point we will consider it initialized and available 601 * for allocations. We also don't consider non-activated 602 * metaslab groups (e.g. vdevs that are in the middle of being removed) 603 * to be initialized, because they can't be used for allocation. 604 */ 605 mg->mg_initialized = metaslab_group_initialized(mg); 606 if (!was_initialized && mg->mg_initialized) { 607 mc->mc_groups++; 608 } else if (was_initialized && !mg->mg_initialized) { 609 ASSERT3U(mc->mc_groups, >, 0); 610 mc->mc_groups--; 611 } 612 if (mg->mg_initialized) 613 mg->mg_no_free_space = B_FALSE; 614 615 /* 616 * A metaslab group is considered allocatable if it has plenty 617 * of free space or is not heavily fragmented. We only take 618 * fragmentation into account if the metaslab group has a valid 619 * fragmentation metric (i.e. a value between 0 and 100). 620 */ 621 mg->mg_allocatable = (mg->mg_activation_count > 0 && 622 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 623 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 624 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 625 626 /* 627 * The mc_alloc_groups maintains a count of the number of 628 * groups in this metaslab class that are still above the 629 * zfs_mg_noalloc_threshold. This is used by the allocating 630 * threads to determine if they should avoid allocations to 631 * a given group. The allocator will avoid allocations to a group 632 * if that group has reached or is below the zfs_mg_noalloc_threshold 633 * and there are still other groups that are above the threshold. 634 * When a group transitions from allocatable to non-allocatable or 635 * vice versa we update the metaslab class to reflect that change. 636 * When the mc_alloc_groups value drops to 0 that means that all 637 * groups have reached the zfs_mg_noalloc_threshold making all groups 638 * eligible for allocations. This effectively means that all devices 639 * are balanced again. 640 */ 641 if (was_allocatable && !mg->mg_allocatable) 642 mc->mc_alloc_groups--; 643 else if (!was_allocatable && mg->mg_allocatable) 644 mc->mc_alloc_groups++; 645 mutex_exit(&mc->mc_lock); 646 647 mutex_exit(&mg->mg_lock); 648 } 649 650 metaslab_group_t * 651 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 652 { 653 metaslab_group_t *mg; 654 655 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 656 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 657 mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); 658 cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); 659 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 660 KM_SLEEP); 661 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 662 KM_SLEEP); 663 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 664 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 665 mg->mg_vd = vd; 666 mg->mg_class = mc; 667 mg->mg_activation_count = 0; 668 mg->mg_initialized = B_FALSE; 669 mg->mg_no_free_space = B_TRUE; 670 mg->mg_allocators = allocators; 671 672 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * 673 sizeof (zfs_refcount_t), KM_SLEEP); 674 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * 675 sizeof (uint64_t), KM_SLEEP); 676 for (int i = 0; i < allocators; i++) { 677 zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); 678 mg->mg_cur_max_alloc_queue_depth[i] = 0; 679 } 680 681 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 682 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 683 684 return (mg); 685 } 686 687 void 688 metaslab_group_destroy(metaslab_group_t *mg) 689 { 690 ASSERT(mg->mg_prev == NULL); 691 ASSERT(mg->mg_next == NULL); 692 /* 693 * We may have gone below zero with the activation count 694 * either because we never activated in the first place or 695 * because we're done, and possibly removing the vdev. 696 */ 697 ASSERT(mg->mg_activation_count <= 0); 698 699 taskq_destroy(mg->mg_taskq); 700 avl_destroy(&mg->mg_metaslab_tree); 701 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); 702 kmem_free(mg->mg_secondaries, mg->mg_allocators * 703 sizeof (metaslab_t *)); 704 mutex_destroy(&mg->mg_lock); 705 mutex_destroy(&mg->mg_ms_initialize_lock); 706 cv_destroy(&mg->mg_ms_initialize_cv); 707 708 for (int i = 0; i < mg->mg_allocators; i++) { 709 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); 710 mg->mg_cur_max_alloc_queue_depth[i] = 0; 711 } 712 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * 713 sizeof (zfs_refcount_t)); 714 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * 715 sizeof (uint64_t)); 716 717 kmem_free(mg, sizeof (metaslab_group_t)); 718 } 719 720 void 721 metaslab_group_activate(metaslab_group_t *mg) 722 { 723 metaslab_class_t *mc = mg->mg_class; 724 metaslab_group_t *mgprev, *mgnext; 725 726 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 727 728 ASSERT(mc->mc_rotor != mg); 729 ASSERT(mg->mg_prev == NULL); 730 ASSERT(mg->mg_next == NULL); 731 ASSERT(mg->mg_activation_count <= 0); 732 733 if (++mg->mg_activation_count <= 0) 734 return; 735 736 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 737 metaslab_group_alloc_update(mg); 738 739 if ((mgprev = mc->mc_rotor) == NULL) { 740 mg->mg_prev = mg; 741 mg->mg_next = mg; 742 } else { 743 mgnext = mgprev->mg_next; 744 mg->mg_prev = mgprev; 745 mg->mg_next = mgnext; 746 mgprev->mg_next = mg; 747 mgnext->mg_prev = mg; 748 } 749 mc->mc_rotor = mg; 750 } 751 752 /* 753 * Passivate a metaslab group and remove it from the allocation rotor. 754 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 755 * a metaslab group. This function will momentarily drop spa_config_locks 756 * that are lower than the SCL_ALLOC lock (see comment below). 757 */ 758 void 759 metaslab_group_passivate(metaslab_group_t *mg) 760 { 761 metaslab_class_t *mc = mg->mg_class; 762 spa_t *spa = mc->mc_spa; 763 metaslab_group_t *mgprev, *mgnext; 764 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 765 766 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 767 (SCL_ALLOC | SCL_ZIO)); 768 769 if (--mg->mg_activation_count != 0) { 770 ASSERT(mc->mc_rotor != mg); 771 ASSERT(mg->mg_prev == NULL); 772 ASSERT(mg->mg_next == NULL); 773 ASSERT(mg->mg_activation_count < 0); 774 return; 775 } 776 777 /* 778 * The spa_config_lock is an array of rwlocks, ordered as 779 * follows (from highest to lowest): 780 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 781 * SCL_ZIO > SCL_FREE > SCL_VDEV 782 * (For more information about the spa_config_lock see spa_misc.c) 783 * The higher the lock, the broader its coverage. When we passivate 784 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 785 * config locks. However, the metaslab group's taskq might be trying 786 * to preload metaslabs so we must drop the SCL_ZIO lock and any 787 * lower locks to allow the I/O to complete. At a minimum, 788 * we continue to hold the SCL_ALLOC lock, which prevents any future 789 * allocations from taking place and any changes to the vdev tree. 790 */ 791 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 792 taskq_wait(mg->mg_taskq); 793 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 794 metaslab_group_alloc_update(mg); 795 for (int i = 0; i < mg->mg_allocators; i++) { 796 metaslab_t *msp = mg->mg_primaries[i]; 797 if (msp != NULL) { 798 mutex_enter(&msp->ms_lock); 799 metaslab_passivate(msp, 800 metaslab_weight_from_range_tree(msp)); 801 mutex_exit(&msp->ms_lock); 802 } 803 msp = mg->mg_secondaries[i]; 804 if (msp != NULL) { 805 mutex_enter(&msp->ms_lock); 806 metaslab_passivate(msp, 807 metaslab_weight_from_range_tree(msp)); 808 mutex_exit(&msp->ms_lock); 809 } 810 } 811 812 mgprev = mg->mg_prev; 813 mgnext = mg->mg_next; 814 815 if (mg == mgnext) { 816 mc->mc_rotor = NULL; 817 } else { 818 mc->mc_rotor = mgnext; 819 mgprev->mg_next = mgnext; 820 mgnext->mg_prev = mgprev; 821 } 822 823 mg->mg_prev = NULL; 824 mg->mg_next = NULL; 825 } 826 827 boolean_t 828 metaslab_group_initialized(metaslab_group_t *mg) 829 { 830 vdev_t *vd = mg->mg_vd; 831 vdev_stat_t *vs = &vd->vdev_stat; 832 833 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 834 } 835 836 uint64_t 837 metaslab_group_get_space(metaslab_group_t *mg) 838 { 839 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 840 } 841 842 void 843 metaslab_group_histogram_verify(metaslab_group_t *mg) 844 { 845 uint64_t *mg_hist; 846 vdev_t *vd = mg->mg_vd; 847 uint64_t ashift = vd->vdev_ashift; 848 int i; 849 850 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 851 return; 852 853 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 854 KM_SLEEP); 855 856 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 857 SPACE_MAP_HISTOGRAM_SIZE + ashift); 858 859 for (int m = 0; m < vd->vdev_ms_count; m++) { 860 metaslab_t *msp = vd->vdev_ms[m]; 861 ASSERT(msp != NULL); 862 863 /* skip if not active or not a member */ 864 if (msp->ms_sm == NULL || msp->ms_group != mg) 865 continue; 866 867 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 868 mg_hist[i + ashift] += 869 msp->ms_sm->sm_phys->smp_histogram[i]; 870 } 871 872 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 873 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 874 875 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 876 } 877 878 static void 879 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 880 { 881 metaslab_class_t *mc = mg->mg_class; 882 uint64_t ashift = mg->mg_vd->vdev_ashift; 883 884 ASSERT(MUTEX_HELD(&msp->ms_lock)); 885 if (msp->ms_sm == NULL) 886 return; 887 888 mutex_enter(&mg->mg_lock); 889 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 890 mg->mg_histogram[i + ashift] += 891 msp->ms_sm->sm_phys->smp_histogram[i]; 892 mc->mc_histogram[i + ashift] += 893 msp->ms_sm->sm_phys->smp_histogram[i]; 894 } 895 mutex_exit(&mg->mg_lock); 896 } 897 898 void 899 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 900 { 901 metaslab_class_t *mc = mg->mg_class; 902 uint64_t ashift = mg->mg_vd->vdev_ashift; 903 904 ASSERT(MUTEX_HELD(&msp->ms_lock)); 905 if (msp->ms_sm == NULL) 906 return; 907 908 mutex_enter(&mg->mg_lock); 909 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 910 ASSERT3U(mg->mg_histogram[i + ashift], >=, 911 msp->ms_sm->sm_phys->smp_histogram[i]); 912 ASSERT3U(mc->mc_histogram[i + ashift], >=, 913 msp->ms_sm->sm_phys->smp_histogram[i]); 914 915 mg->mg_histogram[i + ashift] -= 916 msp->ms_sm->sm_phys->smp_histogram[i]; 917 mc->mc_histogram[i + ashift] -= 918 msp->ms_sm->sm_phys->smp_histogram[i]; 919 } 920 mutex_exit(&mg->mg_lock); 921 } 922 923 static void 924 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 925 { 926 ASSERT(msp->ms_group == NULL); 927 mutex_enter(&mg->mg_lock); 928 msp->ms_group = mg; 929 msp->ms_weight = 0; 930 avl_add(&mg->mg_metaslab_tree, msp); 931 mutex_exit(&mg->mg_lock); 932 933 mutex_enter(&msp->ms_lock); 934 metaslab_group_histogram_add(mg, msp); 935 mutex_exit(&msp->ms_lock); 936 } 937 938 static void 939 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 940 { 941 mutex_enter(&msp->ms_lock); 942 metaslab_group_histogram_remove(mg, msp); 943 mutex_exit(&msp->ms_lock); 944 945 mutex_enter(&mg->mg_lock); 946 ASSERT(msp->ms_group == mg); 947 avl_remove(&mg->mg_metaslab_tree, msp); 948 msp->ms_group = NULL; 949 mutex_exit(&mg->mg_lock); 950 } 951 952 static void 953 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 954 { 955 ASSERT(MUTEX_HELD(&mg->mg_lock)); 956 ASSERT(msp->ms_group == mg); 957 avl_remove(&mg->mg_metaslab_tree, msp); 958 msp->ms_weight = weight; 959 avl_add(&mg->mg_metaslab_tree, msp); 960 961 } 962 963 static void 964 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 965 { 966 /* 967 * Although in principle the weight can be any value, in 968 * practice we do not use values in the range [1, 511]. 969 */ 970 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 971 ASSERT(MUTEX_HELD(&msp->ms_lock)); 972 973 mutex_enter(&mg->mg_lock); 974 metaslab_group_sort_impl(mg, msp, weight); 975 mutex_exit(&mg->mg_lock); 976 } 977 978 /* 979 * Calculate the fragmentation for a given metaslab group. We can use 980 * a simple average here since all metaslabs within the group must have 981 * the same size. The return value will be a value between 0 and 100 982 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 983 * group have a fragmentation metric. 984 */ 985 uint64_t 986 metaslab_group_fragmentation(metaslab_group_t *mg) 987 { 988 vdev_t *vd = mg->mg_vd; 989 uint64_t fragmentation = 0; 990 uint64_t valid_ms = 0; 991 992 for (int m = 0; m < vd->vdev_ms_count; m++) { 993 metaslab_t *msp = vd->vdev_ms[m]; 994 995 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 996 continue; 997 if (msp->ms_group != mg) 998 continue; 999 1000 valid_ms++; 1001 fragmentation += msp->ms_fragmentation; 1002 } 1003 1004 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) 1005 return (ZFS_FRAG_INVALID); 1006 1007 fragmentation /= valid_ms; 1008 ASSERT3U(fragmentation, <=, 100); 1009 return (fragmentation); 1010 } 1011 1012 /* 1013 * Determine if a given metaslab group should skip allocations. A metaslab 1014 * group should avoid allocations if its free capacity is less than the 1015 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 1016 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1017 * that can still handle allocations. If the allocation throttle is enabled 1018 * then we skip allocations to devices that have reached their maximum 1019 * allocation queue depth unless the selected metaslab group is the only 1020 * eligible group remaining. 1021 */ 1022 static boolean_t 1023 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 1024 uint64_t psize, int allocator) 1025 { 1026 spa_t *spa = mg->mg_vd->vdev_spa; 1027 metaslab_class_t *mc = mg->mg_class; 1028 1029 /* 1030 * We can only consider skipping this metaslab group if it's 1031 * in the normal metaslab class and there are other metaslab 1032 * groups to select from. Otherwise, we always consider it eligible 1033 * for allocations. 1034 */ 1035 if ((mc != spa_normal_class(spa) && 1036 mc != spa_special_class(spa) && 1037 mc != spa_dedup_class(spa)) || 1038 mc->mc_groups <= 1) 1039 return (B_TRUE); 1040 1041 /* 1042 * If the metaslab group's mg_allocatable flag is set (see comments 1043 * in metaslab_group_alloc_update() for more information) and 1044 * the allocation throttle is disabled then allow allocations to this 1045 * device. However, if the allocation throttle is enabled then 1046 * check if we have reached our allocation limit (mg_alloc_queue_depth) 1047 * to determine if we should allow allocations to this metaslab group. 1048 * If all metaslab groups are no longer considered allocatable 1049 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 1050 * gang block size then we allow allocations on this metaslab group 1051 * regardless of the mg_allocatable or throttle settings. 1052 */ 1053 if (mg->mg_allocatable) { 1054 metaslab_group_t *mgp; 1055 int64_t qdepth; 1056 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; 1057 1058 if (!mc->mc_alloc_throttle_enabled) 1059 return (B_TRUE); 1060 1061 /* 1062 * If this metaslab group does not have any free space, then 1063 * there is no point in looking further. 1064 */ 1065 if (mg->mg_no_free_space) 1066 return (B_FALSE); 1067 1068 qdepth = zfs_refcount_count( 1069 &mg->mg_alloc_queue_depth[allocator]); 1070 1071 /* 1072 * If this metaslab group is below its qmax or it's 1073 * the only allocatable metasable group, then attempt 1074 * to allocate from it. 1075 */ 1076 if (qdepth < qmax || mc->mc_alloc_groups == 1) 1077 return (B_TRUE); 1078 ASSERT3U(mc->mc_alloc_groups, >, 1); 1079 1080 /* 1081 * Since this metaslab group is at or over its qmax, we 1082 * need to determine if there are metaslab groups after this 1083 * one that might be able to handle this allocation. This is 1084 * racy since we can't hold the locks for all metaslab 1085 * groups at the same time when we make this check. 1086 */ 1087 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { 1088 qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; 1089 1090 qdepth = zfs_refcount_count( 1091 &mgp->mg_alloc_queue_depth[allocator]); 1092 1093 /* 1094 * If there is another metaslab group that 1095 * might be able to handle the allocation, then 1096 * we return false so that we skip this group. 1097 */ 1098 if (qdepth < qmax && !mgp->mg_no_free_space) 1099 return (B_FALSE); 1100 } 1101 1102 /* 1103 * We didn't find another group to handle the allocation 1104 * so we can't skip this metaslab group even though 1105 * we are at or over our qmax. 1106 */ 1107 return (B_TRUE); 1108 1109 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 1110 return (B_TRUE); 1111 } 1112 return (B_FALSE); 1113 } 1114 1115 /* 1116 * ========================================================================== 1117 * Range tree callbacks 1118 * ========================================================================== 1119 */ 1120 1121 /* 1122 * Comparison function for the private size-ordered tree. Tree is sorted 1123 * by size, larger sizes at the end of the tree. 1124 */ 1125 static int 1126 metaslab_rangesize_compare(const void *x1, const void *x2) 1127 { 1128 const range_seg_t *r1 = x1; 1129 const range_seg_t *r2 = x2; 1130 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1131 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1132 1133 if (rs_size1 < rs_size2) 1134 return (-1); 1135 if (rs_size1 > rs_size2) 1136 return (1); 1137 1138 if (r1->rs_start < r2->rs_start) 1139 return (-1); 1140 1141 if (r1->rs_start > r2->rs_start) 1142 return (1); 1143 1144 return (0); 1145 } 1146 1147 /* 1148 * Create any block allocator specific components. The current allocators 1149 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 1150 */ 1151 static void 1152 metaslab_rt_create(range_tree_t *rt, void *arg) 1153 { 1154 metaslab_t *msp = arg; 1155 1156 ASSERT3P(rt->rt_arg, ==, msp); 1157 ASSERT(msp->ms_allocatable == NULL); 1158 1159 avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, 1160 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 1161 } 1162 1163 /* 1164 * Destroy the block allocator specific components. 1165 */ 1166 static void 1167 metaslab_rt_destroy(range_tree_t *rt, void *arg) 1168 { 1169 metaslab_t *msp = arg; 1170 1171 ASSERT3P(rt->rt_arg, ==, msp); 1172 ASSERT3P(msp->ms_allocatable, ==, rt); 1173 ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size)); 1174 1175 avl_destroy(&msp->ms_allocatable_by_size); 1176 } 1177 1178 static void 1179 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 1180 { 1181 metaslab_t *msp = arg; 1182 1183 ASSERT3P(rt->rt_arg, ==, msp); 1184 ASSERT3P(msp->ms_allocatable, ==, rt); 1185 VERIFY(!msp->ms_condensing); 1186 avl_add(&msp->ms_allocatable_by_size, rs); 1187 } 1188 1189 static void 1190 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 1191 { 1192 metaslab_t *msp = arg; 1193 1194 ASSERT3P(rt->rt_arg, ==, msp); 1195 ASSERT3P(msp->ms_allocatable, ==, rt); 1196 VERIFY(!msp->ms_condensing); 1197 avl_remove(&msp->ms_allocatable_by_size, rs); 1198 } 1199 1200 static void 1201 metaslab_rt_vacate(range_tree_t *rt, void *arg) 1202 { 1203 metaslab_t *msp = arg; 1204 1205 ASSERT3P(rt->rt_arg, ==, msp); 1206 ASSERT3P(msp->ms_allocatable, ==, rt); 1207 1208 /* 1209 * Normally one would walk the tree freeing nodes along the way. 1210 * Since the nodes are shared with the range trees we can avoid 1211 * walking all nodes and just reinitialize the avl tree. The nodes 1212 * will be freed by the range tree, so we don't want to free them here. 1213 */ 1214 avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, 1215 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 1216 } 1217 1218 static range_tree_ops_t metaslab_rt_ops = { 1219 metaslab_rt_create, 1220 metaslab_rt_destroy, 1221 metaslab_rt_add, 1222 metaslab_rt_remove, 1223 metaslab_rt_vacate 1224 }; 1225 1226 /* 1227 * ========================================================================== 1228 * Common allocator routines 1229 * ========================================================================== 1230 */ 1231 1232 /* 1233 * Return the maximum contiguous segment within the metaslab. 1234 */ 1235 uint64_t 1236 metaslab_block_maxsize(metaslab_t *msp) 1237 { 1238 avl_tree_t *t = &msp->ms_allocatable_by_size; 1239 range_seg_t *rs; 1240 1241 if (t == NULL || (rs = avl_last(t)) == NULL) 1242 return (0ULL); 1243 1244 return (rs->rs_end - rs->rs_start); 1245 } 1246 1247 static range_seg_t * 1248 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) 1249 { 1250 range_seg_t *rs, rsearch; 1251 avl_index_t where; 1252 1253 rsearch.rs_start = start; 1254 rsearch.rs_end = start + size; 1255 1256 rs = avl_find(t, &rsearch, &where); 1257 if (rs == NULL) { 1258 rs = avl_nearest(t, where, AVL_AFTER); 1259 } 1260 1261 return (rs); 1262 } 1263 1264 /* 1265 * This is a helper function that can be used by the allocator to find 1266 * a suitable block to allocate. This will search the specified AVL 1267 * tree looking for a block that matches the specified criteria. 1268 */ 1269 static uint64_t 1270 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1271 uint64_t align) 1272 { 1273 range_seg_t *rs = metaslab_block_find(t, *cursor, size); 1274 1275 while (rs != NULL) { 1276 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1277 1278 if (offset + size <= rs->rs_end) { 1279 *cursor = offset + size; 1280 return (offset); 1281 } 1282 rs = AVL_NEXT(t, rs); 1283 } 1284 1285 /* 1286 * If we know we've searched the whole map (*cursor == 0), give up. 1287 * Otherwise, reset the cursor to the beginning and try again. 1288 */ 1289 if (*cursor == 0) 1290 return (-1ULL); 1291 1292 *cursor = 0; 1293 return (metaslab_block_picker(t, cursor, size, align)); 1294 } 1295 1296 /* 1297 * ========================================================================== 1298 * The first-fit block allocator 1299 * ========================================================================== 1300 */ 1301 static uint64_t 1302 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1303 { 1304 /* 1305 * Find the largest power of 2 block size that evenly divides the 1306 * requested size. This is used to try to allocate blocks with similar 1307 * alignment from the same area of the metaslab (i.e. same cursor 1308 * bucket) but it does not guarantee that other allocations sizes 1309 * may exist in the same region. 1310 */ 1311 uint64_t align = size & -size; 1312 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1313 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1314 1315 return (metaslab_block_picker(t, cursor, size, align)); 1316 } 1317 1318 static metaslab_ops_t metaslab_ff_ops = { 1319 metaslab_ff_alloc 1320 }; 1321 1322 /* 1323 * ========================================================================== 1324 * Dynamic block allocator - 1325 * Uses the first fit allocation scheme until space get low and then 1326 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1327 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1328 * ========================================================================== 1329 */ 1330 static uint64_t 1331 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1332 { 1333 /* 1334 * Find the largest power of 2 block size that evenly divides the 1335 * requested size. This is used to try to allocate blocks with similar 1336 * alignment from the same area of the metaslab (i.e. same cursor 1337 * bucket) but it does not guarantee that other allocations sizes 1338 * may exist in the same region. 1339 */ 1340 uint64_t align = size & -size; 1341 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1342 range_tree_t *rt = msp->ms_allocatable; 1343 avl_tree_t *t = &rt->rt_root; 1344 uint64_t max_size = metaslab_block_maxsize(msp); 1345 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1346 1347 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1348 ASSERT3U(avl_numnodes(t), ==, 1349 avl_numnodes(&msp->ms_allocatable_by_size)); 1350 1351 if (max_size < size) 1352 return (-1ULL); 1353 1354 /* 1355 * If we're running low on space switch to using the size 1356 * sorted AVL tree (best-fit). 1357 */ 1358 if (max_size < metaslab_df_alloc_threshold || 1359 free_pct < metaslab_df_free_pct) { 1360 t = &msp->ms_allocatable_by_size; 1361 *cursor = 0; 1362 } 1363 1364 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1365 } 1366 1367 static metaslab_ops_t metaslab_df_ops = { 1368 metaslab_df_alloc 1369 }; 1370 1371 /* 1372 * ========================================================================== 1373 * Cursor fit block allocator - 1374 * Select the largest region in the metaslab, set the cursor to the beginning 1375 * of the range and the cursor_end to the end of the range. As allocations 1376 * are made advance the cursor. Continue allocating from the cursor until 1377 * the range is exhausted and then find a new range. 1378 * ========================================================================== 1379 */ 1380 static uint64_t 1381 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1382 { 1383 range_tree_t *rt = msp->ms_allocatable; 1384 avl_tree_t *t = &msp->ms_allocatable_by_size; 1385 uint64_t *cursor = &msp->ms_lbas[0]; 1386 uint64_t *cursor_end = &msp->ms_lbas[1]; 1387 uint64_t offset = 0; 1388 1389 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1390 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1391 1392 ASSERT3U(*cursor_end, >=, *cursor); 1393 1394 if ((*cursor + size) > *cursor_end) { 1395 range_seg_t *rs; 1396 1397 rs = avl_last(&msp->ms_allocatable_by_size); 1398 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1399 return (-1ULL); 1400 1401 *cursor = rs->rs_start; 1402 *cursor_end = rs->rs_end; 1403 } 1404 1405 offset = *cursor; 1406 *cursor += size; 1407 1408 return (offset); 1409 } 1410 1411 static metaslab_ops_t metaslab_cf_ops = { 1412 metaslab_cf_alloc 1413 }; 1414 1415 /* 1416 * ========================================================================== 1417 * New dynamic fit allocator - 1418 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1419 * contiguous blocks. If no region is found then just use the largest segment 1420 * that remains. 1421 * ========================================================================== 1422 */ 1423 1424 /* 1425 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1426 * to request from the allocator. 1427 */ 1428 uint64_t metaslab_ndf_clump_shift = 4; 1429 1430 static uint64_t 1431 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1432 { 1433 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1434 avl_index_t where; 1435 range_seg_t *rs, rsearch; 1436 uint64_t hbit = highbit64(size); 1437 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1438 uint64_t max_size = metaslab_block_maxsize(msp); 1439 1440 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1441 ASSERT3U(avl_numnodes(t), ==, 1442 avl_numnodes(&msp->ms_allocatable_by_size)); 1443 1444 if (max_size < size) 1445 return (-1ULL); 1446 1447 rsearch.rs_start = *cursor; 1448 rsearch.rs_end = *cursor + size; 1449 1450 rs = avl_find(t, &rsearch, &where); 1451 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1452 t = &msp->ms_allocatable_by_size; 1453 1454 rsearch.rs_start = 0; 1455 rsearch.rs_end = MIN(max_size, 1456 1ULL << (hbit + metaslab_ndf_clump_shift)); 1457 rs = avl_find(t, &rsearch, &where); 1458 if (rs == NULL) 1459 rs = avl_nearest(t, where, AVL_AFTER); 1460 ASSERT(rs != NULL); 1461 } 1462 1463 if ((rs->rs_end - rs->rs_start) >= size) { 1464 *cursor = rs->rs_start + size; 1465 return (rs->rs_start); 1466 } 1467 return (-1ULL); 1468 } 1469 1470 static metaslab_ops_t metaslab_ndf_ops = { 1471 metaslab_ndf_alloc 1472 }; 1473 1474 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1475 1476 /* 1477 * ========================================================================== 1478 * Metaslabs 1479 * ========================================================================== 1480 */ 1481 1482 static void 1483 metaslab_aux_histograms_clear(metaslab_t *msp) 1484 { 1485 /* 1486 * Auxiliary histograms are only cleared when resetting them, 1487 * which can only happen while the metaslab is loaded. 1488 */ 1489 ASSERT(msp->ms_loaded); 1490 1491 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1492 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1493 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); 1494 } 1495 1496 static void 1497 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, 1498 range_tree_t *rt) 1499 { 1500 /* 1501 * This is modeled after space_map_histogram_add(), so refer to that 1502 * function for implementation details. We want this to work like 1503 * the space map histogram, and not the range tree histogram, as we 1504 * are essentially constructing a delta that will be later subtracted 1505 * from the space map histogram. 1506 */ 1507 int idx = 0; 1508 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 1509 ASSERT3U(i, >=, idx + shift); 1510 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); 1511 1512 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { 1513 ASSERT3U(idx + shift, ==, i); 1514 idx++; 1515 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); 1516 } 1517 } 1518 } 1519 1520 /* 1521 * Called at every sync pass that the metaslab gets synced. 1522 * 1523 * The reason is that we want our auxiliary histograms to be updated 1524 * wherever the metaslab's space map histogram is updated. This way 1525 * we stay consistent on which parts of the metaslab space map's 1526 * histogram are currently not available for allocations (e.g because 1527 * they are in the defer, freed, and freeing trees). 1528 */ 1529 static void 1530 metaslab_aux_histograms_update(metaslab_t *msp) 1531 { 1532 space_map_t *sm = msp->ms_sm; 1533 ASSERT(sm != NULL); 1534 1535 /* 1536 * This is similar to the metaslab's space map histogram updates 1537 * that take place in metaslab_sync(). The only difference is that 1538 * we only care about segments that haven't made it into the 1539 * ms_allocatable tree yet. 1540 */ 1541 if (msp->ms_loaded) { 1542 metaslab_aux_histograms_clear(msp); 1543 1544 metaslab_aux_histogram_add(msp->ms_synchist, 1545 sm->sm_shift, msp->ms_freed); 1546 1547 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1548 metaslab_aux_histogram_add(msp->ms_deferhist[t], 1549 sm->sm_shift, msp->ms_defer[t]); 1550 } 1551 } 1552 1553 metaslab_aux_histogram_add(msp->ms_synchist, 1554 sm->sm_shift, msp->ms_freeing); 1555 } 1556 1557 /* 1558 * Called every time we are done syncing (writing to) the metaslab, 1559 * i.e. at the end of each sync pass. 1560 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] 1561 */ 1562 static void 1563 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) 1564 { 1565 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1566 space_map_t *sm = msp->ms_sm; 1567 1568 if (sm == NULL) { 1569 /* 1570 * We came here from metaslab_init() when creating/opening a 1571 * pool, looking at a metaslab that hasn't had any allocations 1572 * yet. 1573 */ 1574 return; 1575 } 1576 1577 /* 1578 * This is similar to the actions that we take for the ms_freed 1579 * and ms_defer trees in metaslab_sync_done(). 1580 */ 1581 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; 1582 if (defer_allowed) { 1583 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], 1584 sizeof (msp->ms_synchist)); 1585 } else { 1586 bzero(msp->ms_deferhist[hist_index], 1587 sizeof (msp->ms_deferhist[hist_index])); 1588 } 1589 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1590 } 1591 1592 /* 1593 * Ensure that the metaslab's weight and fragmentation are consistent 1594 * with the contents of the histogram (either the range tree's histogram 1595 * or the space map's depending whether the metaslab is loaded). 1596 */ 1597 static void 1598 metaslab_verify_weight_and_frag(metaslab_t *msp) 1599 { 1600 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1601 1602 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1603 return; 1604 1605 /* see comment in metaslab_verify_unflushed_changes() */ 1606 if (msp->ms_group == NULL) 1607 return; 1608 1609 /* 1610 * Devices being removed always return a weight of 0 and leave 1611 * fragmentation and ms_max_size as is - there is nothing for 1612 * us to verify here. 1613 */ 1614 vdev_t *vd = msp->ms_group->mg_vd; 1615 if (vd->vdev_removing) 1616 return; 1617 1618 /* 1619 * If the metaslab is dirty it probably means that we've done 1620 * some allocations or frees that have changed our histograms 1621 * and thus the weight. 1622 */ 1623 for (int t = 0; t < TXG_SIZE; t++) { 1624 if (txg_list_member(&vd->vdev_ms_list, msp, t)) 1625 return; 1626 } 1627 1628 /* 1629 * This verification checks that our in-memory state is consistent 1630 * with what's on disk. If the pool is read-only then there aren't 1631 * any changes and we just have the initially-loaded state. 1632 */ 1633 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) 1634 return; 1635 1636 /* some extra verification for in-core tree if you can */ 1637 if (msp->ms_loaded) { 1638 range_tree_stat_verify(msp->ms_allocatable); 1639 VERIFY(space_map_histogram_verify(msp->ms_sm, 1640 msp->ms_allocatable)); 1641 } 1642 1643 uint64_t weight = msp->ms_weight; 1644 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1645 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); 1646 uint64_t frag = msp->ms_fragmentation; 1647 uint64_t max_segsize = msp->ms_max_size; 1648 1649 msp->ms_weight = 0; 1650 msp->ms_fragmentation = 0; 1651 msp->ms_max_size = 0; 1652 1653 /* 1654 * This function is used for verification purposes. Regardless of 1655 * whether metaslab_weight() thinks this metaslab should be active or 1656 * not, we want to ensure that the actual weight (and therefore the 1657 * value of ms_weight) would be the same if it was to be recalculated 1658 * at this point. 1659 */ 1660 msp->ms_weight = metaslab_weight(msp) | was_active; 1661 1662 VERIFY3U(max_segsize, ==, msp->ms_max_size); 1663 1664 /* 1665 * If the weight type changed then there is no point in doing 1666 * verification. Revert fields to their original values. 1667 */ 1668 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || 1669 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { 1670 msp->ms_fragmentation = frag; 1671 msp->ms_weight = weight; 1672 return; 1673 } 1674 1675 VERIFY3U(msp->ms_fragmentation, ==, frag); 1676 VERIFY3U(msp->ms_weight, ==, weight); 1677 } 1678 1679 /* 1680 * Wait for any in-progress metaslab loads to complete. 1681 */ 1682 static void 1683 metaslab_load_wait(metaslab_t *msp) 1684 { 1685 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1686 1687 while (msp->ms_loading) { 1688 ASSERT(!msp->ms_loaded); 1689 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1690 } 1691 } 1692 1693 static int 1694 metaslab_load_impl(metaslab_t *msp) 1695 { 1696 int error = 0; 1697 1698 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1699 ASSERT(msp->ms_loading); 1700 ASSERT(!msp->ms_condensing); 1701 1702 /* 1703 * We temporarily drop the lock to unblock other operations while we 1704 * are reading the space map. Therefore, metaslab_sync() and 1705 * metaslab_sync_done() can run at the same time as we do. 1706 * 1707 * metaslab_sync() can append to the space map while we are loading. 1708 * Therefore we load only entries that existed when we started the 1709 * load. Additionally, metaslab_sync_done() has to wait for the load 1710 * to complete because there are potential races like metaslab_load() 1711 * loading parts of the space map that are currently being appended 1712 * by metaslab_sync(). If we didn't, the ms_allocatable would have 1713 * entries that metaslab_sync_done() would try to re-add later. 1714 * 1715 * That's why before dropping the lock we remember the synced length 1716 * of the metaslab and read up to that point of the space map, 1717 * ignoring entries appended by metaslab_sync() that happen after we 1718 * drop the lock. 1719 */ 1720 uint64_t length = msp->ms_synced_length; 1721 mutex_exit(&msp->ms_lock); 1722 1723 if (msp->ms_sm != NULL) { 1724 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, 1725 SM_FREE, length); 1726 } else { 1727 /* 1728 * The space map has not been allocated yet, so treat 1729 * all the space in the metaslab as free and add it to the 1730 * ms_allocatable tree. 1731 */ 1732 range_tree_add(msp->ms_allocatable, 1733 msp->ms_start, msp->ms_size); 1734 } 1735 1736 /* 1737 * We need to grab the ms_sync_lock to prevent metaslab_sync() from 1738 * changing the ms_sm and the metaslab's range trees while we are 1739 * about to use them and populate the ms_allocatable. The ms_lock 1740 * is insufficient for this because metaslab_sync() doesn't hold 1741 * the ms_lock while writing the ms_checkpointing tree to disk. 1742 */ 1743 mutex_enter(&msp->ms_sync_lock); 1744 mutex_enter(&msp->ms_lock); 1745 ASSERT(!msp->ms_condensing); 1746 1747 if (error != 0) { 1748 mutex_exit(&msp->ms_sync_lock); 1749 return (error); 1750 } 1751 1752 ASSERT3P(msp->ms_group, !=, NULL); 1753 msp->ms_loaded = B_TRUE; 1754 1755 /* 1756 * The ms_allocatable contains the segments that exist in the 1757 * ms_defer trees [see ms_synced_length]. Thus we need to remove 1758 * them from ms_allocatable as they will be added again in 1759 * metaslab_sync_done(). 1760 */ 1761 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1762 range_tree_walk(msp->ms_defer[t], 1763 range_tree_remove, msp->ms_allocatable); 1764 } 1765 1766 /* 1767 * Call metaslab_recalculate_weight_and_sort() now that the 1768 * metaslab is loaded so we get the metaslab's real weight. 1769 * 1770 * Unless this metaslab was created with older software and 1771 * has not yet been converted to use segment-based weight, we 1772 * expect the new weight to be better or equal to the weight 1773 * that the metaslab had while it was not loaded. This is 1774 * because the old weight does not take into account the 1775 * consolidation of adjacent segments between TXGs. [see 1776 * comment for ms_synchist and ms_deferhist[] for more info] 1777 */ 1778 uint64_t weight = msp->ms_weight; 1779 metaslab_recalculate_weight_and_sort(msp); 1780 if (!WEIGHT_IS_SPACEBASED(weight)) 1781 ASSERT3U(weight, <=, msp->ms_weight); 1782 msp->ms_max_size = metaslab_block_maxsize(msp); 1783 1784 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1785 metaslab_verify_space(msp, spa_syncing_txg(spa)); 1786 mutex_exit(&msp->ms_sync_lock); 1787 1788 return (0); 1789 } 1790 1791 int 1792 metaslab_load(metaslab_t *msp) 1793 { 1794 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1795 1796 /* 1797 * There may be another thread loading the same metaslab, if that's 1798 * the case just wait until the other thread is done and return. 1799 */ 1800 metaslab_load_wait(msp); 1801 if (msp->ms_loaded) 1802 return (0); 1803 VERIFY(!msp->ms_loading); 1804 ASSERT(!msp->ms_condensing); 1805 1806 msp->ms_loading = B_TRUE; 1807 int error = metaslab_load_impl(msp); 1808 msp->ms_loading = B_FALSE; 1809 cv_broadcast(&msp->ms_load_cv); 1810 1811 return (error); 1812 } 1813 1814 void 1815 metaslab_unload(metaslab_t *msp) 1816 { 1817 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1818 1819 metaslab_verify_weight_and_frag(msp); 1820 1821 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 1822 msp->ms_loaded = B_FALSE; 1823 1824 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1825 msp->ms_max_size = 0; 1826 1827 /* 1828 * We explicitly recalculate the metaslab's weight based on its space 1829 * map (as it is now not loaded). We want unload metaslabs to always 1830 * have their weights calculated from the space map histograms, while 1831 * loaded ones have it calculated from their in-core range tree 1832 * [see metaslab_load()]. This way, the weight reflects the information 1833 * available in-core, whether it is loaded or not 1834 * 1835 * If ms_group == NULL means that we came here from metaslab_fini(), 1836 * at which point it doesn't make sense for us to do the recalculation 1837 * and the sorting. 1838 */ 1839 if (msp->ms_group != NULL) 1840 metaslab_recalculate_weight_and_sort(msp); 1841 } 1842 1843 static void 1844 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 1845 int64_t defer_delta, int64_t space_delta) 1846 { 1847 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 1848 1849 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 1850 ASSERT(vd->vdev_ms_count != 0); 1851 1852 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 1853 vdev_deflated_space(vd, space_delta)); 1854 } 1855 1856 int 1857 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1858 metaslab_t **msp) 1859 { 1860 vdev_t *vd = mg->mg_vd; 1861 spa_t *spa = vd->vdev_spa; 1862 objset_t *mos = spa->spa_meta_objset; 1863 metaslab_t *ms; 1864 int error; 1865 1866 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1867 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1868 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1869 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1870 1871 ms->ms_id = id; 1872 ms->ms_start = id << vd->vdev_ms_shift; 1873 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1874 ms->ms_allocator = -1; 1875 ms->ms_new = B_TRUE; 1876 1877 /* 1878 * We only open space map objects that already exist. All others 1879 * will be opened when we finally allocate an object for it. 1880 * 1881 * Note: 1882 * When called from vdev_expand(), we can't call into the DMU as 1883 * we are holding the spa_config_lock as a writer and we would 1884 * deadlock [see relevant comment in vdev_metaslab_init()]. in 1885 * that case, the object parameter is zero though, so we won't 1886 * call into the DMU. 1887 */ 1888 if (object != 0) { 1889 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1890 ms->ms_size, vd->vdev_ashift); 1891 1892 if (error != 0) { 1893 kmem_free(ms, sizeof (metaslab_t)); 1894 return (error); 1895 } 1896 1897 ASSERT(ms->ms_sm != NULL); 1898 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); 1899 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 1900 } 1901 1902 /* 1903 * We create the ms_allocatable here, but we don't create the 1904 * other range trees until metaslab_sync_done(). This serves 1905 * two purposes: it allows metaslab_sync_done() to detect the 1906 * addition of new space; and for debugging, it ensures that 1907 * we'd data fault on any attempt to use this metaslab before 1908 * it's ready. 1909 */ 1910 ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms); 1911 metaslab_group_add(mg, ms); 1912 1913 metaslab_set_fragmentation(ms); 1914 1915 /* 1916 * If we're opening an existing pool (txg == 0) or creating 1917 * a new one (txg == TXG_INITIAL), all space is available now. 1918 * If we're adding space to an existing pool, the new space 1919 * does not become available until after this txg has synced. 1920 * The metaslab's weight will also be initialized when we sync 1921 * out this txg. This ensures that we don't attempt to allocate 1922 * from it before we have initialized it completely. 1923 */ 1924 if (txg <= TXG_INITIAL) { 1925 metaslab_sync_done(ms, 0); 1926 metaslab_space_update(vd, mg->mg_class, 1927 metaslab_allocated_space(ms), 0, 0); 1928 } 1929 1930 /* 1931 * If metaslab_debug_load is set and we're initializing a metaslab 1932 * that has an allocated space map object then load the space map 1933 * so that we can verify frees. 1934 */ 1935 if (metaslab_debug_load && ms->ms_sm != NULL) { 1936 mutex_enter(&ms->ms_lock); 1937 VERIFY0(metaslab_load(ms)); 1938 mutex_exit(&ms->ms_lock); 1939 } 1940 1941 if (txg != 0) { 1942 vdev_dirty(vd, 0, NULL, txg); 1943 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1944 } 1945 1946 *msp = ms; 1947 1948 return (0); 1949 } 1950 1951 void 1952 metaslab_fini(metaslab_t *msp) 1953 { 1954 metaslab_group_t *mg = msp->ms_group; 1955 vdev_t *vd = mg->mg_vd; 1956 1957 metaslab_group_remove(mg, msp); 1958 1959 mutex_enter(&msp->ms_lock); 1960 VERIFY(msp->ms_group == NULL); 1961 metaslab_space_update(vd, mg->mg_class, 1962 -metaslab_allocated_space(msp), 0, -msp->ms_size); 1963 1964 space_map_close(msp->ms_sm); 1965 1966 metaslab_unload(msp); 1967 1968 range_tree_destroy(msp->ms_allocatable); 1969 range_tree_destroy(msp->ms_freeing); 1970 range_tree_destroy(msp->ms_freed); 1971 1972 for (int t = 0; t < TXG_SIZE; t++) { 1973 range_tree_destroy(msp->ms_allocating[t]); 1974 } 1975 1976 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1977 range_tree_destroy(msp->ms_defer[t]); 1978 } 1979 ASSERT0(msp->ms_deferspace); 1980 1981 range_tree_destroy(msp->ms_checkpointing); 1982 1983 for (int t = 0; t < TXG_SIZE; t++) 1984 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 1985 1986 mutex_exit(&msp->ms_lock); 1987 cv_destroy(&msp->ms_load_cv); 1988 mutex_destroy(&msp->ms_lock); 1989 mutex_destroy(&msp->ms_sync_lock); 1990 ASSERT3U(msp->ms_allocator, ==, -1); 1991 1992 kmem_free(msp, sizeof (metaslab_t)); 1993 } 1994 1995 #define FRAGMENTATION_TABLE_SIZE 17 1996 1997 /* 1998 * This table defines a segment size based fragmentation metric that will 1999 * allow each metaslab to derive its own fragmentation value. This is done 2000 * by calculating the space in each bucket of the spacemap histogram and 2001 * multiplying that by the fragmentation metric in this table. Doing 2002 * this for all buckets and dividing it by the total amount of free 2003 * space in this metaslab (i.e. the total free space in all buckets) gives 2004 * us the fragmentation metric. This means that a high fragmentation metric 2005 * equates to most of the free space being comprised of small segments. 2006 * Conversely, if the metric is low, then most of the free space is in 2007 * large segments. A 10% change in fragmentation equates to approximately 2008 * double the number of segments. 2009 * 2010 * This table defines 0% fragmented space using 16MB segments. Testing has 2011 * shown that segments that are greater than or equal to 16MB do not suffer 2012 * from drastic performance problems. Using this value, we derive the rest 2013 * of the table. Since the fragmentation value is never stored on disk, it 2014 * is possible to change these calculations in the future. 2015 */ 2016 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 2017 100, /* 512B */ 2018 100, /* 1K */ 2019 98, /* 2K */ 2020 95, /* 4K */ 2021 90, /* 8K */ 2022 80, /* 16K */ 2023 70, /* 32K */ 2024 60, /* 64K */ 2025 50, /* 128K */ 2026 40, /* 256K */ 2027 30, /* 512K */ 2028 20, /* 1M */ 2029 15, /* 2M */ 2030 10, /* 4M */ 2031 5, /* 8M */ 2032 0 /* 16M */ 2033 }; 2034 2035 /* 2036 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 2037 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 2038 * been upgraded and does not support this metric. Otherwise, the return 2039 * value should be in the range [0, 100]. 2040 */ 2041 static void 2042 metaslab_set_fragmentation(metaslab_t *msp) 2043 { 2044 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2045 uint64_t fragmentation = 0; 2046 uint64_t total = 0; 2047 boolean_t feature_enabled = spa_feature_is_enabled(spa, 2048 SPA_FEATURE_SPACEMAP_HISTOGRAM); 2049 2050 if (!feature_enabled) { 2051 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2052 return; 2053 } 2054 2055 /* 2056 * A null space map means that the entire metaslab is free 2057 * and thus is not fragmented. 2058 */ 2059 if (msp->ms_sm == NULL) { 2060 msp->ms_fragmentation = 0; 2061 return; 2062 } 2063 2064 /* 2065 * If this metaslab's space map has not been upgraded, flag it 2066 * so that we upgrade next time we encounter it. 2067 */ 2068 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 2069 uint64_t txg = spa_syncing_txg(spa); 2070 vdev_t *vd = msp->ms_group->mg_vd; 2071 2072 /* 2073 * If we've reached the final dirty txg, then we must 2074 * be shutting down the pool. We don't want to dirty 2075 * any data past this point so skip setting the condense 2076 * flag. We can retry this action the next time the pool 2077 * is imported. 2078 */ 2079 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2080 msp->ms_condense_wanted = B_TRUE; 2081 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2082 zfs_dbgmsg("txg %llu, requesting force condense: " 2083 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 2084 vd->vdev_id); 2085 } 2086 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2087 return; 2088 } 2089 2090 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2091 uint64_t space = 0; 2092 uint8_t shift = msp->ms_sm->sm_shift; 2093 2094 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2095 FRAGMENTATION_TABLE_SIZE - 1); 2096 2097 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2098 continue; 2099 2100 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2101 total += space; 2102 2103 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2104 fragmentation += space * zfs_frag_table[idx]; 2105 } 2106 2107 if (total > 0) 2108 fragmentation /= total; 2109 ASSERT3U(fragmentation, <=, 100); 2110 2111 msp->ms_fragmentation = fragmentation; 2112 } 2113 2114 /* 2115 * Compute a weight -- a selection preference value -- for the given metaslab. 2116 * This is based on the amount of free space, the level of fragmentation, 2117 * the LBA range, and whether the metaslab is loaded. 2118 */ 2119 static uint64_t 2120 metaslab_space_weight(metaslab_t *msp) 2121 { 2122 metaslab_group_t *mg = msp->ms_group; 2123 vdev_t *vd = mg->mg_vd; 2124 uint64_t weight, space; 2125 2126 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2127 ASSERT(!vd->vdev_removing); 2128 2129 /* 2130 * The baseline weight is the metaslab's free space. 2131 */ 2132 space = msp->ms_size - metaslab_allocated_space(msp); 2133 2134 if (metaslab_fragmentation_factor_enabled && 2135 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 2136 /* 2137 * Use the fragmentation information to inversely scale 2138 * down the baseline weight. We need to ensure that we 2139 * don't exclude this metaslab completely when it's 100% 2140 * fragmented. To avoid this we reduce the fragmented value 2141 * by 1. 2142 */ 2143 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 2144 2145 /* 2146 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 2147 * this metaslab again. The fragmentation metric may have 2148 * decreased the space to something smaller than 2149 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 2150 * so that we can consume any remaining space. 2151 */ 2152 if (space > 0 && space < SPA_MINBLOCKSIZE) 2153 space = SPA_MINBLOCKSIZE; 2154 } 2155 weight = space; 2156 2157 /* 2158 * Modern disks have uniform bit density and constant angular velocity. 2159 * Therefore, the outer recording zones are faster (higher bandwidth) 2160 * than the inner zones by the ratio of outer to inner track diameter, 2161 * which is typically around 2:1. We account for this by assigning 2162 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 2163 * In effect, this means that we'll select the metaslab with the most 2164 * free bandwidth rather than simply the one with the most free space. 2165 */ 2166 if (metaslab_lba_weighting_enabled) { 2167 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 2168 ASSERT(weight >= space && weight <= 2 * space); 2169 } 2170 2171 /* 2172 * If this metaslab is one we're actively using, adjust its 2173 * weight to make it preferable to any inactive metaslab so 2174 * we'll polish it off. If the fragmentation on this metaslab 2175 * has exceed our threshold, then don't mark it active. 2176 */ 2177 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 2178 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 2179 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 2180 } 2181 2182 WEIGHT_SET_SPACEBASED(weight); 2183 return (weight); 2184 } 2185 2186 /* 2187 * Return the weight of the specified metaslab, according to the segment-based 2188 * weighting algorithm. The metaslab must be loaded. This function can 2189 * be called within a sync pass since it relies only on the metaslab's 2190 * range tree which is always accurate when the metaslab is loaded. 2191 */ 2192 static uint64_t 2193 metaslab_weight_from_range_tree(metaslab_t *msp) 2194 { 2195 uint64_t weight = 0; 2196 uint32_t segments = 0; 2197 2198 ASSERT(msp->ms_loaded); 2199 2200 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 2201 i--) { 2202 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 2203 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2204 2205 segments <<= 1; 2206 segments += msp->ms_allocatable->rt_histogram[i]; 2207 2208 /* 2209 * The range tree provides more precision than the space map 2210 * and must be downgraded so that all values fit within the 2211 * space map's histogram. This allows us to compare loaded 2212 * vs. unloaded metaslabs to determine which metaslab is 2213 * considered "best". 2214 */ 2215 if (i > max_idx) 2216 continue; 2217 2218 if (segments != 0) { 2219 WEIGHT_SET_COUNT(weight, segments); 2220 WEIGHT_SET_INDEX(weight, i); 2221 WEIGHT_SET_ACTIVE(weight, 0); 2222 break; 2223 } 2224 } 2225 return (weight); 2226 } 2227 2228 /* 2229 * Calculate the weight based on the on-disk histogram. This should only 2230 * be called after a sync pass has completely finished since the on-disk 2231 * information is updated in metaslab_sync(). 2232 */ 2233 static uint64_t 2234 metaslab_weight_from_spacemap(metaslab_t *msp) 2235 { 2236 space_map_t *sm = msp->ms_sm; 2237 ASSERT(!msp->ms_loaded); 2238 ASSERT(sm != NULL); 2239 ASSERT3U(space_map_object(sm), !=, 0); 2240 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2241 2242 /* 2243 * Create a joint histogram from all the segments that have made 2244 * it to the metaslab's space map histogram, that are not yet 2245 * available for allocation because they are still in the freeing 2246 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 2247 * these segments from the space map's histogram to get a more 2248 * accurate weight. 2249 */ 2250 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 2251 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 2252 deferspace_histogram[i] += msp->ms_synchist[i]; 2253 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2254 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2255 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 2256 } 2257 } 2258 2259 uint64_t weight = 0; 2260 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 2261 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 2262 deferspace_histogram[i]); 2263 uint64_t count = 2264 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 2265 if (count != 0) { 2266 WEIGHT_SET_COUNT(weight, count); 2267 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 2268 WEIGHT_SET_ACTIVE(weight, 0); 2269 break; 2270 } 2271 } 2272 return (weight); 2273 } 2274 2275 /* 2276 * Compute a segment-based weight for the specified metaslab. The weight 2277 * is determined by highest bucket in the histogram. The information 2278 * for the highest bucket is encoded into the weight value. 2279 */ 2280 static uint64_t 2281 metaslab_segment_weight(metaslab_t *msp) 2282 { 2283 metaslab_group_t *mg = msp->ms_group; 2284 uint64_t weight = 0; 2285 uint8_t shift = mg->mg_vd->vdev_ashift; 2286 2287 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2288 2289 /* 2290 * The metaslab is completely free. 2291 */ 2292 if (metaslab_allocated_space(msp) == 0) { 2293 int idx = highbit64(msp->ms_size) - 1; 2294 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2295 2296 if (idx < max_idx) { 2297 WEIGHT_SET_COUNT(weight, 1ULL); 2298 WEIGHT_SET_INDEX(weight, idx); 2299 } else { 2300 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 2301 WEIGHT_SET_INDEX(weight, max_idx); 2302 } 2303 WEIGHT_SET_ACTIVE(weight, 0); 2304 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 2305 2306 return (weight); 2307 } 2308 2309 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2310 2311 /* 2312 * If the metaslab is fully allocated then just make the weight 0. 2313 */ 2314 if (metaslab_allocated_space(msp) == msp->ms_size) 2315 return (0); 2316 /* 2317 * If the metaslab is already loaded, then use the range tree to 2318 * determine the weight. Otherwise, we rely on the space map information 2319 * to generate the weight. 2320 */ 2321 if (msp->ms_loaded) { 2322 weight = metaslab_weight_from_range_tree(msp); 2323 } else { 2324 weight = metaslab_weight_from_spacemap(msp); 2325 } 2326 2327 /* 2328 * If the metaslab was active the last time we calculated its weight 2329 * then keep it active. We want to consume the entire region that 2330 * is associated with this weight. 2331 */ 2332 if (msp->ms_activation_weight != 0 && weight != 0) 2333 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 2334 return (weight); 2335 } 2336 2337 /* 2338 * Determine if we should attempt to allocate from this metaslab. If the 2339 * metaslab has a maximum size then we can quickly determine if the desired 2340 * allocation size can be satisfied. Otherwise, if we're using segment-based 2341 * weighting then we can determine the maximum allocation that this metaslab 2342 * can accommodate based on the index encoded in the weight. If we're using 2343 * space-based weights then rely on the entire weight (excluding the weight 2344 * type bit). 2345 */ 2346 boolean_t 2347 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 2348 { 2349 boolean_t should_allocate; 2350 2351 if (msp->ms_max_size != 0) 2352 return (msp->ms_max_size >= asize); 2353 2354 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2355 /* 2356 * The metaslab segment weight indicates segments in the 2357 * range [2^i, 2^(i+1)), where i is the index in the weight. 2358 * Since the asize might be in the middle of the range, we 2359 * should attempt the allocation if asize < 2^(i+1). 2360 */ 2361 should_allocate = (asize < 2362 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2363 } else { 2364 should_allocate = (asize <= 2365 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2366 } 2367 return (should_allocate); 2368 } 2369 2370 static uint64_t 2371 metaslab_weight(metaslab_t *msp) 2372 { 2373 vdev_t *vd = msp->ms_group->mg_vd; 2374 spa_t *spa = vd->vdev_spa; 2375 uint64_t weight; 2376 2377 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2378 2379 /* 2380 * If this vdev is in the process of being removed, there is nothing 2381 * for us to do here. 2382 */ 2383 if (vd->vdev_removing) 2384 return (0); 2385 2386 metaslab_set_fragmentation(msp); 2387 2388 /* 2389 * Update the maximum size if the metaslab is loaded. This will 2390 * ensure that we get an accurate maximum size if newly freed space 2391 * has been added back into the free tree. 2392 */ 2393 if (msp->ms_loaded) 2394 msp->ms_max_size = metaslab_block_maxsize(msp); 2395 else 2396 ASSERT0(msp->ms_max_size); 2397 2398 /* 2399 * Segment-based weighting requires space map histogram support. 2400 */ 2401 if (zfs_metaslab_segment_weight_enabled && 2402 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2403 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2404 sizeof (space_map_phys_t))) { 2405 weight = metaslab_segment_weight(msp); 2406 } else { 2407 weight = metaslab_space_weight(msp); 2408 } 2409 return (weight); 2410 } 2411 2412 void 2413 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 2414 { 2415 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 2416 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2417 metaslab_group_sort(msp->ms_group, msp, 2418 metaslab_weight(msp) | was_active); 2419 } 2420 2421 static int 2422 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2423 int allocator, uint64_t activation_weight) 2424 { 2425 /* 2426 * If we're activating for the claim code, we don't want to actually 2427 * set the metaslab up for a specific allocator. 2428 */ 2429 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2430 return (0); 2431 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2432 mg->mg_primaries : mg->mg_secondaries); 2433 2434 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2435 mutex_enter(&mg->mg_lock); 2436 if (arr[allocator] != NULL) { 2437 mutex_exit(&mg->mg_lock); 2438 return (EEXIST); 2439 } 2440 2441 arr[allocator] = msp; 2442 ASSERT3S(msp->ms_allocator, ==, -1); 2443 msp->ms_allocator = allocator; 2444 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2445 mutex_exit(&mg->mg_lock); 2446 2447 return (0); 2448 } 2449 2450 static int 2451 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2452 { 2453 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2454 2455 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2456 int error = metaslab_load(msp); 2457 if (error != 0) { 2458 metaslab_group_sort(msp->ms_group, msp, 0); 2459 return (error); 2460 } 2461 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2462 /* 2463 * The metaslab was activated for another allocator 2464 * while we were waiting, we should reselect. 2465 */ 2466 return (EBUSY); 2467 } 2468 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2469 allocator, activation_weight)) != 0) { 2470 return (error); 2471 } 2472 2473 msp->ms_activation_weight = msp->ms_weight; 2474 metaslab_group_sort(msp->ms_group, msp, 2475 msp->ms_weight | activation_weight); 2476 } 2477 ASSERT(msp->ms_loaded); 2478 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2479 2480 return (0); 2481 } 2482 2483 static void 2484 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2485 uint64_t weight) 2486 { 2487 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2488 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2489 metaslab_group_sort(mg, msp, weight); 2490 return; 2491 } 2492 2493 mutex_enter(&mg->mg_lock); 2494 ASSERT3P(msp->ms_group, ==, mg); 2495 if (msp->ms_primary) { 2496 ASSERT3U(0, <=, msp->ms_allocator); 2497 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2498 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2499 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2500 mg->mg_primaries[msp->ms_allocator] = NULL; 2501 } else { 2502 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2503 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2504 mg->mg_secondaries[msp->ms_allocator] = NULL; 2505 } 2506 msp->ms_allocator = -1; 2507 metaslab_group_sort_impl(mg, msp, weight); 2508 mutex_exit(&mg->mg_lock); 2509 } 2510 2511 static void 2512 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2513 { 2514 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2515 2516 /* 2517 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2518 * this metaslab again. In that case, it had better be empty, 2519 * or we would be leaving space on the table. 2520 */ 2521 ASSERT(size >= SPA_MINBLOCKSIZE || 2522 range_tree_is_empty(msp->ms_allocatable)); 2523 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2524 2525 msp->ms_activation_weight = 0; 2526 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2527 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2528 } 2529 2530 /* 2531 * Segment-based metaslabs are activated once and remain active until 2532 * we either fail an allocation attempt (similar to space-based metaslabs) 2533 * or have exhausted the free space in zfs_metaslab_switch_threshold 2534 * buckets since the metaslab was activated. This function checks to see 2535 * if we've exhaused the zfs_metaslab_switch_threshold buckets in the 2536 * metaslab and passivates it proactively. This will allow us to select a 2537 * metaslabs with larger contiguous region if any remaining within this 2538 * metaslab group. If we're in sync pass > 1, then we continue using this 2539 * metaslab so that we don't dirty more block and cause more sync passes. 2540 */ 2541 void 2542 metaslab_segment_may_passivate(metaslab_t *msp) 2543 { 2544 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2545 2546 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2547 return; 2548 2549 /* 2550 * Since we are in the middle of a sync pass, the most accurate 2551 * information that is accessible to us is the in-core range tree 2552 * histogram; calculate the new weight based on that information. 2553 */ 2554 uint64_t weight = metaslab_weight_from_range_tree(msp); 2555 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2556 int current_idx = WEIGHT_GET_INDEX(weight); 2557 2558 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2559 metaslab_passivate(msp, weight); 2560 } 2561 2562 static void 2563 metaslab_preload(void *arg) 2564 { 2565 metaslab_t *msp = arg; 2566 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2567 2568 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2569 2570 mutex_enter(&msp->ms_lock); 2571 (void) metaslab_load(msp); 2572 msp->ms_selected_txg = spa_syncing_txg(spa); 2573 mutex_exit(&msp->ms_lock); 2574 } 2575 2576 static void 2577 metaslab_group_preload(metaslab_group_t *mg) 2578 { 2579 spa_t *spa = mg->mg_vd->vdev_spa; 2580 metaslab_t *msp; 2581 avl_tree_t *t = &mg->mg_metaslab_tree; 2582 int m = 0; 2583 2584 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2585 taskq_wait(mg->mg_taskq); 2586 return; 2587 } 2588 2589 mutex_enter(&mg->mg_lock); 2590 2591 /* 2592 * Load the next potential metaslabs 2593 */ 2594 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2595 ASSERT3P(msp->ms_group, ==, mg); 2596 2597 /* 2598 * We preload only the maximum number of metaslabs specified 2599 * by metaslab_preload_limit. If a metaslab is being forced 2600 * to condense then we preload it too. This will ensure 2601 * that force condensing happens in the next txg. 2602 */ 2603 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2604 continue; 2605 } 2606 2607 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2608 msp, TQ_SLEEP) != TASKQID_INVALID); 2609 } 2610 mutex_exit(&mg->mg_lock); 2611 } 2612 2613 /* 2614 * Determine if the space map's on-disk footprint is past our tolerance 2615 * for inefficiency. We would like to use the following criteria to make 2616 * our decision: 2617 * 2618 * 1. The size of the space map object should not dramatically increase as a 2619 * result of writing out the free space range tree. 2620 * 2621 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2622 * times the size than the free space range tree representation 2623 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2624 * 2625 * 3. The on-disk size of the space map should actually decrease. 2626 * 2627 * Unfortunately, we cannot compute the on-disk size of the space map in this 2628 * context because we cannot accurately compute the effects of compression, etc. 2629 * Instead, we apply the heuristic described in the block comment for 2630 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2631 * is greater than a threshold number of blocks. 2632 */ 2633 static boolean_t 2634 metaslab_should_condense(metaslab_t *msp) 2635 { 2636 space_map_t *sm = msp->ms_sm; 2637 vdev_t *vd = msp->ms_group->mg_vd; 2638 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2639 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2640 2641 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2642 ASSERT(msp->ms_loaded); 2643 2644 /* 2645 * Allocations and frees in early passes are generally more space 2646 * efficient (in terms of blocks described in space map entries) 2647 * than the ones in later passes (e.g. we don't compress after 2648 * sync pass 5) and condensing a metaslab multiple times in a txg 2649 * could degrade performance. 2650 * 2651 * Thus we prefer condensing each metaslab at most once every txg at 2652 * the earliest sync pass possible. If a metaslab is eligible for 2653 * condensing again after being considered for condensing within the 2654 * same txg, it will hopefully be dirty in the next txg where it will 2655 * be condensed at an earlier pass. 2656 */ 2657 if (msp->ms_condense_checked_txg == current_txg) 2658 return (B_FALSE); 2659 msp->ms_condense_checked_txg = current_txg; 2660 2661 /* 2662 * We always condense metaslabs that are empty and metaslabs for 2663 * which a condense request has been made. 2664 */ 2665 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2666 msp->ms_condense_wanted) 2667 return (B_TRUE); 2668 2669 uint64_t object_size = space_map_length(msp->ms_sm); 2670 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2671 msp->ms_allocatable, SM_NO_VDEVID); 2672 2673 dmu_object_info_t doi; 2674 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2675 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2676 2677 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2678 object_size > zfs_metaslab_condense_block_threshold * record_size); 2679 } 2680 2681 /* 2682 * Condense the on-disk space map representation to its minimized form. 2683 * The minimized form consists of a small number of allocations followed by 2684 * the entries of the free range tree. 2685 */ 2686 static void 2687 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2688 { 2689 range_tree_t *condense_tree; 2690 space_map_t *sm = msp->ms_sm; 2691 2692 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2693 ASSERT(msp->ms_loaded); 2694 2695 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2696 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2697 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2698 msp->ms_group->mg_vd->vdev_spa->spa_name, 2699 space_map_length(msp->ms_sm), 2700 avl_numnodes(&msp->ms_allocatable->rt_root), 2701 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2702 2703 msp->ms_condense_wanted = B_FALSE; 2704 2705 /* 2706 * Create an range tree that is 100% allocated. We remove segments 2707 * that have been freed in this txg, any deferred frees that exist, 2708 * and any allocation in the future. Removing segments should be 2709 * a relatively inexpensive operation since we expect these trees to 2710 * have a small number of nodes. 2711 */ 2712 condense_tree = range_tree_create(NULL, NULL); 2713 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2714 2715 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2716 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2717 2718 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2719 range_tree_walk(msp->ms_defer[t], 2720 range_tree_remove, condense_tree); 2721 } 2722 2723 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2724 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2725 range_tree_remove, condense_tree); 2726 } 2727 2728 /* 2729 * We're about to drop the metaslab's lock thus allowing 2730 * other consumers to change it's content. Set the 2731 * metaslab's ms_condensing flag to ensure that 2732 * allocations on this metaslab do not occur while we're 2733 * in the middle of committing it to disk. This is only critical 2734 * for ms_allocatable as all other range trees use per txg 2735 * views of their content. 2736 */ 2737 msp->ms_condensing = B_TRUE; 2738 2739 mutex_exit(&msp->ms_lock); 2740 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2741 2742 /* 2743 * While we would ideally like to create a space map representation 2744 * that consists only of allocation records, doing so can be 2745 * prohibitively expensive because the in-core free tree can be 2746 * large, and therefore computationally expensive to subtract 2747 * from the condense_tree. Instead we sync out two trees, a cheap 2748 * allocation only tree followed by the in-core free tree. While not 2749 * optimal, this is typically close to optimal, and much cheaper to 2750 * compute. 2751 */ 2752 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2753 range_tree_vacate(condense_tree, NULL, NULL); 2754 range_tree_destroy(condense_tree); 2755 2756 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2757 mutex_enter(&msp->ms_lock); 2758 msp->ms_condensing = B_FALSE; 2759 } 2760 2761 /* 2762 * Write a metaslab to disk in the context of the specified transaction group. 2763 */ 2764 void 2765 metaslab_sync(metaslab_t *msp, uint64_t txg) 2766 { 2767 metaslab_group_t *mg = msp->ms_group; 2768 vdev_t *vd = mg->mg_vd; 2769 spa_t *spa = vd->vdev_spa; 2770 objset_t *mos = spa_meta_objset(spa); 2771 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2772 dmu_tx_t *tx; 2773 uint64_t object = space_map_object(msp->ms_sm); 2774 2775 ASSERT(!vd->vdev_ishole); 2776 2777 /* 2778 * This metaslab has just been added so there's no work to do now. 2779 */ 2780 if (msp->ms_freeing == NULL) { 2781 ASSERT3P(alloctree, ==, NULL); 2782 return; 2783 } 2784 2785 ASSERT3P(alloctree, !=, NULL); 2786 ASSERT3P(msp->ms_freeing, !=, NULL); 2787 ASSERT3P(msp->ms_freed, !=, NULL); 2788 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2789 2790 /* 2791 * Normally, we don't want to process a metaslab if there are no 2792 * allocations or frees to perform. However, if the metaslab is being 2793 * forced to condense and it's loaded, we need to let it through. 2794 */ 2795 if (range_tree_is_empty(alloctree) && 2796 range_tree_is_empty(msp->ms_freeing) && 2797 range_tree_is_empty(msp->ms_checkpointing) && 2798 !(msp->ms_loaded && msp->ms_condense_wanted)) 2799 return; 2800 2801 2802 VERIFY(txg <= spa_final_dirty_txg(spa)); 2803 2804 /* 2805 * The only state that can actually be changing concurrently 2806 * with metaslab_sync() is the metaslab's ms_allocatable. No 2807 * other thread can be modifying this txg's alloc, freeing, 2808 * freed, or space_map_phys_t. We drop ms_lock whenever we 2809 * could call into the DMU, because the DMU can call down to 2810 * us (e.g. via zio_free()) at any time. 2811 * 2812 * The spa_vdev_remove_thread() can be reading metaslab state 2813 * concurrently, and it is locked out by the ms_sync_lock. 2814 * Note that the ms_lock is insufficient for this, because it 2815 * is dropped by space_map_write(). 2816 */ 2817 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2818 2819 if (msp->ms_sm == NULL) { 2820 uint64_t new_object; 2821 2822 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2823 VERIFY3U(new_object, !=, 0); 2824 2825 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2826 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2827 2828 ASSERT(msp->ms_sm != NULL); 2829 ASSERT0(metaslab_allocated_space(msp)); 2830 } 2831 2832 if (!range_tree_is_empty(msp->ms_checkpointing) && 2833 vd->vdev_checkpoint_sm == NULL) { 2834 ASSERT(spa_has_checkpoint(spa)); 2835 2836 uint64_t new_object = space_map_alloc(mos, 2837 vdev_standard_sm_blksz, tx); 2838 VERIFY3U(new_object, !=, 0); 2839 2840 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2841 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2842 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2843 2844 /* 2845 * We save the space map object as an entry in vdev_top_zap 2846 * so it can be retrieved when the pool is reopened after an 2847 * export or through zdb. 2848 */ 2849 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2850 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2851 sizeof (new_object), 1, &new_object, tx)); 2852 } 2853 2854 mutex_enter(&msp->ms_sync_lock); 2855 mutex_enter(&msp->ms_lock); 2856 2857 /* 2858 * Note: metaslab_condense() clears the space map's histogram. 2859 * Therefore we must verify and remove this histogram before 2860 * condensing. 2861 */ 2862 metaslab_group_histogram_verify(mg); 2863 metaslab_class_histogram_verify(mg->mg_class); 2864 metaslab_group_histogram_remove(mg, msp); 2865 2866 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2867 metaslab_condense(msp, txg, tx); 2868 } else { 2869 mutex_exit(&msp->ms_lock); 2870 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2871 SM_NO_VDEVID, tx); 2872 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2873 SM_NO_VDEVID, tx); 2874 mutex_enter(&msp->ms_lock); 2875 } 2876 2877 msp->ms_allocated_space += range_tree_space(alloctree); 2878 ASSERT3U(msp->ms_allocated_space, >=, 2879 range_tree_space(msp->ms_freeing)); 2880 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 2881 2882 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2883 ASSERT(spa_has_checkpoint(spa)); 2884 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2885 2886 /* 2887 * Since we are doing writes to disk and the ms_checkpointing 2888 * tree won't be changing during that time, we drop the 2889 * ms_lock while writing to the checkpoint space map. 2890 */ 2891 mutex_exit(&msp->ms_lock); 2892 space_map_write(vd->vdev_checkpoint_sm, 2893 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2894 mutex_enter(&msp->ms_lock); 2895 2896 spa->spa_checkpoint_info.sci_dspace += 2897 range_tree_space(msp->ms_checkpointing); 2898 vd->vdev_stat.vs_checkpoint_space += 2899 range_tree_space(msp->ms_checkpointing); 2900 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2901 -space_map_allocated(vd->vdev_checkpoint_sm)); 2902 2903 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2904 } 2905 2906 if (msp->ms_loaded) { 2907 /* 2908 * When the space map is loaded, we have an accurate 2909 * histogram in the range tree. This gives us an opportunity 2910 * to bring the space map's histogram up-to-date so we clear 2911 * it first before updating it. 2912 */ 2913 space_map_histogram_clear(msp->ms_sm); 2914 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2915 2916 /* 2917 * Since we've cleared the histogram we need to add back 2918 * any free space that has already been processed, plus 2919 * any deferred space. This allows the on-disk histogram 2920 * to accurately reflect all free space even if some space 2921 * is not yet available for allocation (i.e. deferred). 2922 */ 2923 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2924 2925 /* 2926 * Add back any deferred free space that has not been 2927 * added back into the in-core free tree yet. This will 2928 * ensure that we don't end up with a space map histogram 2929 * that is completely empty unless the metaslab is fully 2930 * allocated. 2931 */ 2932 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2933 space_map_histogram_add(msp->ms_sm, 2934 msp->ms_defer[t], tx); 2935 } 2936 } 2937 2938 /* 2939 * Always add the free space from this sync pass to the space 2940 * map histogram. We want to make sure that the on-disk histogram 2941 * accounts for all free space. If the space map is not loaded, 2942 * then we will lose some accuracy but will correct it the next 2943 * time we load the space map. 2944 */ 2945 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2946 metaslab_aux_histograms_update(msp); 2947 2948 metaslab_group_histogram_add(mg, msp); 2949 metaslab_group_histogram_verify(mg); 2950 metaslab_class_histogram_verify(mg->mg_class); 2951 2952 /* 2953 * For sync pass 1, we avoid traversing this txg's free range tree 2954 * and instead will just swap the pointers for freeing and freed. 2955 * We can safely do this since the freed_tree is guaranteed to be 2956 * empty on the initial pass. 2957 */ 2958 if (spa_sync_pass(spa) == 1) { 2959 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2960 ASSERT0(msp->ms_allocated_this_txg); 2961 } else { 2962 range_tree_vacate(msp->ms_freeing, 2963 range_tree_add, msp->ms_freed); 2964 } 2965 msp->ms_allocated_this_txg += range_tree_space(alloctree); 2966 range_tree_vacate(alloctree, NULL, NULL); 2967 2968 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2969 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2970 & TXG_MASK])); 2971 ASSERT0(range_tree_space(msp->ms_freeing)); 2972 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2973 2974 mutex_exit(&msp->ms_lock); 2975 2976 if (object != space_map_object(msp->ms_sm)) { 2977 object = space_map_object(msp->ms_sm); 2978 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2979 msp->ms_id, sizeof (uint64_t), &object, tx); 2980 } 2981 mutex_exit(&msp->ms_sync_lock); 2982 dmu_tx_commit(tx); 2983 } 2984 2985 /* 2986 * Called after a transaction group has completely synced to mark 2987 * all of the metaslab's free space as usable. 2988 */ 2989 void 2990 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2991 { 2992 metaslab_group_t *mg = msp->ms_group; 2993 vdev_t *vd = mg->mg_vd; 2994 spa_t *spa = vd->vdev_spa; 2995 range_tree_t **defer_tree; 2996 int64_t alloc_delta, defer_delta; 2997 boolean_t defer_allowed = B_TRUE; 2998 2999 ASSERT(!vd->vdev_ishole); 3000 3001 mutex_enter(&msp->ms_lock); 3002 3003 /* 3004 * If this metaslab is just becoming available, initialize its 3005 * range trees and add its capacity to the vdev. 3006 */ 3007 if (msp->ms_freed == NULL) { 3008 for (int t = 0; t < TXG_SIZE; t++) { 3009 ASSERT(msp->ms_allocating[t] == NULL); 3010 3011 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 3012 } 3013 3014 ASSERT3P(msp->ms_freeing, ==, NULL); 3015 msp->ms_freeing = range_tree_create(NULL, NULL); 3016 3017 ASSERT3P(msp->ms_freed, ==, NULL); 3018 msp->ms_freed = range_tree_create(NULL, NULL); 3019 3020 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3021 ASSERT(msp->ms_defer[t] == NULL); 3022 3023 msp->ms_defer[t] = range_tree_create(NULL, NULL); 3024 } 3025 3026 ASSERT3P(msp->ms_checkpointing, ==, NULL); 3027 msp->ms_checkpointing = range_tree_create(NULL, NULL); 3028 3029 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 3030 } 3031 ASSERT0(range_tree_space(msp->ms_freeing)); 3032 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3033 3034 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 3035 3036 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 3037 metaslab_class_get_alloc(spa_normal_class(spa)); 3038 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 3039 defer_allowed = B_FALSE; 3040 } 3041 3042 defer_delta = 0; 3043 alloc_delta = msp->ms_allocated_this_txg - 3044 range_tree_space(msp->ms_freed); 3045 if (defer_allowed) { 3046 defer_delta = range_tree_space(msp->ms_freed) - 3047 range_tree_space(*defer_tree); 3048 } else { 3049 defer_delta -= range_tree_space(*defer_tree); 3050 } 3051 3052 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 3053 defer_delta, 0); 3054 3055 /* 3056 * If there's a metaslab_load() in progress, wait for it to complete 3057 * so that we have a consistent view of the in-core space map. 3058 */ 3059 metaslab_load_wait(msp); 3060 3061 /* 3062 * Move the frees from the defer_tree back to the free 3063 * range tree (if it's loaded). Swap the freed_tree and 3064 * the defer_tree -- this is safe to do because we've 3065 * just emptied out the defer_tree. 3066 */ 3067 range_tree_vacate(*defer_tree, 3068 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 3069 if (defer_allowed) { 3070 range_tree_swap(&msp->ms_freed, defer_tree); 3071 } else { 3072 range_tree_vacate(msp->ms_freed, 3073 msp->ms_loaded ? range_tree_add : NULL, 3074 msp->ms_allocatable); 3075 } 3076 3077 msp->ms_synced_length = space_map_length(msp->ms_sm); 3078 3079 msp->ms_deferspace += defer_delta; 3080 ASSERT3S(msp->ms_deferspace, >=, 0); 3081 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 3082 if (msp->ms_deferspace != 0) { 3083 /* 3084 * Keep syncing this metaslab until all deferred frees 3085 * are back in circulation. 3086 */ 3087 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 3088 } 3089 metaslab_aux_histograms_update_done(msp, defer_allowed); 3090 3091 if (msp->ms_new) { 3092 msp->ms_new = B_FALSE; 3093 mutex_enter(&mg->mg_lock); 3094 mg->mg_ms_ready++; 3095 mutex_exit(&mg->mg_lock); 3096 } 3097 3098 /* 3099 * Re-sort metaslab within its group now that we've adjusted 3100 * its allocatable space. 3101 */ 3102 metaslab_recalculate_weight_and_sort(msp); 3103 3104 /* 3105 * If the metaslab is loaded and we've not tried to load or allocate 3106 * from it in 'metaslab_unload_delay' txgs, then unload it. 3107 */ 3108 if (msp->ms_loaded && 3109 msp->ms_initializing == 0 && 3110 msp->ms_selected_txg + metaslab_unload_delay < txg) { 3111 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 3112 VERIFY0(range_tree_space( 3113 msp->ms_allocating[(txg + t) & TXG_MASK])); 3114 } 3115 if (msp->ms_allocator != -1) { 3116 metaslab_passivate(msp, msp->ms_weight & 3117 ~METASLAB_ACTIVE_MASK); 3118 } 3119 3120 if (!metaslab_debug_unload) 3121 metaslab_unload(msp); 3122 } 3123 3124 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3125 ASSERT0(range_tree_space(msp->ms_freeing)); 3126 ASSERT0(range_tree_space(msp->ms_freed)); 3127 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3128 3129 msp->ms_allocated_this_txg = 0; 3130 mutex_exit(&msp->ms_lock); 3131 } 3132 3133 void 3134 metaslab_sync_reassess(metaslab_group_t *mg) 3135 { 3136 spa_t *spa = mg->mg_class->mc_spa; 3137 3138 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3139 metaslab_group_alloc_update(mg); 3140 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 3141 3142 /* 3143 * Preload the next potential metaslabs but only on active 3144 * metaslab groups. We can get into a state where the metaslab 3145 * is no longer active since we dirty metaslabs as we remove a 3146 * a device, thus potentially making the metaslab group eligible 3147 * for preloading. 3148 */ 3149 if (mg->mg_activation_count > 0) { 3150 metaslab_group_preload(mg); 3151 } 3152 spa_config_exit(spa, SCL_ALLOC, FTAG); 3153 } 3154 3155 /* 3156 * When writing a ditto block (i.e. more than one DVA for a given BP) on 3157 * the same vdev as an existing DVA of this BP, then try to allocate it 3158 * on a different metaslab than existing DVAs (i.e. a unique metaslab). 3159 */ 3160 static boolean_t 3161 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 3162 { 3163 uint64_t dva_ms_id; 3164 3165 if (DVA_GET_ASIZE(dva) == 0) 3166 return (B_TRUE); 3167 3168 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 3169 return (B_TRUE); 3170 3171 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 3172 3173 return (msp->ms_id != dva_ms_id); 3174 } 3175 3176 /* 3177 * ========================================================================== 3178 * Metaslab allocation tracing facility 3179 * ========================================================================== 3180 */ 3181 kstat_t *metaslab_trace_ksp; 3182 kstat_named_t metaslab_trace_over_limit; 3183 3184 void 3185 metaslab_alloc_trace_init(void) 3186 { 3187 ASSERT(metaslab_alloc_trace_cache == NULL); 3188 metaslab_alloc_trace_cache = kmem_cache_create( 3189 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 3190 0, NULL, NULL, NULL, NULL, NULL, 0); 3191 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 3192 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 3193 if (metaslab_trace_ksp != NULL) { 3194 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 3195 kstat_named_init(&metaslab_trace_over_limit, 3196 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 3197 kstat_install(metaslab_trace_ksp); 3198 } 3199 } 3200 3201 void 3202 metaslab_alloc_trace_fini(void) 3203 { 3204 if (metaslab_trace_ksp != NULL) { 3205 kstat_delete(metaslab_trace_ksp); 3206 metaslab_trace_ksp = NULL; 3207 } 3208 kmem_cache_destroy(metaslab_alloc_trace_cache); 3209 metaslab_alloc_trace_cache = NULL; 3210 } 3211 3212 /* 3213 * Add an allocation trace element to the allocation tracing list. 3214 */ 3215 static void 3216 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 3217 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 3218 int allocator) 3219 { 3220 if (!metaslab_trace_enabled) 3221 return; 3222 3223 /* 3224 * When the tracing list reaches its maximum we remove 3225 * the second element in the list before adding a new one. 3226 * By removing the second element we preserve the original 3227 * entry as a clue to what allocations steps have already been 3228 * performed. 3229 */ 3230 if (zal->zal_size == metaslab_trace_max_entries) { 3231 metaslab_alloc_trace_t *mat_next; 3232 #ifdef DEBUG 3233 panic("too many entries in allocation list"); 3234 #endif 3235 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 3236 zal->zal_size--; 3237 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 3238 list_remove(&zal->zal_list, mat_next); 3239 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 3240 } 3241 3242 metaslab_alloc_trace_t *mat = 3243 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 3244 list_link_init(&mat->mat_list_node); 3245 mat->mat_mg = mg; 3246 mat->mat_msp = msp; 3247 mat->mat_size = psize; 3248 mat->mat_dva_id = dva_id; 3249 mat->mat_offset = offset; 3250 mat->mat_weight = 0; 3251 mat->mat_allocator = allocator; 3252 3253 if (msp != NULL) 3254 mat->mat_weight = msp->ms_weight; 3255 3256 /* 3257 * The list is part of the zio so locking is not required. Only 3258 * a single thread will perform allocations for a given zio. 3259 */ 3260 list_insert_tail(&zal->zal_list, mat); 3261 zal->zal_size++; 3262 3263 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 3264 } 3265 3266 void 3267 metaslab_trace_init(zio_alloc_list_t *zal) 3268 { 3269 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 3270 offsetof(metaslab_alloc_trace_t, mat_list_node)); 3271 zal->zal_size = 0; 3272 } 3273 3274 void 3275 metaslab_trace_fini(zio_alloc_list_t *zal) 3276 { 3277 metaslab_alloc_trace_t *mat; 3278 3279 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 3280 kmem_cache_free(metaslab_alloc_trace_cache, mat); 3281 list_destroy(&zal->zal_list); 3282 zal->zal_size = 0; 3283 } 3284 3285 /* 3286 * ========================================================================== 3287 * Metaslab block operations 3288 * ========================================================================== 3289 */ 3290 3291 static void 3292 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 3293 int allocator) 3294 { 3295 if (!(flags & METASLAB_ASYNC_ALLOC) || 3296 (flags & METASLAB_DONT_THROTTLE)) 3297 return; 3298 3299 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3300 if (!mg->mg_class->mc_alloc_throttle_enabled) 3301 return; 3302 3303 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 3304 } 3305 3306 static void 3307 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 3308 { 3309 uint64_t max = mg->mg_max_alloc_queue_depth; 3310 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3311 while (cur < max) { 3312 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 3313 cur, cur + 1) == cur) { 3314 atomic_inc_64( 3315 &mg->mg_class->mc_alloc_max_slots[allocator]); 3316 return; 3317 } 3318 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3319 } 3320 } 3321 3322 void 3323 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 3324 int allocator, boolean_t io_complete) 3325 { 3326 if (!(flags & METASLAB_ASYNC_ALLOC) || 3327 (flags & METASLAB_DONT_THROTTLE)) 3328 return; 3329 3330 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3331 if (!mg->mg_class->mc_alloc_throttle_enabled) 3332 return; 3333 3334 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 3335 if (io_complete) 3336 metaslab_group_increment_qdepth(mg, allocator); 3337 } 3338 3339 void 3340 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 3341 int allocator) 3342 { 3343 #ifdef ZFS_DEBUG 3344 const dva_t *dva = bp->blk_dva; 3345 int ndvas = BP_GET_NDVAS(bp); 3346 3347 for (int d = 0; d < ndvas; d++) { 3348 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 3349 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3350 VERIFY(zfs_refcount_not_held( 3351 &mg->mg_alloc_queue_depth[allocator], tag)); 3352 } 3353 #endif 3354 } 3355 3356 static uint64_t 3357 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 3358 { 3359 uint64_t start; 3360 range_tree_t *rt = msp->ms_allocatable; 3361 metaslab_class_t *mc = msp->ms_group->mg_class; 3362 3363 VERIFY(!msp->ms_condensing); 3364 VERIFY0(msp->ms_initializing); 3365 3366 start = mc->mc_ops->msop_alloc(msp, size); 3367 if (start != -1ULL) { 3368 metaslab_group_t *mg = msp->ms_group; 3369 vdev_t *vd = mg->mg_vd; 3370 3371 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 3372 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3373 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 3374 range_tree_remove(rt, start, size); 3375 3376 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3377 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 3378 3379 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 3380 3381 /* Track the last successful allocation */ 3382 msp->ms_alloc_txg = txg; 3383 metaslab_verify_space(msp, txg); 3384 } 3385 3386 /* 3387 * Now that we've attempted the allocation we need to update the 3388 * metaslab's maximum block size since it may have changed. 3389 */ 3390 msp->ms_max_size = metaslab_block_maxsize(msp); 3391 return (start); 3392 } 3393 3394 /* 3395 * Find the metaslab with the highest weight that is less than what we've 3396 * already tried. In the common case, this means that we will examine each 3397 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3398 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3399 * activated by another thread, and we fail to allocate from the metaslab we 3400 * have selected, we may not try the newly-activated metaslab, and instead 3401 * activate another metaslab. This is not optimal, but generally does not cause 3402 * any problems (a possible exception being if every metaslab is completely full 3403 * except for the the newly-activated metaslab which we fail to examine). 3404 */ 3405 static metaslab_t * 3406 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3407 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 3408 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3409 { 3410 avl_index_t idx; 3411 avl_tree_t *t = &mg->mg_metaslab_tree; 3412 metaslab_t *msp = avl_find(t, search, &idx); 3413 if (msp == NULL) 3414 msp = avl_nearest(t, idx, AVL_AFTER); 3415 3416 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3417 int i; 3418 if (!metaslab_should_allocate(msp, asize)) { 3419 metaslab_trace_add(zal, mg, msp, asize, d, 3420 TRACE_TOO_SMALL, allocator); 3421 continue; 3422 } 3423 3424 /* 3425 * If the selected metaslab is condensing or being 3426 * initialized, skip it. 3427 */ 3428 if (msp->ms_condensing || msp->ms_initializing > 0) 3429 continue; 3430 3431 *was_active = msp->ms_allocator != -1; 3432 /* 3433 * If we're activating as primary, this is our first allocation 3434 * from this disk, so we don't need to check how close we are. 3435 * If the metaslab under consideration was already active, 3436 * we're getting desperate enough to steal another allocator's 3437 * metaslab, so we still don't care about distances. 3438 */ 3439 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3440 break; 3441 3442 for (i = 0; i < d; i++) { 3443 if (want_unique && 3444 !metaslab_is_unique(msp, &dva[i])) 3445 break; /* try another metaslab */ 3446 } 3447 if (i == d) 3448 break; 3449 } 3450 3451 if (msp != NULL) { 3452 search->ms_weight = msp->ms_weight; 3453 search->ms_start = msp->ms_start + 1; 3454 search->ms_allocator = msp->ms_allocator; 3455 search->ms_primary = msp->ms_primary; 3456 } 3457 return (msp); 3458 } 3459 3460 /* ARGSUSED */ 3461 static uint64_t 3462 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3463 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3464 int d, int allocator) 3465 { 3466 metaslab_t *msp = NULL; 3467 uint64_t offset = -1ULL; 3468 uint64_t activation_weight; 3469 3470 activation_weight = METASLAB_WEIGHT_PRIMARY; 3471 for (int i = 0; i < d; i++) { 3472 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3473 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3474 activation_weight = METASLAB_WEIGHT_SECONDARY; 3475 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3476 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3477 activation_weight = METASLAB_WEIGHT_CLAIM; 3478 break; 3479 } 3480 } 3481 3482 /* 3483 * If we don't have enough metaslabs active to fill the entire array, we 3484 * just use the 0th slot. 3485 */ 3486 if (mg->mg_ms_ready < mg->mg_allocators * 3) 3487 allocator = 0; 3488 3489 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3490 3491 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3492 search->ms_weight = UINT64_MAX; 3493 search->ms_start = 0; 3494 /* 3495 * At the end of the metaslab tree are the already-active metaslabs, 3496 * first the primaries, then the secondaries. When we resume searching 3497 * through the tree, we need to consider ms_allocator and ms_primary so 3498 * we start in the location right after where we left off, and don't 3499 * accidentally loop forever considering the same metaslabs. 3500 */ 3501 search->ms_allocator = -1; 3502 search->ms_primary = B_TRUE; 3503 for (;;) { 3504 boolean_t was_active = B_FALSE; 3505 3506 mutex_enter(&mg->mg_lock); 3507 3508 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3509 mg->mg_primaries[allocator] != NULL) { 3510 msp = mg->mg_primaries[allocator]; 3511 was_active = B_TRUE; 3512 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3513 mg->mg_secondaries[allocator] != NULL) { 3514 msp = mg->mg_secondaries[allocator]; 3515 was_active = B_TRUE; 3516 } else { 3517 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3518 want_unique, asize, allocator, zal, search, 3519 &was_active); 3520 } 3521 3522 mutex_exit(&mg->mg_lock); 3523 if (msp == NULL) { 3524 kmem_free(search, sizeof (*search)); 3525 return (-1ULL); 3526 } 3527 3528 mutex_enter(&msp->ms_lock); 3529 /* 3530 * Ensure that the metaslab we have selected is still 3531 * capable of handling our request. It's possible that 3532 * another thread may have changed the weight while we 3533 * were blocked on the metaslab lock. We check the 3534 * active status first to see if we need to reselect 3535 * a new metaslab. 3536 */ 3537 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3538 mutex_exit(&msp->ms_lock); 3539 continue; 3540 } 3541 3542 /* 3543 * If the metaslab is freshly activated for an allocator that 3544 * isn't the one we're allocating from, or if it's a primary and 3545 * we're seeking a secondary (or vice versa), we go back and 3546 * select a new metaslab. 3547 */ 3548 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 3549 (msp->ms_allocator != -1) && 3550 (msp->ms_allocator != allocator || ((activation_weight == 3551 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 3552 mutex_exit(&msp->ms_lock); 3553 continue; 3554 } 3555 3556 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 3557 activation_weight != METASLAB_WEIGHT_CLAIM) { 3558 metaslab_passivate(msp, msp->ms_weight & 3559 ~METASLAB_WEIGHT_CLAIM); 3560 mutex_exit(&msp->ms_lock); 3561 continue; 3562 } 3563 3564 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 3565 mutex_exit(&msp->ms_lock); 3566 continue; 3567 } 3568 3569 msp->ms_selected_txg = txg; 3570 3571 /* 3572 * Now that we have the lock, recheck to see if we should 3573 * continue to use this metaslab for this allocation. The 3574 * the metaslab is now loaded so metaslab_should_allocate() can 3575 * accurately determine if the allocation attempt should 3576 * proceed. 3577 */ 3578 if (!metaslab_should_allocate(msp, asize)) { 3579 /* Passivate this metaslab and select a new one. */ 3580 metaslab_trace_add(zal, mg, msp, asize, d, 3581 TRACE_TOO_SMALL, allocator); 3582 goto next; 3583 } 3584 3585 /* 3586 * If this metaslab is currently condensing then pick again as 3587 * we can't manipulate this metaslab until it's committed 3588 * to disk. If this metaslab is being initialized, we shouldn't 3589 * allocate from it since the allocated region might be 3590 * overwritten after allocation. 3591 */ 3592 if (msp->ms_condensing) { 3593 metaslab_trace_add(zal, mg, msp, asize, d, 3594 TRACE_CONDENSING, allocator); 3595 metaslab_passivate(msp, msp->ms_weight & 3596 ~METASLAB_ACTIVE_MASK); 3597 mutex_exit(&msp->ms_lock); 3598 continue; 3599 } else if (msp->ms_initializing > 0) { 3600 metaslab_trace_add(zal, mg, msp, asize, d, 3601 TRACE_INITIALIZING, allocator); 3602 metaslab_passivate(msp, msp->ms_weight & 3603 ~METASLAB_ACTIVE_MASK); 3604 mutex_exit(&msp->ms_lock); 3605 continue; 3606 } 3607 3608 offset = metaslab_block_alloc(msp, asize, txg); 3609 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 3610 3611 if (offset != -1ULL) { 3612 /* Proactively passivate the metaslab, if needed */ 3613 metaslab_segment_may_passivate(msp); 3614 break; 3615 } 3616 next: 3617 ASSERT(msp->ms_loaded); 3618 3619 /* 3620 * We were unable to allocate from this metaslab so determine 3621 * a new weight for this metaslab. Now that we have loaded 3622 * the metaslab we can provide a better hint to the metaslab 3623 * selector. 3624 * 3625 * For space-based metaslabs, we use the maximum block size. 3626 * This information is only available when the metaslab 3627 * is loaded and is more accurate than the generic free 3628 * space weight that was calculated by metaslab_weight(). 3629 * This information allows us to quickly compare the maximum 3630 * available allocation in the metaslab to the allocation 3631 * size being requested. 3632 * 3633 * For segment-based metaslabs, determine the new weight 3634 * based on the highest bucket in the range tree. We 3635 * explicitly use the loaded segment weight (i.e. the range 3636 * tree histogram) since it contains the space that is 3637 * currently available for allocation and is accurate 3638 * even within a sync pass. 3639 */ 3640 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3641 uint64_t weight = metaslab_block_maxsize(msp); 3642 WEIGHT_SET_SPACEBASED(weight); 3643 metaslab_passivate(msp, weight); 3644 } else { 3645 metaslab_passivate(msp, 3646 metaslab_weight_from_range_tree(msp)); 3647 } 3648 3649 /* 3650 * We have just failed an allocation attempt, check 3651 * that metaslab_should_allocate() agrees. Otherwise, 3652 * we may end up in an infinite loop retrying the same 3653 * metaslab. 3654 */ 3655 ASSERT(!metaslab_should_allocate(msp, asize)); 3656 3657 mutex_exit(&msp->ms_lock); 3658 } 3659 mutex_exit(&msp->ms_lock); 3660 kmem_free(search, sizeof (*search)); 3661 return (offset); 3662 } 3663 3664 static uint64_t 3665 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3666 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3667 int d, int allocator) 3668 { 3669 uint64_t offset; 3670 ASSERT(mg->mg_initialized); 3671 3672 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 3673 dva, d, allocator); 3674 3675 mutex_enter(&mg->mg_lock); 3676 if (offset == -1ULL) { 3677 mg->mg_failed_allocations++; 3678 metaslab_trace_add(zal, mg, NULL, asize, d, 3679 TRACE_GROUP_FAILURE, allocator); 3680 if (asize == SPA_GANGBLOCKSIZE) { 3681 /* 3682 * This metaslab group was unable to allocate 3683 * the minimum gang block size so it must be out of 3684 * space. We must notify the allocation throttle 3685 * to start skipping allocation attempts to this 3686 * metaslab group until more space becomes available. 3687 * Note: this failure cannot be caused by the 3688 * allocation throttle since the allocation throttle 3689 * is only responsible for skipping devices and 3690 * not failing block allocations. 3691 */ 3692 mg->mg_no_free_space = B_TRUE; 3693 } 3694 } 3695 mg->mg_allocations++; 3696 mutex_exit(&mg->mg_lock); 3697 return (offset); 3698 } 3699 3700 /* 3701 * Allocate a block for the specified i/o. 3702 */ 3703 int 3704 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3705 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3706 zio_alloc_list_t *zal, int allocator) 3707 { 3708 metaslab_group_t *mg, *rotor; 3709 vdev_t *vd; 3710 boolean_t try_hard = B_FALSE; 3711 3712 ASSERT(!DVA_IS_VALID(&dva[d])); 3713 3714 /* 3715 * For testing, make some blocks above a certain size be gang blocks. 3716 * This will also test spilling from special to normal. 3717 */ 3718 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3719 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3720 allocator); 3721 return (SET_ERROR(ENOSPC)); 3722 } 3723 3724 /* 3725 * Start at the rotor and loop through all mgs until we find something. 3726 * Note that there's no locking on mc_rotor or mc_aliquot because 3727 * nothing actually breaks if we miss a few updates -- we just won't 3728 * allocate quite as evenly. It all balances out over time. 3729 * 3730 * If we are doing ditto or log blocks, try to spread them across 3731 * consecutive vdevs. If we're forced to reuse a vdev before we've 3732 * allocated all of our ditto blocks, then try and spread them out on 3733 * that vdev as much as possible. If it turns out to not be possible, 3734 * gradually lower our standards until anything becomes acceptable. 3735 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3736 * gives us hope of containing our fault domains to something we're 3737 * able to reason about. Otherwise, any two top-level vdev failures 3738 * will guarantee the loss of data. With consecutive allocation, 3739 * only two adjacent top-level vdev failures will result in data loss. 3740 * 3741 * If we are doing gang blocks (hintdva is non-NULL), try to keep 3742 * ourselves on the same vdev as our gang block header. That 3743 * way, we can hope for locality in vdev_cache, plus it makes our 3744 * fault domains something tractable. 3745 */ 3746 if (hintdva) { 3747 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3748 3749 /* 3750 * It's possible the vdev we're using as the hint no 3751 * longer exists or its mg has been closed (e.g. by 3752 * device removal). Consult the rotor when 3753 * all else fails. 3754 */ 3755 if (vd != NULL && vd->vdev_mg != NULL) { 3756 mg = vd->vdev_mg; 3757 3758 if (flags & METASLAB_HINTBP_AVOID && 3759 mg->mg_next != NULL) 3760 mg = mg->mg_next; 3761 } else { 3762 mg = mc->mc_rotor; 3763 } 3764 } else if (d != 0) { 3765 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3766 mg = vd->vdev_mg->mg_next; 3767 } else { 3768 ASSERT(mc->mc_rotor != NULL); 3769 mg = mc->mc_rotor; 3770 } 3771 3772 /* 3773 * If the hint put us into the wrong metaslab class, or into a 3774 * metaslab group that has been passivated, just follow the rotor. 3775 */ 3776 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3777 mg = mc->mc_rotor; 3778 3779 rotor = mg; 3780 top: 3781 do { 3782 boolean_t allocatable; 3783 3784 ASSERT(mg->mg_activation_count == 1); 3785 vd = mg->mg_vd; 3786 3787 /* 3788 * Don't allocate from faulted devices. 3789 */ 3790 if (try_hard) { 3791 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3792 allocatable = vdev_allocatable(vd); 3793 spa_config_exit(spa, SCL_ZIO, FTAG); 3794 } else { 3795 allocatable = vdev_allocatable(vd); 3796 } 3797 3798 /* 3799 * Determine if the selected metaslab group is eligible 3800 * for allocations. If we're ganging then don't allow 3801 * this metaslab group to skip allocations since that would 3802 * inadvertently return ENOSPC and suspend the pool 3803 * even though space is still available. 3804 */ 3805 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3806 allocatable = metaslab_group_allocatable(mg, rotor, 3807 psize, allocator); 3808 } 3809 3810 if (!allocatable) { 3811 metaslab_trace_add(zal, mg, NULL, psize, d, 3812 TRACE_NOT_ALLOCATABLE, allocator); 3813 goto next; 3814 } 3815 3816 ASSERT(mg->mg_initialized); 3817 3818 /* 3819 * Avoid writing single-copy data to a failing, 3820 * non-redundant vdev, unless we've already tried all 3821 * other vdevs. 3822 */ 3823 if ((vd->vdev_stat.vs_write_errors > 0 || 3824 vd->vdev_state < VDEV_STATE_HEALTHY) && 3825 d == 0 && !try_hard && vd->vdev_children == 0) { 3826 metaslab_trace_add(zal, mg, NULL, psize, d, 3827 TRACE_VDEV_ERROR, allocator); 3828 goto next; 3829 } 3830 3831 ASSERT(mg->mg_class == mc); 3832 3833 uint64_t asize = vdev_psize_to_asize(vd, psize); 3834 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3835 3836 /* 3837 * If we don't need to try hard, then require that the 3838 * block be on an different metaslab from any other DVAs 3839 * in this BP (unique=true). If we are trying hard, then 3840 * allow any metaslab to be used (unique=false). 3841 */ 3842 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3843 !try_hard, dva, d, allocator); 3844 3845 if (offset != -1ULL) { 3846 /* 3847 * If we've just selected this metaslab group, 3848 * figure out whether the corresponding vdev is 3849 * over- or under-used relative to the pool, 3850 * and set an allocation bias to even it out. 3851 */ 3852 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3853 vdev_stat_t *vs = &vd->vdev_stat; 3854 int64_t vu, cu; 3855 3856 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3857 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3858 3859 /* 3860 * Calculate how much more or less we should 3861 * try to allocate from this device during 3862 * this iteration around the rotor. 3863 * For example, if a device is 80% full 3864 * and the pool is 20% full then we should 3865 * reduce allocations by 60% on this device. 3866 * 3867 * mg_bias = (20 - 80) * 512K / 100 = -307K 3868 * 3869 * This reduces allocations by 307K for this 3870 * iteration. 3871 */ 3872 mg->mg_bias = ((cu - vu) * 3873 (int64_t)mg->mg_aliquot) / 100; 3874 } else if (!metaslab_bias_enabled) { 3875 mg->mg_bias = 0; 3876 } 3877 3878 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3879 mg->mg_aliquot + mg->mg_bias) { 3880 mc->mc_rotor = mg->mg_next; 3881 mc->mc_aliquot = 0; 3882 } 3883 3884 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3885 DVA_SET_OFFSET(&dva[d], offset); 3886 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3887 DVA_SET_ASIZE(&dva[d], asize); 3888 3889 return (0); 3890 } 3891 next: 3892 mc->mc_rotor = mg->mg_next; 3893 mc->mc_aliquot = 0; 3894 } while ((mg = mg->mg_next) != rotor); 3895 3896 /* 3897 * If we haven't tried hard, do so now. 3898 */ 3899 if (!try_hard) { 3900 try_hard = B_TRUE; 3901 goto top; 3902 } 3903 3904 bzero(&dva[d], sizeof (dva_t)); 3905 3906 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 3907 return (SET_ERROR(ENOSPC)); 3908 } 3909 3910 void 3911 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3912 boolean_t checkpoint) 3913 { 3914 metaslab_t *msp; 3915 spa_t *spa = vd->vdev_spa; 3916 3917 ASSERT(vdev_is_concrete(vd)); 3918 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3919 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3920 3921 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3922 3923 VERIFY(!msp->ms_condensing); 3924 VERIFY3U(offset, >=, msp->ms_start); 3925 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 3926 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3927 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 3928 3929 metaslab_check_free_impl(vd, offset, asize); 3930 3931 mutex_enter(&msp->ms_lock); 3932 if (range_tree_is_empty(msp->ms_freeing) && 3933 range_tree_is_empty(msp->ms_checkpointing)) { 3934 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 3935 } 3936 3937 if (checkpoint) { 3938 ASSERT(spa_has_checkpoint(spa)); 3939 range_tree_add(msp->ms_checkpointing, offset, asize); 3940 } else { 3941 range_tree_add(msp->ms_freeing, offset, asize); 3942 } 3943 mutex_exit(&msp->ms_lock); 3944 } 3945 3946 /* ARGSUSED */ 3947 void 3948 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3949 uint64_t size, void *arg) 3950 { 3951 boolean_t *checkpoint = arg; 3952 3953 ASSERT3P(checkpoint, !=, NULL); 3954 3955 if (vd->vdev_ops->vdev_op_remap != NULL) 3956 vdev_indirect_mark_obsolete(vd, offset, size); 3957 else 3958 metaslab_free_impl(vd, offset, size, *checkpoint); 3959 } 3960 3961 static void 3962 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 3963 boolean_t checkpoint) 3964 { 3965 spa_t *spa = vd->vdev_spa; 3966 3967 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3968 3969 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 3970 return; 3971 3972 if (spa->spa_vdev_removal != NULL && 3973 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 3974 vdev_is_concrete(vd)) { 3975 /* 3976 * Note: we check if the vdev is concrete because when 3977 * we complete the removal, we first change the vdev to be 3978 * an indirect vdev (in open context), and then (in syncing 3979 * context) clear spa_vdev_removal. 3980 */ 3981 free_from_removing_vdev(vd, offset, size); 3982 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 3983 vdev_indirect_mark_obsolete(vd, offset, size); 3984 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3985 metaslab_free_impl_cb, &checkpoint); 3986 } else { 3987 metaslab_free_concrete(vd, offset, size, checkpoint); 3988 } 3989 } 3990 3991 typedef struct remap_blkptr_cb_arg { 3992 blkptr_t *rbca_bp; 3993 spa_remap_cb_t rbca_cb; 3994 vdev_t *rbca_remap_vd; 3995 uint64_t rbca_remap_offset; 3996 void *rbca_cb_arg; 3997 } remap_blkptr_cb_arg_t; 3998 3999 void 4000 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4001 uint64_t size, void *arg) 4002 { 4003 remap_blkptr_cb_arg_t *rbca = arg; 4004 blkptr_t *bp = rbca->rbca_bp; 4005 4006 /* We can not remap split blocks. */ 4007 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 4008 return; 4009 ASSERT0(inner_offset); 4010 4011 if (rbca->rbca_cb != NULL) { 4012 /* 4013 * At this point we know that we are not handling split 4014 * blocks and we invoke the callback on the previous 4015 * vdev which must be indirect. 4016 */ 4017 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 4018 4019 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 4020 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 4021 4022 /* set up remap_blkptr_cb_arg for the next call */ 4023 rbca->rbca_remap_vd = vd; 4024 rbca->rbca_remap_offset = offset; 4025 } 4026 4027 /* 4028 * The phys birth time is that of dva[0]. This ensures that we know 4029 * when each dva was written, so that resilver can determine which 4030 * blocks need to be scrubbed (i.e. those written during the time 4031 * the vdev was offline). It also ensures that the key used in 4032 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 4033 * we didn't change the phys_birth, a lookup in the ARC for a 4034 * remapped BP could find the data that was previously stored at 4035 * this vdev + offset. 4036 */ 4037 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 4038 DVA_GET_VDEV(&bp->blk_dva[0])); 4039 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 4040 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 4041 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 4042 4043 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 4044 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 4045 } 4046 4047 /* 4048 * If the block pointer contains any indirect DVAs, modify them to refer to 4049 * concrete DVAs. Note that this will sometimes not be possible, leaving 4050 * the indirect DVA in place. This happens if the indirect DVA spans multiple 4051 * segments in the mapping (i.e. it is a "split block"). 4052 * 4053 * If the BP was remapped, calls the callback on the original dva (note the 4054 * callback can be called multiple times if the original indirect DVA refers 4055 * to another indirect DVA, etc). 4056 * 4057 * Returns TRUE if the BP was remapped. 4058 */ 4059 boolean_t 4060 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 4061 { 4062 remap_blkptr_cb_arg_t rbca; 4063 4064 if (!zfs_remap_blkptr_enable) 4065 return (B_FALSE); 4066 4067 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 4068 return (B_FALSE); 4069 4070 /* 4071 * Dedup BP's can not be remapped, because ddt_phys_select() depends 4072 * on DVA[0] being the same in the BP as in the DDT (dedup table). 4073 */ 4074 if (BP_GET_DEDUP(bp)) 4075 return (B_FALSE); 4076 4077 /* 4078 * Gang blocks can not be remapped, because 4079 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 4080 * the BP used to read the gang block header (GBH) being the same 4081 * as the DVA[0] that we allocated for the GBH. 4082 */ 4083 if (BP_IS_GANG(bp)) 4084 return (B_FALSE); 4085 4086 /* 4087 * Embedded BP's have no DVA to remap. 4088 */ 4089 if (BP_GET_NDVAS(bp) < 1) 4090 return (B_FALSE); 4091 4092 /* 4093 * Note: we only remap dva[0]. If we remapped other dvas, we 4094 * would no longer know what their phys birth txg is. 4095 */ 4096 dva_t *dva = &bp->blk_dva[0]; 4097 4098 uint64_t offset = DVA_GET_OFFSET(dva); 4099 uint64_t size = DVA_GET_ASIZE(dva); 4100 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 4101 4102 if (vd->vdev_ops->vdev_op_remap == NULL) 4103 return (B_FALSE); 4104 4105 rbca.rbca_bp = bp; 4106 rbca.rbca_cb = callback; 4107 rbca.rbca_remap_vd = vd; 4108 rbca.rbca_remap_offset = offset; 4109 rbca.rbca_cb_arg = arg; 4110 4111 /* 4112 * remap_blkptr_cb() will be called in order for each level of 4113 * indirection, until a concrete vdev is reached or a split block is 4114 * encountered. old_vd and old_offset are updated within the callback 4115 * as we go from the one indirect vdev to the next one (either concrete 4116 * or indirect again) in that order. 4117 */ 4118 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 4119 4120 /* Check if the DVA wasn't remapped because it is a split block */ 4121 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 4122 return (B_FALSE); 4123 4124 return (B_TRUE); 4125 } 4126 4127 /* 4128 * Undo the allocation of a DVA which happened in the given transaction group. 4129 */ 4130 void 4131 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4132 { 4133 metaslab_t *msp; 4134 vdev_t *vd; 4135 uint64_t vdev = DVA_GET_VDEV(dva); 4136 uint64_t offset = DVA_GET_OFFSET(dva); 4137 uint64_t size = DVA_GET_ASIZE(dva); 4138 4139 ASSERT(DVA_IS_VALID(dva)); 4140 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4141 4142 if (txg > spa_freeze_txg(spa)) 4143 return; 4144 4145 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 4146 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 4147 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 4148 (u_longlong_t)vdev, (u_longlong_t)offset); 4149 ASSERT(0); 4150 return; 4151 } 4152 4153 ASSERT(!vd->vdev_removing); 4154 ASSERT(vdev_is_concrete(vd)); 4155 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 4156 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 4157 4158 if (DVA_GET_GANG(dva)) 4159 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4160 4161 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4162 4163 mutex_enter(&msp->ms_lock); 4164 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 4165 offset, size); 4166 4167 VERIFY(!msp->ms_condensing); 4168 VERIFY3U(offset, >=, msp->ms_start); 4169 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 4170 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 4171 msp->ms_size); 4172 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4173 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4174 range_tree_add(msp->ms_allocatable, offset, size); 4175 mutex_exit(&msp->ms_lock); 4176 } 4177 4178 /* 4179 * Free the block represented by the given DVA. 4180 */ 4181 void 4182 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 4183 { 4184 uint64_t vdev = DVA_GET_VDEV(dva); 4185 uint64_t offset = DVA_GET_OFFSET(dva); 4186 uint64_t size = DVA_GET_ASIZE(dva); 4187 vdev_t *vd = vdev_lookup_top(spa, vdev); 4188 4189 ASSERT(DVA_IS_VALID(dva)); 4190 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4191 4192 if (DVA_GET_GANG(dva)) { 4193 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4194 } 4195 4196 metaslab_free_impl(vd, offset, size, checkpoint); 4197 } 4198 4199 /* 4200 * Reserve some allocation slots. The reservation system must be called 4201 * before we call into the allocator. If there aren't any available slots 4202 * then the I/O will be throttled until an I/O completes and its slots are 4203 * freed up. The function returns true if it was successful in placing 4204 * the reservation. 4205 */ 4206 boolean_t 4207 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 4208 zio_t *zio, int flags) 4209 { 4210 uint64_t available_slots = 0; 4211 boolean_t slot_reserved = B_FALSE; 4212 uint64_t max = mc->mc_alloc_max_slots[allocator]; 4213 4214 ASSERT(mc->mc_alloc_throttle_enabled); 4215 mutex_enter(&mc->mc_lock); 4216 4217 uint64_t reserved_slots = 4218 zfs_refcount_count(&mc->mc_alloc_slots[allocator]); 4219 if (reserved_slots < max) 4220 available_slots = max - reserved_slots; 4221 4222 if (slots <= available_slots || GANG_ALLOCATION(flags) || 4223 flags & METASLAB_MUST_RESERVE) { 4224 /* 4225 * We reserve the slots individually so that we can unreserve 4226 * them individually when an I/O completes. 4227 */ 4228 for (int d = 0; d < slots; d++) { 4229 reserved_slots = 4230 zfs_refcount_add(&mc->mc_alloc_slots[allocator], 4231 zio); 4232 } 4233 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 4234 slot_reserved = B_TRUE; 4235 } 4236 4237 mutex_exit(&mc->mc_lock); 4238 return (slot_reserved); 4239 } 4240 4241 void 4242 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 4243 int allocator, zio_t *zio) 4244 { 4245 ASSERT(mc->mc_alloc_throttle_enabled); 4246 mutex_enter(&mc->mc_lock); 4247 for (int d = 0; d < slots; d++) { 4248 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], 4249 zio); 4250 } 4251 mutex_exit(&mc->mc_lock); 4252 } 4253 4254 static int 4255 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 4256 uint64_t txg) 4257 { 4258 metaslab_t *msp; 4259 spa_t *spa = vd->vdev_spa; 4260 int error = 0; 4261 4262 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 4263 return (ENXIO); 4264 4265 ASSERT3P(vd->vdev_ms, !=, NULL); 4266 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4267 4268 mutex_enter(&msp->ms_lock); 4269 4270 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 4271 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 4272 /* 4273 * No need to fail in that case; someone else has activated the 4274 * metaslab, but that doesn't preclude us from using it. 4275 */ 4276 if (error == EBUSY) 4277 error = 0; 4278 4279 if (error == 0 && 4280 !range_tree_contains(msp->ms_allocatable, offset, size)) 4281 error = SET_ERROR(ENOENT); 4282 4283 if (error || txg == 0) { /* txg == 0 indicates dry run */ 4284 mutex_exit(&msp->ms_lock); 4285 return (error); 4286 } 4287 4288 VERIFY(!msp->ms_condensing); 4289 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4290 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4291 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 4292 msp->ms_size); 4293 range_tree_remove(msp->ms_allocatable, offset, size); 4294 4295 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 4296 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4297 vdev_dirty(vd, VDD_METASLAB, msp, txg); 4298 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 4299 offset, size); 4300 } 4301 4302 mutex_exit(&msp->ms_lock); 4303 4304 return (0); 4305 } 4306 4307 typedef struct metaslab_claim_cb_arg_t { 4308 uint64_t mcca_txg; 4309 int mcca_error; 4310 } metaslab_claim_cb_arg_t; 4311 4312 /* ARGSUSED */ 4313 static void 4314 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4315 uint64_t size, void *arg) 4316 { 4317 metaslab_claim_cb_arg_t *mcca_arg = arg; 4318 4319 if (mcca_arg->mcca_error == 0) { 4320 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 4321 size, mcca_arg->mcca_txg); 4322 } 4323 } 4324 4325 int 4326 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 4327 { 4328 if (vd->vdev_ops->vdev_op_remap != NULL) { 4329 metaslab_claim_cb_arg_t arg; 4330 4331 /* 4332 * Only zdb(1M) can claim on indirect vdevs. This is used 4333 * to detect leaks of mapped space (that are not accounted 4334 * for in the obsolete counts, spacemap, or bpobj). 4335 */ 4336 ASSERT(!spa_writeable(vd->vdev_spa)); 4337 arg.mcca_error = 0; 4338 arg.mcca_txg = txg; 4339 4340 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4341 metaslab_claim_impl_cb, &arg); 4342 4343 if (arg.mcca_error == 0) { 4344 arg.mcca_error = metaslab_claim_concrete(vd, 4345 offset, size, txg); 4346 } 4347 return (arg.mcca_error); 4348 } else { 4349 return (metaslab_claim_concrete(vd, offset, size, txg)); 4350 } 4351 } 4352 4353 /* 4354 * Intent log support: upon opening the pool after a crash, notify the SPA 4355 * of blocks that the intent log has allocated for immediate write, but 4356 * which are still considered free by the SPA because the last transaction 4357 * group didn't commit yet. 4358 */ 4359 static int 4360 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4361 { 4362 uint64_t vdev = DVA_GET_VDEV(dva); 4363 uint64_t offset = DVA_GET_OFFSET(dva); 4364 uint64_t size = DVA_GET_ASIZE(dva); 4365 vdev_t *vd; 4366 4367 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 4368 return (SET_ERROR(ENXIO)); 4369 } 4370 4371 ASSERT(DVA_IS_VALID(dva)); 4372 4373 if (DVA_GET_GANG(dva)) 4374 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4375 4376 return (metaslab_claim_impl(vd, offset, size, txg)); 4377 } 4378 4379 int 4380 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 4381 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 4382 zio_alloc_list_t *zal, zio_t *zio, int allocator) 4383 { 4384 dva_t *dva = bp->blk_dva; 4385 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; 4386 int error = 0; 4387 4388 ASSERT(bp->blk_birth == 0); 4389 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 4390 4391 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4392 4393 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 4394 spa_config_exit(spa, SCL_ALLOC, FTAG); 4395 return (SET_ERROR(ENOSPC)); 4396 } 4397 4398 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 4399 ASSERT(BP_GET_NDVAS(bp) == 0); 4400 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 4401 ASSERT3P(zal, !=, NULL); 4402 4403 for (int d = 0; d < ndvas; d++) { 4404 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 4405 txg, flags, zal, allocator); 4406 if (error != 0) { 4407 for (d--; d >= 0; d--) { 4408 metaslab_unalloc_dva(spa, &dva[d], txg); 4409 metaslab_group_alloc_decrement(spa, 4410 DVA_GET_VDEV(&dva[d]), zio, flags, 4411 allocator, B_FALSE); 4412 bzero(&dva[d], sizeof (dva_t)); 4413 } 4414 spa_config_exit(spa, SCL_ALLOC, FTAG); 4415 return (error); 4416 } else { 4417 /* 4418 * Update the metaslab group's queue depth 4419 * based on the newly allocated dva. 4420 */ 4421 metaslab_group_alloc_increment(spa, 4422 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 4423 } 4424 4425 } 4426 ASSERT(error == 0); 4427 ASSERT(BP_GET_NDVAS(bp) == ndvas); 4428 4429 spa_config_exit(spa, SCL_ALLOC, FTAG); 4430 4431 BP_SET_BIRTH(bp, txg, txg); 4432 4433 return (0); 4434 } 4435 4436 void 4437 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 4438 { 4439 const dva_t *dva = bp->blk_dva; 4440 int ndvas = BP_GET_NDVAS(bp); 4441 4442 ASSERT(!BP_IS_HOLE(bp)); 4443 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 4444 4445 /* 4446 * If we have a checkpoint for the pool we need to make sure that 4447 * the blocks that we free that are part of the checkpoint won't be 4448 * reused until the checkpoint is discarded or we revert to it. 4449 * 4450 * The checkpoint flag is passed down the metaslab_free code path 4451 * and is set whenever we want to add a block to the checkpoint's 4452 * accounting. That is, we "checkpoint" blocks that existed at the 4453 * time the checkpoint was created and are therefore referenced by 4454 * the checkpointed uberblock. 4455 * 4456 * Note that, we don't checkpoint any blocks if the current 4457 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 4458 * normally as they will be referenced by the checkpointed uberblock. 4459 */ 4460 boolean_t checkpoint = B_FALSE; 4461 if (bp->blk_birth <= spa->spa_checkpoint_txg && 4462 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 4463 /* 4464 * At this point, if the block is part of the checkpoint 4465 * there is no way it was created in the current txg. 4466 */ 4467 ASSERT(!now); 4468 ASSERT3U(spa_syncing_txg(spa), ==, txg); 4469 checkpoint = B_TRUE; 4470 } 4471 4472 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 4473 4474 for (int d = 0; d < ndvas; d++) { 4475 if (now) { 4476 metaslab_unalloc_dva(spa, &dva[d], txg); 4477 } else { 4478 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 4479 metaslab_free_dva(spa, &dva[d], checkpoint); 4480 } 4481 } 4482 4483 spa_config_exit(spa, SCL_FREE, FTAG); 4484 } 4485 4486 int 4487 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 4488 { 4489 const dva_t *dva = bp->blk_dva; 4490 int ndvas = BP_GET_NDVAS(bp); 4491 int error = 0; 4492 4493 ASSERT(!BP_IS_HOLE(bp)); 4494 4495 if (txg != 0) { 4496 /* 4497 * First do a dry run to make sure all DVAs are claimable, 4498 * so we don't have to unwind from partial failures below. 4499 */ 4500 if ((error = metaslab_claim(spa, bp, 0)) != 0) 4501 return (error); 4502 } 4503 4504 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4505 4506 for (int d = 0; d < ndvas; d++) { 4507 error = metaslab_claim_dva(spa, &dva[d], txg); 4508 if (error != 0) 4509 break; 4510 } 4511 4512 spa_config_exit(spa, SCL_ALLOC, FTAG); 4513 4514 ASSERT(error == 0 || txg == 0); 4515 4516 return (error); 4517 } 4518 4519 /* ARGSUSED */ 4520 static void 4521 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 4522 uint64_t size, void *arg) 4523 { 4524 if (vd->vdev_ops == &vdev_indirect_ops) 4525 return; 4526 4527 metaslab_check_free_impl(vd, offset, size); 4528 } 4529 4530 static void 4531 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 4532 { 4533 metaslab_t *msp; 4534 spa_t *spa = vd->vdev_spa; 4535 4536 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4537 return; 4538 4539 if (vd->vdev_ops->vdev_op_remap != NULL) { 4540 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4541 metaslab_check_free_impl_cb, NULL); 4542 return; 4543 } 4544 4545 ASSERT(vdev_is_concrete(vd)); 4546 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 4547 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4548 4549 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4550 4551 mutex_enter(&msp->ms_lock); 4552 if (msp->ms_loaded) { 4553 range_tree_verify_not_present(msp->ms_allocatable, 4554 offset, size); 4555 } 4556 4557 range_tree_verify_not_present(msp->ms_freeing, offset, size); 4558 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 4559 range_tree_verify_not_present(msp->ms_freed, offset, size); 4560 for (int j = 0; j < TXG_DEFER_SIZE; j++) 4561 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 4562 mutex_exit(&msp->ms_lock); 4563 } 4564 4565 void 4566 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 4567 { 4568 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4569 return; 4570 4571 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4572 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 4573 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 4574 vdev_t *vd = vdev_lookup_top(spa, vdev); 4575 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 4576 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 4577 4578 if (DVA_GET_GANG(&bp->blk_dva[i])) 4579 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4580 4581 ASSERT3P(vd, !=, NULL); 4582 4583 metaslab_check_free_impl(vd, offset, size); 4584 } 4585 spa_config_exit(spa, SCL_VDEV, FTAG); 4586 } 4587