/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2017, Intel Corporation.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Since we can touch multiple metaslabs (and their respective space maps)
 * with each transaction group, we benefit from having a smaller space map
 * block size since it allows us to issue more I/O operations scattered
 * around the disk.
 */
int zfs_metaslab_sm_blksz = (1 << 12);

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslab.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);

kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < spa->spa_alloc_count; i++)
		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (zfs_refcount_t));
	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (uint64_t));
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	spa_t *spa = mc->mc_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by efi system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	int sort1 = 0;
	int sort2 = 0;
	if (m1->ms_allocator != -1 && m1->ms_primary)
		sort1 = 1;
	else if (m1->ms_allocator != -1 && !m1->ms_primary)
		sort1 = 2;
	if (m2->ms_allocator != -1 && m2->ms_primary)
		sort2 = 1;
	else if (m2->ms_allocator != -1 && !m2->ms_primary)
		sort2 = 2;

	/*
	 * Sort inactive metaslabs first, then primaries, then secondaries. When
	 * selecting a metaslab to allocate from, an allocator first tries its
	 * primary, then secondary active metaslab. If it doesn't have active
	 * metaslabs, or can't allocate from them, it searches for an inactive
	 * metaslab to activate. If it can't find a suitable one, it will steal
	 * a primary or secondary metaslab from another allocator.
	 */
	if (sort1 < sort2)
		return (-1);
	if (sort1 > sort2)
		return (1);

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_start < m2->ms_start)
		return (-1);
	if (m1->ms_start > m2->ms_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an allocated
	 * space map. Calling this in non-syncing context does not
	 * provide a consistent view of the metaslab since we're performing
	 * allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	    space_map_alloc_delta(msp->ms_sm);

	/*
	 * Account for future allocations since we would have already
	 * deducted that space from the ms_freetree.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocated +=
		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
	}

	msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
	    msp->ms_deferspace + range_tree_space(msp->ms_freed);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the group's free capacity
 * is greater than the zfs_mg_noalloc_threshold and its fragmentation
 * value is not greater than zfs_mg_fragmentation_threshold. If a
 * metaslab group transitions from allocatable to non-allocatable or
 * vice versa then the metaslab group's class is updated to reflect
 * the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
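	/*
	 * For example (illustrative only): with the default thresholds
	 * (zfs_mg_noalloc_threshold = 0, zfs_mg_fragmentation_threshold = 85),
	 * a group with 1% free capacity and 90% fragmentation is marked
	 * non-allocatable, while the same group at 60% fragmentation
	 * remains allocatable.
	 */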
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	mg->mg_allocators = allocators;

	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < allocators; i++) {
		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
	kmem_free(mg->mg_secondaries, mg->mg_allocators *
	    sizeof (metaslab_t *));
	mutex_destroy(&mg->mg_lock);
	mutex_destroy(&mg->mg_ms_initialize_lock);
	cv_destroy(&mg->mg_ms_initialize_cv);

	for (int i = 0; i < mg->mg_allocators; i++) {
		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}
	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
	    sizeof (zfs_refcount_t));
	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
	    sizeof (uint64_t));

	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);
	for (int i = 0; i < mg->mg_allocators; i++) {
		metaslab_t *msp = mg->mg_primaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
		msp = mg->mg_secondaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		/* skip if not active or not a member */
		if (msp->ms_sm == NULL || msp->ms_group != mg)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	ASSERT(MUTEX_HELD(&mg->mg_lock));
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	metaslab_group_sort_impl(mg, msp, weight);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
 * this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;
		if (msp->ms_group != mg)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize, int allocator)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if ((mc != spa_normal_class(spa) &&
	    mc != spa_special_class(spa) &&
	    mc != spa_dedup_class(spa)) ||
	    mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = zfs_refcount_count(
		    &mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];

			qdepth = zfs_refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	if (rs_size1 < rs_size2)
		return (-1);
	if (rs_size1 > rs_size2)
		return (1);

	if (r1->rs_start < r2->rs_start)
		return (-1);
	if (r1->rs_start > r2->rs_start)
		return (1);

	return (0);
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT(msp->ms_allocatable == NULL);

	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));

	avl_destroy(&msp->ms_allocatable_by_size);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_allocatable_by_size, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_allocatable_by_size, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = start;
	rsearch.rs_end = start + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL) {
		rs = avl_nearest(t, where, AVL_AFTER);
	}

	return (rs);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs = metaslab_block_find(t, *cursor, size);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_allocatable->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first-fit allocation scheme until space gets low and then
 * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_allocatable_by_size;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;

		rs = avl_last(&msp->ms_allocatable_by_size);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
			return (-1ULL);

		*cursor = rs->rs_start;
		*cursor_end = rs->rs_end;
	}

	offset = *cursor;
	*cursor += size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

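/*
 * For example (illustrative only): with the default clump shift of 4, an 8K
 * request (highbit64(8K) == 14) that misses at its cursor will search the
 * size-sorted tree for a segment of up to
 * MIN(max_size, 1ULL << (14 + 4)) = 256K, i.e. room for a clump of similar
 * allocations, before settling for the largest remaining segment.
 */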
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *t = &msp->ms_allocatable->rt_root;
	avl_index_t where;
	range_seg_t *rs, rsearch;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_size = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		t = &msp->ms_allocatable_by_size;

		rsearch.rs_start = 0;
		rsearch.rs_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = avl_nearest(t, where, AVL_AFTER);
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) >= size) {
		*cursor = rs->rs_start + size;
		return (rs->rs_start);
	}
	return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

/*
 * Wait for any in-progress metaslab loads to complete.
 */
static void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

static int
metaslab_load_impl(metaslab_t *msp)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loading);

	/*
	 * Nobody else can manipulate a loading metaslab, so it's now safe
	 * to drop the lock. This way we don't have to hold the lock while
	 * reading the spacemap from disk.
	 */
	mutex_exit(&msp->ms_lock);

	/*
	 * If the space map has not been allocated yet, then treat
	 * all the space in the metaslab as free and add it to ms_allocatable.
	 */
	if (msp->ms_sm != NULL) {
		error = space_map_load(msp->ms_sm, msp->ms_allocatable,
		    SM_FREE);
	} else {
		range_tree_add(msp->ms_allocatable,
		    msp->ms_start, msp->ms_size);
	}

	mutex_enter(&msp->ms_lock);

	if (error != 0)
		return (error);

	ASSERT3P(msp->ms_group, !=, NULL);
	msp->ms_loaded = B_TRUE;

	/*
	 * If the metaslab already has a spacemap, then we need to
	 * remove all segments from the defer tree; otherwise, the
	 * metaslab is completely empty and we can skip this.
	 */
	if (msp->ms_sm != NULL) {
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			range_tree_walk(msp->ms_defer[t],
			    range_tree_remove, msp->ms_allocatable);
		}
	}
	msp->ms_max_size = metaslab_block_maxsize(msp);

	return (0);
}

int
metaslab_load(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * There may be another thread loading the same metaslab; if that's
	 * the case just wait until the other thread is done and return.
	 */
	metaslab_load_wait(msp);
	if (msp->ms_loaded)
		return (0);
	VERIFY(!msp->ms_loading);

	msp->ms_loading = B_TRUE;
	int error = metaslab_load_impl(msp);
	msp->ms_loading = B_FALSE;
	cv_broadcast(&msp->ms_load_cv);

	return (error);
}

void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
	msp->ms_max_size = 0;
}

static void
metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta)
{
	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);

	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
	ASSERT(vd->vdev_ms_count != 0);

	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
	    vdev_deflated_space(vd, space_delta));
}

int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
    metaslab_t **msp)
{
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	metaslab_t *ms;
	int error;

	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);

	ms->ms_id = id;
	ms->ms_start = id << vd->vdev_ms_shift;
	ms->ms_size = 1ULL << vd->vdev_ms_shift;
	ms->ms_allocator = -1;
	ms->ms_new = B_TRUE;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for it.
	 */
	if (object != 0) {
		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
		    ms->ms_size, vd->vdev_ashift);

		if (error != 0) {
			kmem_free(ms, sizeof (metaslab_t));
			return (error);
		}

		ASSERT(ms->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * other range trees until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
	metaslab_group_add(mg, ms);

	metaslab_set_fragmentation(ms);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * The metaslab's weight will also be initialized when we sync
	 * out this txg. This ensures that we don't attempt to allocate
	 * from it before we have initialized it completely.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(ms, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space map object then load the space map
	 * so that we can verify frees.
	 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
		mutex_enter(&ms->ms_lock);
		VERIFY0(metaslab_load(ms));
		mutex_exit(&ms->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);
	VERIFY(msp->ms_group == NULL);
	metaslab_space_update(vd, mg->mg_class,
	    -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);

	space_map_close(msp->ms_sm);

	metaslab_unload(msp);

	range_tree_destroy(msp->ms_allocatable);
	range_tree_destroy(msp->ms_freeing);
	range_tree_destroy(msp->ms_freed);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_allocating[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defer[t]);
	}
	ASSERT0(msp->ms_deferspace);

	range_tree_destroy(msp->ms_checkpointing);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);
	mutex_destroy(&msp->ms_sync_lock);
	ASSERT3U(msp->ms_allocator, ==, -1);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};
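
/*
 * Worked example (illustrative only): if half of a metaslab's free space
 * sits in the 128K bucket (factor 50) and the other half in the 4M bucket
 * (factor 10), the space-weighted average yields a fragmentation metric of
 * (50 + 10) / 2 = 30%.
 */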

/*
 * Calculate the metaslab's fragmentation metric. A return value
 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
 * not support this metric. Otherwise, the return value should be in the
 * range [0, 100].
 */
static void
metaslab_set_fragmentation(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled) {
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
	 */
	if (msp->ms_sm == NULL) {
		msp->ms_fragmentation = 0;
		return;
	}

	/*
	 * If this metaslab's space map has not been upgraded, flag it
	 * so that we upgrade next time we encounter it.
	 */
	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
		uint64_t txg = spa_syncing_txg(spa);
		vdev_t *vd = msp->ms_group->mg_vd;

		/*
		 * If we've reached the final dirty txg, then we must
		 * be shutting down the pool. We don't want to dirty
		 * any data past this point so skip setting the condense
		 * flag. We can retry this action the next time the pool
		 * is imported.
		 */
		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
			msp->ms_condense_wanted = B_TRUE;
			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
			zfs_dbgmsg("txg %llu, requesting force condense: "
			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
			    vd->vdev_id);
		}
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t space = 0;
		uint8_t shift = msp->ms_sm->sm_shift;

		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
			continue;

		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
		total += space;

		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
		fragmentation += space * zfs_frag_table[idx];
	}

	if (total > 0)
		fragmentation /= total;
	ASSERT3U(fragmentation, <=, 100);

	msp->ms_fragmentation = fragmentation;
}

/*
 * Compute a weight -- a selection preference value -- for the given metaslab.
 * This is based on the amount of free space, the level of fragmentation,
 * the LBA range, and whether the metaslab is loaded.
 */
static uint64_t
metaslab_space_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!vd->vdev_removing);

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = msp->ms_size - space_map_allocated(msp->ms_sm);

	if (metaslab_fragmentation_factor_enabled &&
	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
		/*
		 * Use the fragmentation information to inversely scale
		 * down the baseline weight. We need to ensure that we
		 * don't exclude this metaslab completely when it's 100%
		 * fragmented. To avoid this we reduce the fragmented value
		 * by 1.
		 */
		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

		/*
		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. The fragmentation metric may have
		 * decreased the space to something smaller than
		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
		 * so that we can consume any remaining space.
1853 */ 1854 if (space > 0 && space < SPA_MINBLOCKSIZE) 1855 space = SPA_MINBLOCKSIZE; 1856 } 1857 weight = space; 1858 1859 /* 1860 * Modern disks have uniform bit density and constant angular velocity. 1861 * Therefore, the outer recording zones are faster (higher bandwidth) 1862 * than the inner zones by the ratio of outer to inner track diameter, 1863 * which is typically around 2:1. We account for this by assigning 1864 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1865 * In effect, this means that we'll select the metaslab with the most 1866 * free bandwidth rather than simply the one with the most free space. 1867 */ 1868 if (metaslab_lba_weighting_enabled) { 1869 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1870 ASSERT(weight >= space && weight <= 2 * space); 1871 } 1872 1873 /* 1874 * If this metaslab is one we're actively using, adjust its 1875 * weight to make it preferable to any inactive metaslab so 1876 * we'll polish it off. If the fragmentation on this metaslab 1877 * has exceeded our threshold, then don't mark it active. 1878 */ 1879 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1880 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1881 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1882 } 1883 1884 WEIGHT_SET_SPACEBASED(weight); 1885 return (weight); 1886 } 1887 1888 /* 1889 * Return the weight of the specified metaslab, according to the segment-based 1890 * weighting algorithm. The metaslab must be loaded. This function can 1891 * be called within a sync pass since it relies only on the metaslab's 1892 * range tree which is always accurate when the metaslab is loaded. 1893 */ 1894 static uint64_t 1895 metaslab_weight_from_range_tree(metaslab_t *msp) 1896 { 1897 uint64_t weight = 0; 1898 uint32_t segments = 0; 1899 1900 ASSERT(msp->ms_loaded); 1901 1902 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 1903 i--) { 1904 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 1905 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1906 1907 segments <<= 1; 1908 segments += msp->ms_allocatable->rt_histogram[i]; 1909 1910 /* 1911 * The range tree provides more precision than the space map 1912 * and must be downgraded so that all values fit within the 1913 * space map's histogram. This allows us to compare loaded 1914 * vs. unloaded metaslabs to determine which metaslab is 1915 * considered "best". 1916 */ 1917 if (i > max_idx) 1918 continue; 1919 1920 if (segments != 0) { 1921 WEIGHT_SET_COUNT(weight, segments); 1922 WEIGHT_SET_INDEX(weight, i); 1923 WEIGHT_SET_ACTIVE(weight, 0); 1924 break; 1925 } 1926 } 1927 return (weight); 1928 } 1929 1930 /* 1931 * Calculate the weight based on the on-disk histogram. This should only 1932 * be called after a sync pass has completely finished since the on-disk 1933 * information is updated in metaslab_sync(). 1934 */ 1935 static uint64_t 1936 metaslab_weight_from_spacemap(metaslab_t *msp) 1937 { 1938 uint64_t weight = 0; 1939 1940 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 1941 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { 1942 WEIGHT_SET_COUNT(weight, 1943 msp->ms_sm->sm_phys->smp_histogram[i]); 1944 WEIGHT_SET_INDEX(weight, i + 1945 msp->ms_sm->sm_shift); 1946 WEIGHT_SET_ACTIVE(weight, 0); 1947 break; 1948 } 1949 } 1950 return (weight); 1951 } 1952 1953 /* 1954 * Compute a segment-based weight for the specified metaslab. The weight 1955 * is determined by the highest bucket in the histogram.
The information 1956 * for the highest bucket is encoded into the weight value. 1957 */ 1958 static uint64_t 1959 metaslab_segment_weight(metaslab_t *msp) 1960 { 1961 metaslab_group_t *mg = msp->ms_group; 1962 uint64_t weight = 0; 1963 uint8_t shift = mg->mg_vd->vdev_ashift; 1964 1965 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1966 1967 /* 1968 * The metaslab is completely free. 1969 */ 1970 if (space_map_allocated(msp->ms_sm) == 0) { 1971 int idx = highbit64(msp->ms_size) - 1; 1972 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1973 1974 if (idx < max_idx) { 1975 WEIGHT_SET_COUNT(weight, 1ULL); 1976 WEIGHT_SET_INDEX(weight, idx); 1977 } else { 1978 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 1979 WEIGHT_SET_INDEX(weight, max_idx); 1980 } 1981 WEIGHT_SET_ACTIVE(weight, 0); 1982 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 1983 1984 return (weight); 1985 } 1986 1987 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 1988 1989 /* 1990 * If the metaslab is fully allocated then just make the weight 0. 1991 */ 1992 if (space_map_allocated(msp->ms_sm) == msp->ms_size) 1993 return (0); 1994 /* 1995 * If the metaslab is already loaded, then use the range tree to 1996 * determine the weight. Otherwise, we rely on the space map information 1997 * to generate the weight. 1998 */ 1999 if (msp->ms_loaded) { 2000 weight = metaslab_weight_from_range_tree(msp); 2001 } else { 2002 weight = metaslab_weight_from_spacemap(msp); 2003 } 2004 2005 /* 2006 * If the metaslab was active the last time we calculated its weight 2007 * then keep it active. We want to consume the entire region that 2008 * is associated with this weight. 2009 */ 2010 if (msp->ms_activation_weight != 0 && weight != 0) 2011 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 2012 return (weight); 2013 } 2014 2015 /* 2016 * Determine if we should attempt to allocate from this metaslab. If the 2017 * metaslab has a maximum size then we can quickly determine if the desired 2018 * allocation size can be satisfied. Otherwise, if we're using segment-based 2019 * weighting then we can determine the maximum allocation that this metaslab 2020 * can accommodate based on the index encoded in the weight. If we're using 2021 * space-based weights then rely on the entire weight (excluding the weight 2022 * type bit). 2023 */ 2024 boolean_t 2025 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 2026 { 2027 boolean_t should_allocate; 2028 2029 if (msp->ms_max_size != 0) 2030 return (msp->ms_max_size >= asize); 2031 2032 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2033 /* 2034 * The metaslab segment weight indicates segments in the 2035 * range [2^i, 2^(i+1)), where i is the index in the weight. 2036 * Since the asize might be in the middle of the range, we 2037 * should attempt the allocation if asize < 2^(i+1). 2038 */ 2039 should_allocate = (asize < 2040 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2041 } else { 2042 should_allocate = (asize <= 2043 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2044 } 2045 return (should_allocate); 2046 } 2047 2048 static uint64_t 2049 metaslab_weight(metaslab_t *msp) 2050 { 2051 vdev_t *vd = msp->ms_group->mg_vd; 2052 spa_t *spa = vd->vdev_spa; 2053 uint64_t weight; 2054 2055 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2056 2057 /* 2058 * If this vdev is in the process of being removed, there is nothing 2059 * for us to do here. 
2060 */ 2061 if (vd->vdev_removing) 2062 return (0); 2063 2064 metaslab_set_fragmentation(msp); 2065 2066 /* 2067 * Update the maximum size if the metaslab is loaded. This will 2068 * ensure that we get an accurate maximum size if newly freed space 2069 * has been added back into the free tree. 2070 */ 2071 if (msp->ms_loaded) 2072 msp->ms_max_size = metaslab_block_maxsize(msp); 2073 2074 /* 2075 * Segment-based weighting requires space map histogram support. 2076 */ 2077 if (zfs_metaslab_segment_weight_enabled && 2078 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2079 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2080 sizeof (space_map_phys_t))) { 2081 weight = metaslab_segment_weight(msp); 2082 } else { 2083 weight = metaslab_space_weight(msp); 2084 } 2085 return (weight); 2086 } 2087 2088 static int 2089 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2090 int allocator, uint64_t activation_weight) 2091 { 2092 /* 2093 * If we're activating for the claim code, we don't want to actually 2094 * set the metaslab up for a specific allocator. 2095 */ 2096 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2097 return (0); 2098 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2099 mg->mg_primaries : mg->mg_secondaries); 2100 2101 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2102 mutex_enter(&mg->mg_lock); 2103 if (arr[allocator] != NULL) { 2104 mutex_exit(&mg->mg_lock); 2105 return (EEXIST); 2106 } 2107 2108 arr[allocator] = msp; 2109 ASSERT3S(msp->ms_allocator, ==, -1); 2110 msp->ms_allocator = allocator; 2111 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2112 mutex_exit(&mg->mg_lock); 2113 2114 return (0); 2115 } 2116 2117 static int 2118 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2119 { 2120 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2121 2122 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2123 int error = metaslab_load(msp); 2124 if (error != 0) { 2125 metaslab_group_sort(msp->ms_group, msp, 0); 2126 return (error); 2127 } 2128 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2129 /* 2130 * The metaslab was activated for another allocator 2131 * while we were waiting, we should reselect. 
2132 */ 2133 return (EBUSY); 2134 } 2135 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2136 allocator, activation_weight)) != 0) { 2137 return (error); 2138 } 2139 2140 msp->ms_activation_weight = msp->ms_weight; 2141 metaslab_group_sort(msp->ms_group, msp, 2142 msp->ms_weight | activation_weight); 2143 } 2144 ASSERT(msp->ms_loaded); 2145 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2146 2147 return (0); 2148 } 2149 2150 static void 2151 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2152 uint64_t weight) 2153 { 2154 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2155 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2156 metaslab_group_sort(mg, msp, weight); 2157 return; 2158 } 2159 2160 mutex_enter(&mg->mg_lock); 2161 ASSERT3P(msp->ms_group, ==, mg); 2162 if (msp->ms_primary) { 2163 ASSERT3U(0, <=, msp->ms_allocator); 2164 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2165 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2166 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2167 mg->mg_primaries[msp->ms_allocator] = NULL; 2168 } else { 2169 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2170 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2171 mg->mg_secondaries[msp->ms_allocator] = NULL; 2172 } 2173 msp->ms_allocator = -1; 2174 metaslab_group_sort_impl(mg, msp, weight); 2175 mutex_exit(&mg->mg_lock); 2176 } 2177 2178 static void 2179 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2180 { 2181 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2182 2183 /* 2184 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2185 * this metaslab again. In that case, it had better be empty, 2186 * or we would be leaving space on the table. 2187 */ 2188 ASSERT(size >= SPA_MINBLOCKSIZE || 2189 range_tree_is_empty(msp->ms_allocatable)); 2190 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2191 2192 msp->ms_activation_weight = 0; 2193 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2194 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2195 } 2196 2197 /* 2198 * Segment-based metaslabs are activated once and remain active until 2199 * we either fail an allocation attempt (similar to space-based metaslabs) 2200 * or have exhausted the free space in zfs_metaslab_switch_threshold 2201 * buckets since the metaslab was activated. This function checks to see 2202 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the 2203 * metaslab and passivates it proactively. This will allow us to select a 2204 * metaslab with a larger contiguous region, if any remain within this 2205 * metaslab group. If we're in sync pass > 1, then we continue using this 2206 * metaslab so that we don't dirty more blocks and cause more sync passes. 2207 */ 2208 void 2209 metaslab_segment_may_passivate(metaslab_t *msp) 2210 { 2211 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2212 2213 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2214 return; 2215 2216 /* 2217 * Since we are in the middle of a sync pass, the most accurate 2218 * information that is accessible to us is the in-core range tree 2219 * histogram; calculate the new weight based on that information.
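 * As a hypothetical example, with zfs_metaslab_switch_threshold set to 2, a metaslab activated while its largest free segments were in bucket index 17 (128K) is passivated below once the largest remaining bucket drops to index 15 or lower.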
2220 */ 2221 uint64_t weight = metaslab_weight_from_range_tree(msp); 2222 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2223 int current_idx = WEIGHT_GET_INDEX(weight); 2224 2225 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2226 metaslab_passivate(msp, weight); 2227 } 2228 2229 static void 2230 metaslab_preload(void *arg) 2231 { 2232 metaslab_t *msp = arg; 2233 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2234 2235 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2236 2237 mutex_enter(&msp->ms_lock); 2238 (void) metaslab_load(msp); 2239 msp->ms_selected_txg = spa_syncing_txg(spa); 2240 mutex_exit(&msp->ms_lock); 2241 } 2242 2243 static void 2244 metaslab_group_preload(metaslab_group_t *mg) 2245 { 2246 spa_t *spa = mg->mg_vd->vdev_spa; 2247 metaslab_t *msp; 2248 avl_tree_t *t = &mg->mg_metaslab_tree; 2249 int m = 0; 2250 2251 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2252 taskq_wait(mg->mg_taskq); 2253 return; 2254 } 2255 2256 mutex_enter(&mg->mg_lock); 2257 2258 /* 2259 * Load the next potential metaslabs 2260 */ 2261 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2262 ASSERT3P(msp->ms_group, ==, mg); 2263 2264 /* 2265 * We preload only the maximum number of metaslabs specified 2266 * by metaslab_preload_limit. If a metaslab is being forced 2267 * to condense then we preload it too. This will ensure 2268 * that force condensing happens in the next txg. 2269 */ 2270 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2271 continue; 2272 } 2273 2274 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2275 msp, TQ_SLEEP) != TASKQID_INVALID); 2276 } 2277 mutex_exit(&mg->mg_lock); 2278 } 2279 2280 /* 2281 * Determine if the space map's on-disk footprint is past our tolerance 2282 * for inefficiency. We would like to use the following criteria to make 2283 * our decision: 2284 * 2285 * 1. The size of the space map object should not dramatically increase as a 2286 * result of writing out the free space range tree. 2287 * 2288 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2289 * times the size than the free space range tree representation 2290 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2291 * 2292 * 3. The on-disk size of the space map should actually decrease. 2293 * 2294 * Unfortunately, we cannot compute the on-disk size of the space map in this 2295 * context because we cannot accurately compute the effects of compression, etc. 2296 * Instead, we apply the heuristic described in the block comment for 2297 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2298 * is greater than a threshold number of blocks. 2299 */ 2300 static boolean_t 2301 metaslab_should_condense(metaslab_t *msp) 2302 { 2303 space_map_t *sm = msp->ms_sm; 2304 vdev_t *vd = msp->ms_group->mg_vd; 2305 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2306 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2307 2308 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2309 ASSERT(msp->ms_loaded); 2310 2311 /* 2312 * Allocations and frees in early passes are generally more space 2313 * efficient (in terms of blocks described in space map entries) 2314 * than the ones in later passes (e.g. we don't compress after 2315 * sync pass 5) and condensing a metaslab multiple times in a txg 2316 * could degrade performance. 2317 * 2318 * Thus we prefer condensing each metaslab at most once every txg at 2319 * the earliest sync pass possible. 
If a metaslab is eligible for 2320 * condensing again after being considered for condensing within the 2321 * same txg, it will hopefully be dirty in the next txg where it will 2322 * be condensed at an earlier pass. 2323 */ 2324 if (msp->ms_condense_checked_txg == current_txg) 2325 return (B_FALSE); 2326 msp->ms_condense_checked_txg = current_txg; 2327 2328 /* 2329 * We always condense metaslabs that are empty and metaslabs for 2330 * which a condense request has been made. 2331 */ 2332 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2333 msp->ms_condense_wanted) 2334 return (B_TRUE); 2335 2336 uint64_t object_size = space_map_length(msp->ms_sm); 2337 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2338 msp->ms_allocatable, SM_NO_VDEVID); 2339 2340 dmu_object_info_t doi; 2341 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2342 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2343 2344 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2345 object_size > zfs_metaslab_condense_block_threshold * record_size); 2346 } 2347 2348 /* 2349 * Condense the on-disk space map representation to its minimized form. 2350 * The minimized form consists of a small number of allocations followed by 2351 * the entries of the free range tree. 2352 */ 2353 static void 2354 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2355 { 2356 range_tree_t *condense_tree; 2357 space_map_t *sm = msp->ms_sm; 2358 2359 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2360 ASSERT(msp->ms_loaded); 2361 2362 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2363 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2364 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2365 msp->ms_group->mg_vd->vdev_spa->spa_name, 2366 space_map_length(msp->ms_sm), 2367 avl_numnodes(&msp->ms_allocatable->rt_root), 2368 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2369 2370 msp->ms_condense_wanted = B_FALSE; 2371 2372 /* 2373 * Create a range tree that is 100% allocated. We remove segments 2374 * that have been freed in this txg, any deferred frees that exist, 2375 * and any allocations in the future. Removing segments should be 2376 * a relatively inexpensive operation since we expect these trees to 2377 * have a small number of nodes. 2378 */ 2379 condense_tree = range_tree_create(NULL, NULL); 2380 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2381 2382 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2383 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2384 2385 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2386 range_tree_walk(msp->ms_defer[t], 2387 range_tree_remove, condense_tree); 2388 } 2389 2390 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2391 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2392 range_tree_remove, condense_tree); 2393 } 2394 2395 /* 2396 * We're about to drop the metaslab's lock thus allowing 2397 * other consumers to change its content. Set the 2398 * metaslab's ms_condensing flag to ensure that 2399 * allocations on this metaslab do not occur while we're 2400 * in the middle of committing it to disk. This is only critical 2401 * for ms_allocatable as all other range trees use per txg 2402 * views of their content.
2403 */ 2404 msp->ms_condensing = B_TRUE; 2405 2406 mutex_exit(&msp->ms_lock); 2407 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2408 2409 /* 2410 * While we would ideally like to create a space map representation 2411 * that consists only of allocation records, doing so can be 2412 * prohibitively expensive because the in-core free tree can be 2413 * large, and therefore computationally expensive to subtract 2414 * from the condense_tree. Instead we sync out two trees, a cheap 2415 * allocation only tree followed by the in-core free tree. While not 2416 * optimal, this is typically close to optimal, and much cheaper to 2417 * compute. 2418 */ 2419 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2420 range_tree_vacate(condense_tree, NULL, NULL); 2421 range_tree_destroy(condense_tree); 2422 2423 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2424 mutex_enter(&msp->ms_lock); 2425 msp->ms_condensing = B_FALSE; 2426 } 2427 2428 /* 2429 * Write a metaslab to disk in the context of the specified transaction group. 2430 */ 2431 void 2432 metaslab_sync(metaslab_t *msp, uint64_t txg) 2433 { 2434 metaslab_group_t *mg = msp->ms_group; 2435 vdev_t *vd = mg->mg_vd; 2436 spa_t *spa = vd->vdev_spa; 2437 objset_t *mos = spa_meta_objset(spa); 2438 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2439 dmu_tx_t *tx; 2440 uint64_t object = space_map_object(msp->ms_sm); 2441 2442 ASSERT(!vd->vdev_ishole); 2443 2444 /* 2445 * This metaslab has just been added so there's no work to do now. 2446 */ 2447 if (msp->ms_freeing == NULL) { 2448 ASSERT3P(alloctree, ==, NULL); 2449 return; 2450 } 2451 2452 ASSERT3P(alloctree, !=, NULL); 2453 ASSERT3P(msp->ms_freeing, !=, NULL); 2454 ASSERT3P(msp->ms_freed, !=, NULL); 2455 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2456 2457 /* 2458 * Normally, we don't want to process a metaslab if there are no 2459 * allocations or frees to perform. However, if the metaslab is being 2460 * forced to condense and it's loaded, we need to let it through. 2461 */ 2462 if (range_tree_is_empty(alloctree) && 2463 range_tree_is_empty(msp->ms_freeing) && 2464 range_tree_is_empty(msp->ms_checkpointing) && 2465 !(msp->ms_loaded && msp->ms_condense_wanted)) 2466 return; 2467 2468 2469 VERIFY(txg <= spa_final_dirty_txg(spa)); 2470 2471 /* 2472 * The only state that can actually be changing concurrently with 2473 * metaslab_sync() is the metaslab's ms_allocatable. No other 2474 * thread can be modifying this txg's alloc, freeing, 2475 * freed, or space_map_phys_t. We drop ms_lock whenever we 2476 * could call into the DMU, because the DMU can call down to us 2477 * (e.g. via zio_free()) at any time. 2478 * 2479 * The spa_vdev_remove_thread() can be reading metaslab state 2480 * concurrently, and it is locked out by the ms_sync_lock. Note 2481 * that the ms_lock is insufficient for this, because it is dropped 2482 * by space_map_write(). 
2483 */ 2484 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2485 2486 if (msp->ms_sm == NULL) { 2487 uint64_t new_object; 2488 2489 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2490 VERIFY3U(new_object, !=, 0); 2491 2492 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2493 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2494 ASSERT(msp->ms_sm != NULL); 2495 } 2496 2497 if (!range_tree_is_empty(msp->ms_checkpointing) && 2498 vd->vdev_checkpoint_sm == NULL) { 2499 ASSERT(spa_has_checkpoint(spa)); 2500 2501 uint64_t new_object = space_map_alloc(mos, 2502 vdev_standard_sm_blksz, tx); 2503 VERIFY3U(new_object, !=, 0); 2504 2505 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2506 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2507 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2508 2509 /* 2510 * We save the space map object as an entry in vdev_top_zap 2511 * so it can be retrieved when the pool is reopened after an 2512 * export or through zdb. 2513 */ 2514 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2515 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2516 sizeof (new_object), 1, &new_object, tx)); 2517 } 2518 2519 mutex_enter(&msp->ms_sync_lock); 2520 mutex_enter(&msp->ms_lock); 2521 2522 /* 2523 * Note: metaslab_condense() clears the space map's histogram. 2524 * Therefore we must verify and remove this histogram before 2525 * condensing. 2526 */ 2527 metaslab_group_histogram_verify(mg); 2528 metaslab_class_histogram_verify(mg->mg_class); 2529 metaslab_group_histogram_remove(mg, msp); 2530 2531 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2532 metaslab_condense(msp, txg, tx); 2533 } else { 2534 mutex_exit(&msp->ms_lock); 2535 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2536 SM_NO_VDEVID, tx); 2537 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2538 SM_NO_VDEVID, tx); 2539 mutex_enter(&msp->ms_lock); 2540 } 2541 2542 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2543 ASSERT(spa_has_checkpoint(spa)); 2544 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2545 2546 /* 2547 * Since we are doing writes to disk and the ms_checkpointing 2548 * tree won't be changing during that time, we drop the 2549 * ms_lock while writing to the checkpoint space map. 2550 */ 2551 mutex_exit(&msp->ms_lock); 2552 space_map_write(vd->vdev_checkpoint_sm, 2553 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2554 mutex_enter(&msp->ms_lock); 2555 space_map_update(vd->vdev_checkpoint_sm); 2556 2557 spa->spa_checkpoint_info.sci_dspace += 2558 range_tree_space(msp->ms_checkpointing); 2559 vd->vdev_stat.vs_checkpoint_space += 2560 range_tree_space(msp->ms_checkpointing); 2561 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2562 -vd->vdev_checkpoint_sm->sm_alloc); 2563 2564 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2565 } 2566 2567 if (msp->ms_loaded) { 2568 /* 2569 * When the space map is loaded, we have an accurate 2570 * histogram in the range tree. This gives us an opportunity 2571 * to bring the space map's histogram up-to-date so we clear 2572 * it first before updating it. 2573 */ 2574 space_map_histogram_clear(msp->ms_sm); 2575 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2576 2577 /* 2578 * Since we've cleared the histogram we need to add back 2579 * any free space that has already been processed, plus 2580 * any deferred space. This allows the on-disk histogram 2581 * to accurately reflect all free space even if some space 2582 * is not yet available for allocation (i.e. deferred). 
2583 */ 2584 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2585 2586 /* 2587 * Add back any deferred free space that has not been 2588 * added back into the in-core free tree yet. This will 2589 * ensure that we don't end up with a space map histogram 2590 * that is completely empty unless the metaslab is fully 2591 * allocated. 2592 */ 2593 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2594 space_map_histogram_add(msp->ms_sm, 2595 msp->ms_defer[t], tx); 2596 } 2597 } 2598 2599 /* 2600 * Always add the free space from this sync pass to the space 2601 * map histogram. We want to make sure that the on-disk histogram 2602 * accounts for all free space. If the space map is not loaded, 2603 * then we will lose some accuracy but will correct it the next 2604 * time we load the space map. 2605 */ 2606 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2607 2608 metaslab_group_histogram_add(mg, msp); 2609 metaslab_group_histogram_verify(mg); 2610 metaslab_class_histogram_verify(mg->mg_class); 2611 2612 /* 2613 * For sync pass 1, we avoid traversing this txg's free range tree 2614 * and instead will just swap the pointers for freeing and 2615 * freed. We can safely do this since the freed_tree is 2616 * guaranteed to be empty on the initial pass. 2617 */ 2618 if (spa_sync_pass(spa) == 1) { 2619 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2620 } else { 2621 range_tree_vacate(msp->ms_freeing, 2622 range_tree_add, msp->ms_freed); 2623 } 2624 range_tree_vacate(alloctree, NULL, NULL); 2625 2626 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2627 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2628 & TXG_MASK])); 2629 ASSERT0(range_tree_space(msp->ms_freeing)); 2630 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2631 2632 mutex_exit(&msp->ms_lock); 2633 2634 if (object != space_map_object(msp->ms_sm)) { 2635 object = space_map_object(msp->ms_sm); 2636 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2637 msp->ms_id, sizeof (uint64_t), &object, tx); 2638 } 2639 mutex_exit(&msp->ms_sync_lock); 2640 dmu_tx_commit(tx); 2641 } 2642 2643 /* 2644 * Called after a transaction group has completely synced to mark 2645 * all of the metaslab's free space as usable. 2646 */ 2647 void 2648 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2649 { 2650 metaslab_group_t *mg = msp->ms_group; 2651 vdev_t *vd = mg->mg_vd; 2652 spa_t *spa = vd->vdev_spa; 2653 range_tree_t **defer_tree; 2654 int64_t alloc_delta, defer_delta; 2655 boolean_t defer_allowed = B_TRUE; 2656 2657 ASSERT(!vd->vdev_ishole); 2658 2659 mutex_enter(&msp->ms_lock); 2660 2661 /* 2662 * If this metaslab is just becoming available, initialize its 2663 * range trees and add its capacity to the vdev. 
2664 */ 2665 if (msp->ms_freed == NULL) { 2666 for (int t = 0; t < TXG_SIZE; t++) { 2667 ASSERT(msp->ms_allocating[t] == NULL); 2668 2669 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2670 } 2671 2672 ASSERT3P(msp->ms_freeing, ==, NULL); 2673 msp->ms_freeing = range_tree_create(NULL, NULL); 2674 2675 ASSERT3P(msp->ms_freed, ==, NULL); 2676 msp->ms_freed = range_tree_create(NULL, NULL); 2677 2678 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2679 ASSERT(msp->ms_defer[t] == NULL); 2680 2681 msp->ms_defer[t] = range_tree_create(NULL, NULL); 2682 } 2683 2684 ASSERT3P(msp->ms_checkpointing, ==, NULL); 2685 msp->ms_checkpointing = range_tree_create(NULL, NULL); 2686 2687 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 2688 } 2689 ASSERT0(range_tree_space(msp->ms_freeing)); 2690 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2691 2692 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2693 2694 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2695 metaslab_class_get_alloc(spa_normal_class(spa)); 2696 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2697 defer_allowed = B_FALSE; 2698 } 2699 2700 defer_delta = 0; 2701 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2702 if (defer_allowed) { 2703 defer_delta = range_tree_space(msp->ms_freed) - 2704 range_tree_space(*defer_tree); 2705 } else { 2706 defer_delta -= range_tree_space(*defer_tree); 2707 } 2708 2709 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 2710 defer_delta, 0); 2711 2712 /* 2713 * If there's a metaslab_load() in progress, wait for it to complete 2714 * so that we have a consistent view of the in-core space map. 2715 */ 2716 metaslab_load_wait(msp); 2717 2718 /* 2719 * Move the frees from the defer_tree back to the free 2720 * range tree (if it's loaded). Swap the freed_tree and 2721 * the defer_tree -- this is safe to do because we've 2722 * just emptied out the defer_tree. 2723 */ 2724 range_tree_vacate(*defer_tree, 2725 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 2726 if (defer_allowed) { 2727 range_tree_swap(&msp->ms_freed, defer_tree); 2728 } else { 2729 range_tree_vacate(msp->ms_freed, 2730 msp->ms_loaded ? range_tree_add : NULL, 2731 msp->ms_allocatable); 2732 } 2733 space_map_update(msp->ms_sm); 2734 2735 msp->ms_deferspace += defer_delta; 2736 ASSERT3S(msp->ms_deferspace, >=, 0); 2737 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2738 if (msp->ms_deferspace != 0) { 2739 /* 2740 * Keep syncing this metaslab until all deferred frees 2741 * are back in circulation. 2742 */ 2743 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2744 } 2745 2746 if (msp->ms_new) { 2747 msp->ms_new = B_FALSE; 2748 mutex_enter(&mg->mg_lock); 2749 mg->mg_ms_ready++; 2750 mutex_exit(&mg->mg_lock); 2751 } 2752 /* 2753 * Calculate the new weights before unloading any metaslabs. 2754 * This will give us the most accurate weighting. 2755 */ 2756 metaslab_group_sort(mg, msp, metaslab_weight(msp) | 2757 (msp->ms_weight & METASLAB_ACTIVE_MASK)); 2758 2759 /* 2760 * If the metaslab is loaded and we've not tried to load or allocate 2761 * from it in 'metaslab_unload_delay' txgs, then unload it. 
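 * For instance (illustrative arithmetic only), with metaslab_unload_delay set to 8 txgs, a metaslab last selected in txg 100 becomes eligible for unloading when syncing txg 109 or later, provided it is not being initialized.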
2762 */ 2763 if (msp->ms_loaded && 2764 msp->ms_initializing == 0 && 2765 msp->ms_selected_txg + metaslab_unload_delay < txg) { 2766 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2767 VERIFY0(range_tree_space( 2768 msp->ms_allocating[(txg + t) & TXG_MASK])); 2769 } 2770 if (msp->ms_allocator != -1) { 2771 metaslab_passivate(msp, msp->ms_weight & 2772 ~METASLAB_ACTIVE_MASK); 2773 } 2774 2775 if (!metaslab_debug_unload) 2776 metaslab_unload(msp); 2777 } 2778 2779 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2780 ASSERT0(range_tree_space(msp->ms_freeing)); 2781 ASSERT0(range_tree_space(msp->ms_freed)); 2782 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2783 2784 mutex_exit(&msp->ms_lock); 2785 } 2786 2787 void 2788 metaslab_sync_reassess(metaslab_group_t *mg) 2789 { 2790 spa_t *spa = mg->mg_class->mc_spa; 2791 2792 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2793 metaslab_group_alloc_update(mg); 2794 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2795 2796 /* 2797 * Preload the next potential metaslabs but only on active 2798 * metaslab groups. We can get into a state where the metaslab 2799 * is no longer active since we dirty metaslabs as we remove a 2800 * a device, thus potentially making the metaslab group eligible 2801 * for preloading. 2802 */ 2803 if (mg->mg_activation_count > 0) { 2804 metaslab_group_preload(mg); 2805 } 2806 spa_config_exit(spa, SCL_ALLOC, FTAG); 2807 } 2808 2809 /* 2810 * When writing a ditto block (i.e. more than one DVA for a given BP) on 2811 * the same vdev as an existing DVA of this BP, then try to allocate it 2812 * on a different metaslab than existing DVAs (i.e. a unique metaslab). 2813 */ 2814 static boolean_t 2815 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 2816 { 2817 uint64_t dva_ms_id; 2818 2819 if (DVA_GET_ASIZE(dva) == 0) 2820 return (B_TRUE); 2821 2822 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2823 return (B_TRUE); 2824 2825 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 2826 2827 return (msp->ms_id != dva_ms_id); 2828 } 2829 2830 /* 2831 * ========================================================================== 2832 * Metaslab allocation tracing facility 2833 * ========================================================================== 2834 */ 2835 kstat_t *metaslab_trace_ksp; 2836 kstat_named_t metaslab_trace_over_limit; 2837 2838 void 2839 metaslab_alloc_trace_init(void) 2840 { 2841 ASSERT(metaslab_alloc_trace_cache == NULL); 2842 metaslab_alloc_trace_cache = kmem_cache_create( 2843 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 2844 0, NULL, NULL, NULL, NULL, NULL, 0); 2845 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 2846 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 2847 if (metaslab_trace_ksp != NULL) { 2848 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 2849 kstat_named_init(&metaslab_trace_over_limit, 2850 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 2851 kstat_install(metaslab_trace_ksp); 2852 } 2853 } 2854 2855 void 2856 metaslab_alloc_trace_fini(void) 2857 { 2858 if (metaslab_trace_ksp != NULL) { 2859 kstat_delete(metaslab_trace_ksp); 2860 metaslab_trace_ksp = NULL; 2861 } 2862 kmem_cache_destroy(metaslab_alloc_trace_cache); 2863 metaslab_alloc_trace_cache = NULL; 2864 } 2865 2866 /* 2867 * Add an allocation trace element to the allocation tracing list. 
2868 */ 2869 static void 2870 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 2871 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 2872 int allocator) 2873 { 2874 if (!metaslab_trace_enabled) 2875 return; 2876 2877 /* 2878 * When the tracing list reaches its maximum we remove 2879 * the second element in the list before adding a new one. 2880 * By removing the second element we preserve the original 2881 * entry as a clue to what allocations steps have already been 2882 * performed. 2883 */ 2884 if (zal->zal_size == metaslab_trace_max_entries) { 2885 metaslab_alloc_trace_t *mat_next; 2886 #ifdef DEBUG 2887 panic("too many entries in allocation list"); 2888 #endif 2889 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 2890 zal->zal_size--; 2891 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 2892 list_remove(&zal->zal_list, mat_next); 2893 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 2894 } 2895 2896 metaslab_alloc_trace_t *mat = 2897 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2898 list_link_init(&mat->mat_list_node); 2899 mat->mat_mg = mg; 2900 mat->mat_msp = msp; 2901 mat->mat_size = psize; 2902 mat->mat_dva_id = dva_id; 2903 mat->mat_offset = offset; 2904 mat->mat_weight = 0; 2905 mat->mat_allocator = allocator; 2906 2907 if (msp != NULL) 2908 mat->mat_weight = msp->ms_weight; 2909 2910 /* 2911 * The list is part of the zio so locking is not required. Only 2912 * a single thread will perform allocations for a given zio. 2913 */ 2914 list_insert_tail(&zal->zal_list, mat); 2915 zal->zal_size++; 2916 2917 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 2918 } 2919 2920 void 2921 metaslab_trace_init(zio_alloc_list_t *zal) 2922 { 2923 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 2924 offsetof(metaslab_alloc_trace_t, mat_list_node)); 2925 zal->zal_size = 0; 2926 } 2927 2928 void 2929 metaslab_trace_fini(zio_alloc_list_t *zal) 2930 { 2931 metaslab_alloc_trace_t *mat; 2932 2933 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 2934 kmem_cache_free(metaslab_alloc_trace_cache, mat); 2935 list_destroy(&zal->zal_list); 2936 zal->zal_size = 0; 2937 } 2938 2939 /* 2940 * ========================================================================== 2941 * Metaslab block operations 2942 * ========================================================================== 2943 */ 2944 2945 static void 2946 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 2947 int allocator) 2948 { 2949 if (!(flags & METASLAB_ASYNC_ALLOC) || 2950 (flags & METASLAB_DONT_THROTTLE)) 2951 return; 2952 2953 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2954 if (!mg->mg_class->mc_alloc_throttle_enabled) 2955 return; 2956 2957 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 2958 } 2959 2960 static void 2961 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 2962 { 2963 uint64_t max = mg->mg_max_alloc_queue_depth; 2964 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2965 while (cur < max) { 2966 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 2967 cur, cur + 1) == cur) { 2968 atomic_inc_64( 2969 &mg->mg_class->mc_alloc_max_slots[allocator]); 2970 return; 2971 } 2972 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2973 } 2974 } 2975 2976 void 2977 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 2978 int allocator, boolean_t io_complete) 2979 { 2980 if (!(flags & METASLAB_ASYNC_ALLOC) || 
2981 (flags & METASLAB_DONT_THROTTLE)) 2982 return; 2983 2984 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2985 if (!mg->mg_class->mc_alloc_throttle_enabled) 2986 return; 2987 2988 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 2989 if (io_complete) 2990 metaslab_group_increment_qdepth(mg, allocator); 2991 } 2992 2993 void 2994 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 2995 int allocator) 2996 { 2997 #ifdef ZFS_DEBUG 2998 const dva_t *dva = bp->blk_dva; 2999 int ndvas = BP_GET_NDVAS(bp); 3000 3001 for (int d = 0; d < ndvas; d++) { 3002 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 3003 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3004 VERIFY(zfs_refcount_not_held( 3005 &mg->mg_alloc_queue_depth[allocator], tag)); 3006 } 3007 #endif 3008 } 3009 3010 static uint64_t 3011 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 3012 { 3013 uint64_t start; 3014 range_tree_t *rt = msp->ms_allocatable; 3015 metaslab_class_t *mc = msp->ms_group->mg_class; 3016 3017 VERIFY(!msp->ms_condensing); 3018 VERIFY0(msp->ms_initializing); 3019 3020 start = mc->mc_ops->msop_alloc(msp, size); 3021 if (start != -1ULL) { 3022 metaslab_group_t *mg = msp->ms_group; 3023 vdev_t *vd = mg->mg_vd; 3024 3025 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 3026 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3027 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 3028 range_tree_remove(rt, start, size); 3029 3030 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3031 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 3032 3033 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 3034 3035 /* Track the last successful allocation */ 3036 msp->ms_alloc_txg = txg; 3037 metaslab_verify_space(msp, txg); 3038 } 3039 3040 /* 3041 * Now that we've attempted the allocation we need to update the 3042 * metaslab's maximum block size since it may have changed. 3043 */ 3044 msp->ms_max_size = metaslab_block_maxsize(msp); 3045 return (start); 3046 } 3047 3048 /* 3049 * Find the metaslab with the highest weight that is less than what we've 3050 * already tried. In the common case, this means that we will examine each 3051 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3052 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3053 * activated by another thread, and we fail to allocate from the metaslab we 3054 * have selected, we may not try the newly-activated metaslab, and instead 3055 * activate another metaslab. This is not optimal, but generally does not cause 3056 * any problems (a possible exception being if every metaslab is completely full 3057 * except for the newly-activated metaslab which we fail to examine).
3058 */ 3059 static metaslab_t * 3060 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3061 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 3062 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3063 { 3064 avl_index_t idx; 3065 avl_tree_t *t = &mg->mg_metaslab_tree; 3066 metaslab_t *msp = avl_find(t, search, &idx); 3067 if (msp == NULL) 3068 msp = avl_nearest(t, idx, AVL_AFTER); 3069 3070 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3071 int i; 3072 if (!metaslab_should_allocate(msp, asize)) { 3073 metaslab_trace_add(zal, mg, msp, asize, d, 3074 TRACE_TOO_SMALL, allocator); 3075 continue; 3076 } 3077 3078 /* 3079 * If the selected metaslab is condensing or being 3080 * initialized, skip it. 3081 */ 3082 if (msp->ms_condensing || msp->ms_initializing > 0) 3083 continue; 3084 3085 *was_active = msp->ms_allocator != -1; 3086 /* 3087 * If we're activating as primary, this is our first allocation 3088 * from this disk, so we don't need to check how close we are. 3089 * If the metaslab under consideration was already active, 3090 * we're getting desperate enough to steal another allocator's 3091 * metaslab, so we still don't care about distances. 3092 */ 3093 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3094 break; 3095 3096 for (i = 0; i < d; i++) { 3097 if (want_unique && 3098 !metaslab_is_unique(msp, &dva[i])) 3099 break; /* try another metaslab */ 3100 } 3101 if (i == d) 3102 break; 3103 } 3104 3105 if (msp != NULL) { 3106 search->ms_weight = msp->ms_weight; 3107 search->ms_start = msp->ms_start + 1; 3108 search->ms_allocator = msp->ms_allocator; 3109 search->ms_primary = msp->ms_primary; 3110 } 3111 return (msp); 3112 } 3113 3114 /* ARGSUSED */ 3115 static uint64_t 3116 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3117 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3118 int d, int allocator) 3119 { 3120 metaslab_t *msp = NULL; 3121 uint64_t offset = -1ULL; 3122 uint64_t activation_weight; 3123 3124 activation_weight = METASLAB_WEIGHT_PRIMARY; 3125 for (int i = 0; i < d; i++) { 3126 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3127 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3128 activation_weight = METASLAB_WEIGHT_SECONDARY; 3129 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3130 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3131 activation_weight = METASLAB_WEIGHT_CLAIM; 3132 break; 3133 } 3134 } 3135 3136 /* 3137 * If we don't have enough metaslabs active to fill the entire array, we 3138 * just use the 0th slot. 3139 */ 3140 if (mg->mg_ms_ready < mg->mg_allocators * 3) 3141 allocator = 0; 3142 3143 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3144 3145 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3146 search->ms_weight = UINT64_MAX; 3147 search->ms_start = 0; 3148 /* 3149 * At the end of the metaslab tree are the already-active metaslabs, 3150 * first the primaries, then the secondaries. When we resume searching 3151 * through the tree, we need to consider ms_allocator and ms_primary so 3152 * we start in the location right after where we left off, and don't 3153 * accidentally loop forever considering the same metaslabs. 
3154 */ 3155 search->ms_allocator = -1; 3156 search->ms_primary = B_TRUE; 3157 for (;;) { 3158 boolean_t was_active = B_FALSE; 3159 3160 mutex_enter(&mg->mg_lock); 3161 3162 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3163 mg->mg_primaries[allocator] != NULL) { 3164 msp = mg->mg_primaries[allocator]; 3165 was_active = B_TRUE; 3166 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3167 mg->mg_secondaries[allocator] != NULL) { 3168 msp = mg->mg_secondaries[allocator]; 3169 was_active = B_TRUE; 3170 } else { 3171 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3172 want_unique, asize, allocator, zal, search, 3173 &was_active); 3174 } 3175 3176 mutex_exit(&mg->mg_lock); 3177 if (msp == NULL) { 3178 kmem_free(search, sizeof (*search)); 3179 return (-1ULL); 3180 } 3181 3182 mutex_enter(&msp->ms_lock); 3183 /* 3184 * Ensure that the metaslab we have selected is still 3185 * capable of handling our request. It's possible that 3186 * another thread may have changed the weight while we 3187 * were blocked on the metaslab lock. We check the 3188 * active status first to see if we need to reselect 3189 * a new metaslab. 3190 */ 3191 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3192 mutex_exit(&msp->ms_lock); 3193 continue; 3194 } 3195 3196 /* 3197 * If the metaslab is freshly activated for an allocator that 3198 * isn't the one we're allocating from, or if it's a primary and 3199 * we're seeking a secondary (or vice versa), we go back and 3200 * select a new metaslab. 3201 */ 3202 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 3203 (msp->ms_allocator != -1) && 3204 (msp->ms_allocator != allocator || ((activation_weight == 3205 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 3206 mutex_exit(&msp->ms_lock); 3207 continue; 3208 } 3209 3210 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 3211 activation_weight != METASLAB_WEIGHT_CLAIM) { 3212 metaslab_passivate(msp, msp->ms_weight & 3213 ~METASLAB_WEIGHT_CLAIM); 3214 mutex_exit(&msp->ms_lock); 3215 continue; 3216 } 3217 3218 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 3219 mutex_exit(&msp->ms_lock); 3220 continue; 3221 } 3222 3223 msp->ms_selected_txg = txg; 3224 3225 /* 3226 * Now that we have the lock, recheck to see if we should 3227 * continue to use this metaslab for this allocation. The 3228 * metaslab is now loaded so metaslab_should_allocate() can 3229 * accurately determine if the allocation attempt should 3230 * proceed. 3231 */ 3232 if (!metaslab_should_allocate(msp, asize)) { 3233 /* Passivate this metaslab and select a new one. */ 3234 metaslab_trace_add(zal, mg, msp, asize, d, 3235 TRACE_TOO_SMALL, allocator); 3236 goto next; 3237 } 3238 3239 /* 3240 * If this metaslab is currently condensing then pick again as 3241 * we can't manipulate this metaslab until it's committed 3242 * to disk. If this metaslab is being initialized, we shouldn't 3243 * allocate from it since the allocated region might be 3244 * overwritten after allocation.
3245 */ 3246 if (msp->ms_condensing) { 3247 metaslab_trace_add(zal, mg, msp, asize, d, 3248 TRACE_CONDENSING, allocator); 3249 metaslab_passivate(msp, msp->ms_weight & 3250 ~METASLAB_ACTIVE_MASK); 3251 mutex_exit(&msp->ms_lock); 3252 continue; 3253 } else if (msp->ms_initializing > 0) { 3254 metaslab_trace_add(zal, mg, msp, asize, d, 3255 TRACE_INITIALIZING, allocator); 3256 metaslab_passivate(msp, msp->ms_weight & 3257 ~METASLAB_ACTIVE_MASK); 3258 mutex_exit(&msp->ms_lock); 3259 continue; 3260 } 3261 3262 offset = metaslab_block_alloc(msp, asize, txg); 3263 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 3264 3265 if (offset != -1ULL) { 3266 /* Proactively passivate the metaslab, if needed */ 3267 metaslab_segment_may_passivate(msp); 3268 break; 3269 } 3270 next: 3271 ASSERT(msp->ms_loaded); 3272 3273 /* 3274 * We were unable to allocate from this metaslab so determine 3275 * a new weight for this metaslab. Now that we have loaded 3276 * the metaslab we can provide a better hint to the metaslab 3277 * selector. 3278 * 3279 * For space-based metaslabs, we use the maximum block size. 3280 * This information is only available when the metaslab 3281 * is loaded and is more accurate than the generic free 3282 * space weight that was calculated by metaslab_weight(). 3283 * This information allows us to quickly compare the maximum 3284 * available allocation in the metaslab to the allocation 3285 * size being requested. 3286 * 3287 * For segment-based metaslabs, determine the new weight 3288 * based on the highest bucket in the range tree. We 3289 * explicitly use the loaded segment weight (i.e. the range 3290 * tree histogram) since it contains the space that is 3291 * currently available for allocation and is accurate 3292 * even within a sync pass. 3293 */ 3294 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3295 uint64_t weight = metaslab_block_maxsize(msp); 3296 WEIGHT_SET_SPACEBASED(weight); 3297 metaslab_passivate(msp, weight); 3298 } else { 3299 metaslab_passivate(msp, 3300 metaslab_weight_from_range_tree(msp)); 3301 } 3302 3303 /* 3304 * We have just failed an allocation attempt, check 3305 * that metaslab_should_allocate() agrees. Otherwise, 3306 * we may end up in an infinite loop retrying the same 3307 * metaslab. 3308 */ 3309 ASSERT(!metaslab_should_allocate(msp, asize)); 3310 3311 mutex_exit(&msp->ms_lock); 3312 } 3313 mutex_exit(&msp->ms_lock); 3314 kmem_free(search, sizeof (*search)); 3315 return (offset); 3316 } 3317 3318 static uint64_t 3319 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3320 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3321 int d, int allocator) 3322 { 3323 uint64_t offset; 3324 ASSERT(mg->mg_initialized); 3325 3326 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 3327 dva, d, allocator); 3328 3329 mutex_enter(&mg->mg_lock); 3330 if (offset == -1ULL) { 3331 mg->mg_failed_allocations++; 3332 metaslab_trace_add(zal, mg, NULL, asize, d, 3333 TRACE_GROUP_FAILURE, allocator); 3334 if (asize == SPA_GANGBLOCKSIZE) { 3335 /* 3336 * This metaslab group was unable to allocate 3337 * the minimum gang block size so it must be out of 3338 * space. We must notify the allocation throttle 3339 * to start skipping allocation attempts to this 3340 * metaslab group until more space becomes available. 3341 * Note: this failure cannot be caused by the 3342 * allocation throttle since the allocation throttle 3343 * is only responsible for skipping devices and 3344 * not failing block allocations. 
3345 */ 3346 mg->mg_no_free_space = B_TRUE; 3347 } 3348 } 3349 mg->mg_allocations++; 3350 mutex_exit(&mg->mg_lock); 3351 return (offset); 3352 } 3353 3354 /* 3355 * Allocate a block for the specified i/o. 3356 */ 3357 int 3358 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3359 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3360 zio_alloc_list_t *zal, int allocator) 3361 { 3362 metaslab_group_t *mg, *rotor; 3363 vdev_t *vd; 3364 boolean_t try_hard = B_FALSE; 3365 3366 ASSERT(!DVA_IS_VALID(&dva[d])); 3367 3368 /* 3369 * For testing, make some blocks above a certain size be gang blocks. 3370 * This will also test spilling from special to normal. 3371 */ 3372 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3373 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3374 allocator); 3375 return (SET_ERROR(ENOSPC)); 3376 } 3377 3378 /* 3379 * Start at the rotor and loop through all mgs until we find something. 3380 * Note that there's no locking on mc_rotor or mc_aliquot because 3381 * nothing actually breaks if we miss a few updates -- we just won't 3382 * allocate quite as evenly. It all balances out over time. 3383 * 3384 * If we are doing ditto or log blocks, try to spread them across 3385 * consecutive vdevs. If we're forced to reuse a vdev before we've 3386 * allocated all of our ditto blocks, then try and spread them out on 3387 * that vdev as much as possible. If it turns out to not be possible, 3388 * gradually lower our standards until anything becomes acceptable. 3389 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3390 * gives us hope of containing our fault domains to something we're 3391 * able to reason about. Otherwise, any two top-level vdev failures 3392 * will guarantee the loss of data. With consecutive allocation, 3393 * only two adjacent top-level vdev failures will result in data loss. 3394 * 3395 * If we are doing gang blocks (hintdva is non-NULL), try to keep 3396 * ourselves on the same vdev as our gang block header. That 3397 * way, we can hope for locality in vdev_cache, plus it makes our 3398 * fault domains something tractable. 3399 */ 3400 if (hintdva) { 3401 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3402 3403 /* 3404 * It's possible the vdev we're using as the hint no 3405 * longer exists or its mg has been closed (e.g. by 3406 * device removal). Consult the rotor when 3407 * all else fails. 3408 */ 3409 if (vd != NULL && vd->vdev_mg != NULL) { 3410 mg = vd->vdev_mg; 3411 3412 if (flags & METASLAB_HINTBP_AVOID && 3413 mg->mg_next != NULL) 3414 mg = mg->mg_next; 3415 } else { 3416 mg = mc->mc_rotor; 3417 } 3418 } else if (d != 0) { 3419 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3420 mg = vd->vdev_mg->mg_next; 3421 } else { 3422 ASSERT(mc->mc_rotor != NULL); 3423 mg = mc->mc_rotor; 3424 } 3425 3426 /* 3427 * If the hint put us into the wrong metaslab class, or into a 3428 * metaslab group that has been passivated, just follow the rotor. 3429 */ 3430 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3431 mg = mc->mc_rotor; 3432 3433 rotor = mg; 3434 top: 3435 do { 3436 boolean_t allocatable; 3437 3438 ASSERT(mg->mg_activation_count == 1); 3439 vd = mg->mg_vd; 3440 3441 /* 3442 * Don't allocate from faulted devices. 
3443 */ 3444 if (try_hard) { 3445 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3446 allocatable = vdev_allocatable(vd); 3447 spa_config_exit(spa, SCL_ZIO, FTAG); 3448 } else { 3449 allocatable = vdev_allocatable(vd); 3450 } 3451 3452 /* 3453 * Determine if the selected metaslab group is eligible 3454 * for allocations. If we're ganging then don't allow 3455 * this metaslab group to skip allocations since that would 3456 * inadvertently return ENOSPC and suspend the pool 3457 * even though space is still available. 3458 */ 3459 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3460 allocatable = metaslab_group_allocatable(mg, rotor, 3461 psize, allocator); 3462 } 3463 3464 if (!allocatable) { 3465 metaslab_trace_add(zal, mg, NULL, psize, d, 3466 TRACE_NOT_ALLOCATABLE, allocator); 3467 goto next; 3468 } 3469 3470 ASSERT(mg->mg_initialized); 3471 3472 /* 3473 * Avoid writing single-copy data to a failing, 3474 * non-redundant vdev, unless we've already tried all 3475 * other vdevs. 3476 */ 3477 if ((vd->vdev_stat.vs_write_errors > 0 || 3478 vd->vdev_state < VDEV_STATE_HEALTHY) && 3479 d == 0 && !try_hard && vd->vdev_children == 0) { 3480 metaslab_trace_add(zal, mg, NULL, psize, d, 3481 TRACE_VDEV_ERROR, allocator); 3482 goto next; 3483 } 3484 3485 ASSERT(mg->mg_class == mc); 3486 3487 uint64_t asize = vdev_psize_to_asize(vd, psize); 3488 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3489 3490 /* 3491 * If we don't need to try hard, then require that the 3492 * block be on an different metaslab from any other DVAs 3493 * in this BP (unique=true). If we are trying hard, then 3494 * allow any metaslab to be used (unique=false). 3495 */ 3496 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3497 !try_hard, dva, d, allocator); 3498 3499 if (offset != -1ULL) { 3500 /* 3501 * If we've just selected this metaslab group, 3502 * figure out whether the corresponding vdev is 3503 * over- or under-used relative to the pool, 3504 * and set an allocation bias to even it out. 3505 */ 3506 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3507 vdev_stat_t *vs = &vd->vdev_stat; 3508 int64_t vu, cu; 3509 3510 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3511 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3512 3513 /* 3514 * Calculate how much more or less we should 3515 * try to allocate from this device during 3516 * this iteration around the rotor. 3517 * For example, if a device is 80% full 3518 * and the pool is 20% full then we should 3519 * reduce allocations by 60% on this device. 3520 * 3521 * mg_bias = (20 - 80) * 512K / 100 = -307K 3522 * 3523 * This reduces allocations by 307K for this 3524 * iteration. 3525 */ 3526 mg->mg_bias = ((cu - vu) * 3527 (int64_t)mg->mg_aliquot) / 100; 3528 } else if (!metaslab_bias_enabled) { 3529 mg->mg_bias = 0; 3530 } 3531 3532 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3533 mg->mg_aliquot + mg->mg_bias) { 3534 mc->mc_rotor = mg->mg_next; 3535 mc->mc_aliquot = 0; 3536 } 3537 3538 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3539 DVA_SET_OFFSET(&dva[d], offset); 3540 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3541 DVA_SET_ASIZE(&dva[d], asize); 3542 3543 return (0); 3544 } 3545 next: 3546 mc->mc_rotor = mg->mg_next; 3547 mc->mc_aliquot = 0; 3548 } while ((mg = mg->mg_next) != rotor); 3549 3550 /* 3551 * If we haven't tried hard, do so now. 

	/*
	 * If we haven't tried hard, do so now.
	 */
	if (!try_hard) {
		try_hard = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
	return (SET_ERROR(ENOSPC));
}

void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
    boolean_t checkpoint)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));

	metaslab_check_free_impl(vd, offset, asize);

	mutex_enter(&msp->ms_lock);
	if (range_tree_is_empty(msp->ms_freeing) &&
	    range_tree_is_empty(msp->ms_checkpointing)) {
		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
	}

	if (checkpoint) {
		ASSERT(spa_has_checkpoint(spa));
		range_tree_add(msp->ms_checkpointing, offset, asize);
	} else {
		range_tree_add(msp->ms_freeing, offset, asize);
	}
	mutex_exit(&msp->ms_lock);
}

/* ARGSUSED */
void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	boolean_t *checkpoint = arg;

	ASSERT3P(checkpoint, !=, NULL);

	if (vd->vdev_ops->vdev_op_remap != NULL)
		vdev_indirect_mark_obsolete(vd, offset, size);
	else
		metaslab_free_impl(vd, offset, size, *checkpoint);
}

static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
    boolean_t checkpoint)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
		return;

	if (spa->spa_vdev_removal != NULL &&
	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
	    vdev_is_concrete(vd)) {
		/*
		 * Note: we check if the vdev is concrete because when
		 * we complete the removal, we first change the vdev to be
		 * an indirect vdev (in open context), and then (in syncing
		 * context) clear spa_vdev_removal.
		 */
		free_from_removing_vdev(vd, offset, size);
	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
		vdev_indirect_mark_obsolete(vd, offset, size);
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_free_impl_cb, &checkpoint);
	} else {
		metaslab_free_concrete(vd, offset, size, checkpoint);
	}
}
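
/*
 * State threaded through vdev_op_remap() while spa_remap_blkptr() (below)
 * rewrites a block pointer: rbca_bp is the BP being rewritten, rbca_cb and
 * rbca_cb_arg are the caller's callback and its argument, and rbca_remap_vd
 * and rbca_remap_offset remember the previous (indirect) vdev and offset so
 * the callback can be invoked once per level of indirection.
 */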
typedef struct remap_blkptr_cb_arg {
	blkptr_t *rbca_bp;
	spa_remap_cb_t rbca_cb;
	vdev_t *rbca_remap_vd;
	uint64_t rbca_remap_offset;
	void *rbca_cb_arg;
} remap_blkptr_cb_arg_t;

void
remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	remap_blkptr_cb_arg_t *rbca = arg;
	blkptr_t *bp = rbca->rbca_bp;

	/* We cannot remap split blocks. */
	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
		return;
	ASSERT0(inner_offset);

	if (rbca->rbca_cb != NULL) {
		/*
		 * At this point we know that we are not handling split
		 * blocks, and we invoke the callback on the previous
		 * vdev, which must be indirect.
		 */
		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);

		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);

		/* set up remap_blkptr_cb_arg for the next call */
		rbca->rbca_remap_vd = vd;
		rbca->rbca_remap_offset = offset;
	}

	/*
	 * The phys birth time is that of dva[0].  This ensures that we know
	 * when each dva was written, so that resilver can determine which
	 * blocks need to be scrubbed (i.e. those written during the time
	 * the vdev was offline).  It also ensures that the key used in
	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
	 * we didn't change the phys_birth, a lookup in the ARC for a
	 * remapped BP could find the data that was previously stored at
	 * this vdev + offset.
	 */
	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	    DVA_GET_VDEV(&bp->blk_dva[0]));
	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
}
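
/*
 * For example (illustrative): suppose dva[0] points into indirect vdev A,
 * whose mapping sends the segment to indirect vdev B, which in turn maps
 * to concrete vdev C.  remap_blkptr_cb() is then invoked twice: once with
 * vd == B (running the caller's callback on A's old vdev/offset) and once
 * with vd == C (running it on B's), after which dva[0] refers to the
 * concrete location on C.
 */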

/*
 * If the block pointer contains any indirect DVAs, modify them to refer to
 * concrete DVAs.  Note that this will sometimes not be possible, leaving
 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
 * segments in the mapping (i.e. it is a "split block").
 *
 * If the BP was remapped, calls the callback on the original dva (note the
 * callback can be called multiple times if the original indirect DVA refers
 * to another indirect DVA, etc).
 *
 * Returns TRUE if the BP was remapped.
 */
boolean_t
spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
{
	remap_blkptr_cb_arg_t rbca;

	if (!zfs_remap_blkptr_enable)
		return (B_FALSE);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
		return (B_FALSE);

	/*
	 * Dedup BPs cannot be remapped, because ddt_phys_select() depends
	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
	 */
	if (BP_GET_DEDUP(bp))
		return (B_FALSE);

	/*
	 * Gang blocks cannot be remapped, because
	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
	 * the BP used to read the gang block header (GBH) being the same
	 * as the DVA[0] that we allocated for the GBH.
	 */
	if (BP_IS_GANG(bp))
		return (B_FALSE);

	/*
	 * Embedded BPs have no DVA to remap.
	 */
	if (BP_GET_NDVAS(bp) < 1)
		return (B_FALSE);

	/*
	 * Note: we only remap dva[0].  If we remapped other dvas, we
	 * would no longer know what their phys birth txg is.
	 */
	dva_t *dva = &bp->blk_dva[0];

	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));

	if (vd->vdev_ops->vdev_op_remap == NULL)
		return (B_FALSE);

	rbca.rbca_bp = bp;
	rbca.rbca_cb = callback;
	rbca.rbca_remap_vd = vd;
	rbca.rbca_remap_offset = offset;
	rbca.rbca_cb_arg = arg;

	/*
	 * remap_blkptr_cb() will be called in order for each level of
	 * indirection, until a concrete vdev is reached or a split block
	 * is encountered.  rbca_remap_vd and rbca_remap_offset are updated
	 * within the callback as we go from one indirect vdev to the next
	 * (either concrete or indirect again).
	 */
	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);

	/* Check if the DVA wasn't remapped because it is a split block */
	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Undo the allocation of a DVA which happened in the given transaction group.
 */
void
metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	vdev_t *vd;
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_unalloc_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	ASSERT(!vd->vdev_removing);
	ASSERT(vdev_is_concrete(vd));
	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
	    offset, size);

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
	    msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	range_tree_add(msp->ms_allocatable, offset, size);
	mutex_exit(&msp->ms_lock);
}

/*
 * Free the block represented by the given DVA.
 */
void
metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, vdev);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (DVA_GET_GANG(dva)) {
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
	}

	metaslab_free_impl(vd, offset, size, checkpoint);
}
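
/*
 * Note the distinction between the two paths above: metaslab_unalloc_dva()
 * undoes an allocation made in the given txg by returning the segment
 * directly to ms_allocatable, while metaslab_free_dva() frees a committed
 * block through metaslab_free_impl(), which also handles removing vdevs,
 * indirect mappings, and checkpoint accounting.
 */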

/*
 * Reserve some allocation slots.  The reservation system must be called
 * before we call into the allocator.  If there aren't any available slots
 * then the I/O will be throttled until an I/O completes and its slots are
 * freed up.  The function returns true if it was successful in placing
 * the reservation.
 */
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
    zio_t *zio, int flags)
{
	uint64_t available_slots = 0;
	boolean_t slot_reserved = B_FALSE;
	uint64_t max = mc->mc_alloc_max_slots[allocator];

	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);

	uint64_t reserved_slots =
	    zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
	if (reserved_slots < max)
		available_slots = max - reserved_slots;

	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
	    flags & METASLAB_MUST_RESERVE) {
		/*
		 * We reserve the slots individually so that we can unreserve
		 * them individually when an I/O completes.
		 */
		for (int d = 0; d < slots; d++) {
			reserved_slots =
			    zfs_refcount_add(&mc->mc_alloc_slots[allocator],
			    zio);
		}
		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
		slot_reserved = B_TRUE;
	}

	mutex_exit(&mc->mc_lock);
	return (slot_reserved);
}

void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
    int allocator, zio_t *zio)
{
	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);
	for (int d = 0; d < slots; d++) {
		(void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
		    zio);
	}
	mutex_exit(&mc->mc_lock);
}
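
/*
 * Illustrative sketch of how a caller might pair the two functions above;
 * this is not code from this file, and mc, ndvas, psize, bp, txg, zal, zio
 * and allocator stand in for the caller's own state:
 *
 *	if (metaslab_class_throttle_reserve(mc, ndvas, allocator, zio, 0)) {
 *		error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg,
 *		    NULL, 0, &zal, zio, allocator);
 *		if (error != 0)
 *			metaslab_class_throttle_unreserve(mc, ndvas,
 *			    allocator, zio);
 *	} else {
 *		... throttle: re-issue the zio once slots are freed ...
 *	}
 */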

static int
metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
		return (ENXIO);

	ASSERT3P(vd->vdev_ms, !=, NULL);
	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
	/*
	 * An EBUSY from metaslab_activate() is not fatal; someone else has
	 * already activated the metaslab, which doesn't preclude us from
	 * using it.
	 */
	if (error == EBUSY)
		error = 0;

	if (error == 0 &&
	    !range_tree_contains(msp->ms_allocatable, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	VERIFY(!msp->ms_condensing);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
	    msp->ms_size);
	range_tree_remove(msp->ms_allocatable, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
		    offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

typedef struct metaslab_claim_cb_arg_t {
	uint64_t mcca_txg;
	int mcca_error;
} metaslab_claim_cb_arg_t;

/* ARGSUSED */
static void
metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	metaslab_claim_cb_arg_t *mcca_arg = arg;

	if (mcca_arg->mcca_error == 0) {
		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
		    size, mcca_arg->mcca_txg);
	}
}

int
metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
{
	if (vd->vdev_ops->vdev_op_remap != NULL) {
		metaslab_claim_cb_arg_t arg;

		/*
		 * Only zdb(1M) can claim on indirect vdevs.  This is used
		 * to detect leaks of mapped space (that are not accounted
		 * for in the obsolete counts, spacemap, or bpobj).
		 */
		ASSERT(!spa_writeable(vd->vdev_spa));
		arg.mcca_error = 0;
		arg.mcca_txg = txg;

		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_claim_impl_cb, &arg);

		if (arg.mcca_error == 0) {
			arg.mcca_error = metaslab_claim_concrete(vd,
			    offset, size, txg);
		}
		return (arg.mcca_error);
	} else {
		return (metaslab_claim_concrete(vd, offset, size, txg));
	}
}
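
/*
 * The claim path runs metaslab_claim() -> metaslab_claim_dva() ->
 * metaslab_claim_impl() -> metaslab_claim_concrete().  A txg of 0 means
 * "dry run": verify that the segment is still claimable without actually
 * removing it from ms_allocatable.
 */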

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(DVA_IS_VALID(dva));

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	return (metaslab_claim_impl(vd, offset, size, txg));
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
    zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
	ASSERT3P(zal, !=, NULL);

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags, zal, allocator);
		if (error != 0) {
			for (d--; d >= 0; d--) {
				metaslab_unalloc_dva(spa, &dva[d], txg);
				metaslab_group_alloc_decrement(spa,
				    DVA_GET_VDEV(&dva[d]), zio, flags,
				    allocator, B_FALSE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		} else {
			/*
			 * Update the metaslab group's queue depth
			 * based on the newly allocated dva.
			 */
			metaslab_group_alloc_increment(spa,
			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}
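
/*
 * Note that on failure metaslab_alloc() unwinds any DVAs it has already
 * placed, so the BP is either fully populated with ndvas DVAs or left
 * untouched.  A caller that gets ENOSPC back will typically retry the
 * write as a gang block (hence the forced-ganging test hook in
 * metaslab_alloc_dva(), which returns ENOSPC on purpose).
 */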

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	/*
	 * If we have a checkpoint for the pool we need to make sure that
	 * the blocks that we free that are part of the checkpoint won't be
	 * reused until the checkpoint is discarded or we revert to it.
	 *
	 * The checkpoint flag is passed down the metaslab_free code path
	 * and is set whenever we want to add a block to the checkpoint's
	 * accounting.  That is, we "checkpoint" blocks that existed at the
	 * time the checkpoint was created and are therefore referenced by
	 * the checkpointed uberblock.
	 *
	 * Note that we don't checkpoint any blocks if the current
	 * syncing txg <= spa_checkpoint_txg.  We want these frees to sync
	 * normally as they will be referenced by the checkpointed uberblock.
	 */
	boolean_t checkpoint = B_FALSE;
	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
		/*
		 * At this point, if the block is part of the checkpoint
		 * there is no way it was created in the current txg.
		 */
		ASSERT(!now);
		ASSERT3U(spa_syncing_txg(spa), ==, txg);
		checkpoint = B_TRUE;
	}

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++) {
		if (now) {
			metaslab_unalloc_dva(spa, &dva[d], txg);
		} else {
			ASSERT3U(txg, ==, spa_syncing_txg(spa));
			metaslab_free_dva(spa, &dva[d], checkpoint);
		}
	}

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_claim_dva(spa, &dva[d], txg);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}

/* ARGSUSED */
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	metaslab_check_free_impl(vd, offset, size);
}

static void
metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	if (vd->vdev_ops->vdev_op_remap != NULL) {
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_check_free_impl_cb, NULL);
		return;
	}

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	if (msp->ms_loaded)
		range_tree_verify(msp->ms_allocatable, offset, size);

	range_tree_verify(msp->ms_freeing, offset, size);
	range_tree_verify(msp->ms_checkpointing, offset, size);
	range_tree_verify(msp->ms_freed, offset, size);
	for (int j = 0; j < TXG_DEFER_SIZE; j++)
		range_tree_verify(msp->ms_defer[j], offset, size);
	mutex_exit(&msp->ms_lock);
}

void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);

		if (DVA_GET_GANG(&bp->blk_dva[i]))
			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

		ASSERT3P(vd, !=, NULL);

		metaslab_check_free_impl(vd, offset, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
}