/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Since we can touch multiple metaslabs (and their respective space maps)
 * with each transaction group, we benefit from having a smaller space map
 * block size since it allows us to issue more I/O operations scattered
 * around the disk.
 */
int zfs_metaslab_sm_blksz = (1 << 12);

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

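/*
 * Illustration of the two tunables above (values are the defaults): a space
 * map whose on-disk representation occupies, say, 10 blocks (40K at
 * zfs_metaslab_sm_blksz = 4K) is a condense candidate only when its in-core
 * representation could be written in roughly half that space or less
 * (zfs_condense_pct = 200), and only because the on-disk map already
 * exceeds zfs_metaslab_condense_block_threshold (4) blocks.
 */
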
/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);

kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < spa->spa_alloc_count; i++)
		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (zfs_refcount_t));
	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (uint64_t));
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by efi system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	int sort1 = 0;
	int sort2 = 0;
	if (m1->ms_allocator != -1 && m1->ms_primary)
		sort1 = 1;
	else if (m1->ms_allocator != -1 && !m1->ms_primary)
		sort1 = 2;
	if (m2->ms_allocator != -1 && m2->ms_primary)
		sort2 = 1;
	else if (m2->ms_allocator != -1 && !m2->ms_primary)
		sort2 = 2;

	/*
	 * Sort inactive metaslabs first, then primaries, then secondaries. When
	 * selecting a metaslab to allocate from, an allocator first tries its
	 * primary, then secondary active metaslab. If it doesn't have active
	 * metaslabs, or can't allocate from them, it searches for an inactive
	 * metaslab to activate. If it can't find a suitable one, it will steal
	 * a primary or secondary metaslab from another allocator.
	 */
	if (sort1 < sort2)
		return (-1);
	if (sort1 > sort2)
		return (1);

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_start < m2->ms_start)
		return (-1);
	if (m1->ms_start > m2->ms_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

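/*
 * The net effect of metaslab_compare() above: given an inactive metaslab,
 * a primary, and a secondary in the same group, avl_first() of the group
 * tree returns the inactive one regardless of weight; within each category,
 * higher-weight metaslabs sort earlier, and ms_start breaks ties between
 * equal weights.
 */
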
/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an allocated
	 * space map. Calling this in non-syncing context does not
	 * provide a consistent view of the metaslab since we're performing
	 * allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	    space_map_alloc_delta(msp->ms_sm);

	/*
	 * Account for future allocations since we would have already
	 * deducted that space from the ms_freetree.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocated +=
		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
	}

	msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
	    msp->ms_deferspace + range_tree_space(msp->ms_freed);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

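/*
 * In other words, the invariant verified above (for a loaded metaslab in
 * syncing context) is roughly:
 *
 *	ms_size - allocated-on-disk - pending-alloc-delta ==
 *	    allocatable + allocating (all open txgs) + deferred + freed
 */
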
/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the group's free capacity is
 * above the zfs_mg_noalloc_threshold and its fragmentation value is not
 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	mg->mg_allocators = allocators;

	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < allocators; i++) {
		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
	kmem_free(mg->mg_secondaries, mg->mg_allocators *
	    sizeof (metaslab_t *));
	mutex_destroy(&mg->mg_lock);
	mutex_destroy(&mg->mg_ms_initialize_lock);
	cv_destroy(&mg->mg_ms_initialize_cv);

	for (int i = 0; i < mg->mg_allocators; i++) {
		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}
	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
	    sizeof (zfs_refcount_t));
	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
	    sizeof (uint64_t));

	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

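/*
 * Note: the mg_prev/mg_next pointers maintained above form a circular,
 * doubly-linked list of the class's activated metaslab groups, with
 * mc_rotor pointing at the most recently activated group; the allocator
 * rotates around this list when spreading allocations across top-level
 * vdevs.
 */
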
/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);
	for (int i = 0; i < mg->mg_allocators; i++) {
		metaslab_t *msp = mg->mg_primaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
		msp = mg->mg_secondaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp,
    uint64_t weight)
{
	ASSERT(MUTEX_HELD(&mg->mg_lock));
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	metaslab_group_sort_impl(mg, msp, weight);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
 * this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

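/*
 * For example, a group whose valid metaslabs report 10%, 20%, and 30%
 * fragmentation averages to 20%; but if fewer than half of the group's
 * metaslabs have a valid metric, the group reports ZFS_FRAG_INVALID.
 */
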
/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize, int allocator)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = zfs_refcount_count(
		    &mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];

			qdepth = zfs_refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	if (rs_size1 < rs_size2)
		return (-1);
	if (rs_size1 > rs_size2)
		return (1);

	if (r1->rs_start < r2->rs_start)
		return (-1);

	if (r1->rs_start > r2->rs_start)
		return (1);

	return (0);
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT(msp->ms_allocatable == NULL);

	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));

	avl_destroy(&msp->ms_allocatable_by_size);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_allocatable_by_size, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_allocatable_by_size, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

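/*
 * Note: each free segment is therefore indexed twice -- by offset in the
 * range tree's rt_root and by length in ms_allocatable_by_size -- with both
 * AVL trees sharing the same range_seg_t nodes (the size tree links through
 * rs_pp_node). First-fit style allocators below walk the offset-ordered
 * tree, while best-fit style allocators walk the size-ordered tree.
 */
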
/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = start;
	rsearch.rs_end = start + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL) {
		rs = avl_nearest(t, where, AVL_AFTER);
	}

	return (rs);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs = metaslab_block_find(t, *cursor, size);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_allocatable->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

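/*
 * For example, a 24K request has align = 24K & -24K = 8K, so it uses the
 * 8K-alignment cursor bucket in ms_lbas and its offset is rounded up to an
 * 8K boundary, while a 128K request uses the 128K-alignment cursor.
 * Keeping one cursor per power-of-two alignment clusters similarly aligned
 * allocations in the same region of the metaslab.
 */
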
/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_allocatable_by_size;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;

		rs = avl_last(&msp->ms_allocatable_by_size);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
			return (-1ULL);

		*cursor = rs->rs_start;
		*cursor_end = rs->rs_end;
	}

	offset = *cursor;
	*cursor += size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

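/*
 * For example, if the largest free segment is [1M, 5M), metaslab_cf_alloc()
 * sets cursor = 1M and cursor_end = 5M and then hands out sequential offsets
 * from that window; once the remaining window cannot satisfy a request it
 * re-selects whatever is then the largest segment.
 */
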
/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *t = &msp->ms_allocatable->rt_root;
	avl_index_t where;
	range_seg_t *rs, rsearch;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_size = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		t = &msp->ms_allocatable_by_size;

		rsearch.rs_start = 0;
		rsearch.rs_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = avl_nearest(t, where, AVL_AFTER);
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) >= size) {
		*cursor = rs->rs_start + size;
		return (rs->rs_start);
	}
	return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

/*
 * Wait for any in-progress metaslab loads to complete.
 */
static void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

static int
metaslab_load_impl(metaslab_t *msp)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loading);

	/*
	 * Nobody else can manipulate a loading metaslab, so it's now safe
	 * to drop the lock. This way we don't have to hold the lock while
	 * reading the spacemap from disk.
	 */
	mutex_exit(&msp->ms_lock);

	/*
	 * If the space map has not been allocated yet, then treat
	 * all the space in the metaslab as free and add it to ms_allocatable.
	 */
	if (msp->ms_sm != NULL) {
		error = space_map_load(msp->ms_sm, msp->ms_allocatable,
		    SM_FREE);
	} else {
		range_tree_add(msp->ms_allocatable,
		    msp->ms_start, msp->ms_size);
	}

	mutex_enter(&msp->ms_lock);

	if (error != 0)
		return (error);

	ASSERT3P(msp->ms_group, !=, NULL);
	msp->ms_loaded = B_TRUE;

	/*
	 * If the metaslab already has a spacemap, then we need to
	 * remove all segments from the defer tree; otherwise, the
	 * metaslab is completely empty and we can skip this.
	 */
	if (msp->ms_sm != NULL) {
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			range_tree_walk(msp->ms_defer[t],
			    range_tree_remove, msp->ms_allocatable);
		}
	}
	msp->ms_max_size = metaslab_block_maxsize(msp);

	return (0);
}

int
metaslab_load(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * There may be another thread loading the same metaslab, if that's
	 * the case just wait until the other thread is done and return.
	 */
	metaslab_load_wait(msp);
	if (msp->ms_loaded)
		return (0);
	VERIFY(!msp->ms_loading);

	msp->ms_loading = B_TRUE;
	int error = metaslab_load_impl(msp);
	msp->ms_loading = B_FALSE;
	cv_broadcast(&msp->ms_load_cv);

	return (error);
}

void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
	msp->ms_max_size = 0;
}

int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
    metaslab_t **msp)
{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *ms;
	int error;

	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);

	ms->ms_id = id;
	ms->ms_start = id << vd->vdev_ms_shift;
	ms->ms_size = 1ULL << vd->vdev_ms_shift;
	ms->ms_allocator = -1;
	ms->ms_new = B_TRUE;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for it.
	 */
	if (object != 0) {
		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
		    ms->ms_size, vd->vdev_ashift);

		if (error != 0) {
			kmem_free(ms, sizeof (metaslab_t));
			return (error);
		}

		ASSERT(ms->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * other range trees until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
	metaslab_group_add(mg, ms);

	metaslab_set_fragmentation(ms);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * The metaslab's weight will also be initialized when we sync
	 * out this txg. This ensures that we don't attempt to allocate
	 * from it before we have initialized it completely.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(ms, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space map object then load its space
	 * map so that we can verify frees.
	 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
		mutex_enter(&ms->ms_lock);
		VERIFY0(metaslab_load(ms));
		mutex_exit(&ms->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);
	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_allocatable);
	range_tree_destroy(msp->ms_freeing);
	range_tree_destroy(msp->ms_freed);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_allocating[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defer[t]);
	}
	ASSERT0(msp->ms_deferspace);

	range_tree_destroy(msp->ms_checkpointing);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);
	mutex_destroy(&msp->ms_sync_lock);
	ASSERT3U(msp->ms_allocator, ==, -1);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};

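/*
 * As a worked example: a metaslab whose free space is 60% in 512K segments
 * (factor 30) and 40% in 16M segments (factor 0) would report a
 * fragmentation of 60% * 30 + 40% * 0 = 18.
 */
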
/*
 * Calculate the metaslab's fragmentation metric and store it in
 * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
 * has not been upgraded and does not support this metric. Otherwise,
 * the value should be in the range [0, 100].
 */
static void
metaslab_set_fragmentation(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled) {
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
	 */
	if (msp->ms_sm == NULL) {
		msp->ms_fragmentation = 0;
		return;
	}

	/*
	 * If this metaslab's space map has not been upgraded, flag it
	 * so that we upgrade next time we encounter it.
	 */
	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
		uint64_t txg = spa_syncing_txg(spa);
		vdev_t *vd = msp->ms_group->mg_vd;

		/*
		 * If we've reached the final dirty txg, then we must
		 * be shutting down the pool. We don't want to dirty
		 * any data past this point so skip setting the condense
		 * flag. We can retry this action the next time the pool
		 * is imported.
		 */
		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
			msp->ms_condense_wanted = B_TRUE;
			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
			zfs_dbgmsg("txg %llu, requesting force condense: "
			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
			    vd->vdev_id);
		}
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t space = 0;
		uint8_t shift = msp->ms_sm->sm_shift;

		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
			continue;

		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
		total += space;

		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
		fragmentation += space * zfs_frag_table[idx];
	}

	if (total > 0)
		fragmentation /= total;
	ASSERT3U(fragmentation, <=, 100);

	msp->ms_fragmentation = fragmentation;
}

1828 */ 1829 if (space > 0 && space < SPA_MINBLOCKSIZE) 1830 space = SPA_MINBLOCKSIZE; 1831 } 1832 weight = space; 1833 1834 /* 1835 * Modern disks have uniform bit density and constant angular velocity. 1836 * Therefore, the outer recording zones are faster (higher bandwidth) 1837 * than the inner zones by the ratio of outer to inner track diameter, 1838 * which is typically around 2:1. We account for this by assigning 1839 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1840 * In effect, this means that we'll select the metaslab with the most 1841 * free bandwidth rather than simply the one with the most free space. 1842 */ 1843 if (metaslab_lba_weighting_enabled) { 1844 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1845 ASSERT(weight >= space && weight <= 2 * space); 1846 } 1847 1848 /* 1849 * If this metaslab is one we're actively using, adjust its 1850 * weight to make it preferable to any inactive metaslab so 1851 * we'll polish it off. If the fragmentation on this metaslab 1852 * has exceeded our threshold, then don't mark it active. 1853 */ 1854 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1855 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1856 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1857 } 1858 1859 WEIGHT_SET_SPACEBASED(weight); 1860 return (weight); 1861 } 1862 1863 /* 1864 * Return the weight of the specified metaslab, according to the segment-based 1865 * weighting algorithm. The metaslab must be loaded. This function can 1866 * be called within a sync pass since it relies only on the metaslab's 1867 * range tree which is always accurate when the metaslab is loaded. 1868 */ 1869 static uint64_t 1870 metaslab_weight_from_range_tree(metaslab_t *msp) 1871 { 1872 uint64_t weight = 0; 1873 uint32_t segments = 0; 1874 1875 ASSERT(msp->ms_loaded); 1876 1877 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 1878 i--) { 1879 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 1880 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1881 1882 segments <<= 1; 1883 segments += msp->ms_allocatable->rt_histogram[i]; 1884 1885 /* 1886 * The range tree provides more precision than the space map 1887 * and must be downgraded so that all values fit within the 1888 * space map's histogram. This allows us to compare loaded 1889 * vs. unloaded metaslabs to determine which metaslab is 1890 * considered "best". 1891 */ 1892 if (i > max_idx) 1893 continue; 1894 1895 if (segments != 0) { 1896 WEIGHT_SET_COUNT(weight, segments); 1897 WEIGHT_SET_INDEX(weight, i); 1898 WEIGHT_SET_ACTIVE(weight, 0); 1899 break; 1900 } 1901 } 1902 return (weight); 1903 } 1904 1905 /* 1906 * Calculate the weight based on the on-disk histogram. This should only 1907 * be called after a sync pass has completely finished since the on-disk 1908 * information is updated in metaslab_sync(). 1909 */ 1910 static uint64_t 1911 metaslab_weight_from_spacemap(metaslab_t *msp) 1912 { 1913 uint64_t weight = 0; 1914 1915 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 1916 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { 1917 WEIGHT_SET_COUNT(weight, 1918 msp->ms_sm->sm_phys->smp_histogram[i]); 1919 WEIGHT_SET_INDEX(weight, i + 1920 msp->ms_sm->sm_shift); 1921 WEIGHT_SET_ACTIVE(weight, 0); 1922 break; 1923 } 1924 } 1925 return (weight); 1926 } 1927 1928 /* 1929 * Compute a segment-based weight for the specified metaslab. The weight 1930 * is determined by the highest bucket in the histogram.
The information 1931 * for the highest bucket is encoded into the weight value. 1932 */ 1933 static uint64_t 1934 metaslab_segment_weight(metaslab_t *msp) 1935 { 1936 metaslab_group_t *mg = msp->ms_group; 1937 uint64_t weight = 0; 1938 uint8_t shift = mg->mg_vd->vdev_ashift; 1939 1940 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1941 1942 /* 1943 * The metaslab is completely free. 1944 */ 1945 if (space_map_allocated(msp->ms_sm) == 0) { 1946 int idx = highbit64(msp->ms_size) - 1; 1947 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1948 1949 if (idx < max_idx) { 1950 WEIGHT_SET_COUNT(weight, 1ULL); 1951 WEIGHT_SET_INDEX(weight, idx); 1952 } else { 1953 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 1954 WEIGHT_SET_INDEX(weight, max_idx); 1955 } 1956 WEIGHT_SET_ACTIVE(weight, 0); 1957 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 1958 1959 return (weight); 1960 } 1961 1962 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 1963 1964 /* 1965 * If the metaslab is fully allocated then just make the weight 0. 1966 */ 1967 if (space_map_allocated(msp->ms_sm) == msp->ms_size) 1968 return (0); 1969 /* 1970 * If the metaslab is already loaded, then use the range tree to 1971 * determine the weight. Otherwise, we rely on the space map information 1972 * to generate the weight. 1973 */ 1974 if (msp->ms_loaded) { 1975 weight = metaslab_weight_from_range_tree(msp); 1976 } else { 1977 weight = metaslab_weight_from_spacemap(msp); 1978 } 1979 1980 /* 1981 * If the metaslab was active the last time we calculated its weight 1982 * then keep it active. We want to consume the entire region that 1983 * is associated with this weight. 1984 */ 1985 if (msp->ms_activation_weight != 0 && weight != 0) 1986 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 1987 return (weight); 1988 } 1989 1990 /* 1991 * Determine if we should attempt to allocate from this metaslab. If the 1992 * metaslab has a maximum size then we can quickly determine if the desired 1993 * allocation size can be satisfied. Otherwise, if we're using segment-based 1994 * weighting then we can determine the maximum allocation that this metaslab 1995 * can accommodate based on the index encoded in the weight. If we're using 1996 * space-based weights then rely on the entire weight (excluding the weight 1997 * type bit). 1998 */ 1999 boolean_t 2000 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 2001 { 2002 boolean_t should_allocate; 2003 2004 if (msp->ms_max_size != 0) 2005 return (msp->ms_max_size >= asize); 2006 2007 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2008 /* 2009 * The metaslab segment weight indicates segments in the 2010 * range [2^i, 2^(i+1)), where i is the index in the weight. 2011 * Since the asize might be in the middle of the range, we 2012 * should attempt the allocation if asize < 2^(i+1). 2013 */ 2014 should_allocate = (asize < 2015 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2016 } else { 2017 should_allocate = (asize <= 2018 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2019 } 2020 return (should_allocate); 2021 } 2022 2023 static uint64_t 2024 metaslab_weight(metaslab_t *msp) 2025 { 2026 vdev_t *vd = msp->ms_group->mg_vd; 2027 spa_t *spa = vd->vdev_spa; 2028 uint64_t weight; 2029 2030 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2031 2032 /* 2033 * If this vdev is in the process of being removed, there is nothing 2034 * for us to do here. 
2035 */ 2036 if (vd->vdev_removing) 2037 return (0); 2038 2039 metaslab_set_fragmentation(msp); 2040 2041 /* 2042 * Update the maximum size if the metaslab is loaded. This will 2043 * ensure that we get an accurate maximum size if newly freed space 2044 * has been added back into the free tree. 2045 */ 2046 if (msp->ms_loaded) 2047 msp->ms_max_size = metaslab_block_maxsize(msp); 2048 2049 /* 2050 * Segment-based weighting requires space map histogram support. 2051 */ 2052 if (zfs_metaslab_segment_weight_enabled && 2053 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2054 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2055 sizeof (space_map_phys_t))) { 2056 weight = metaslab_segment_weight(msp); 2057 } else { 2058 weight = metaslab_space_weight(msp); 2059 } 2060 return (weight); 2061 } 2062 2063 static int 2064 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2065 int allocator, uint64_t activation_weight) 2066 { 2067 /* 2068 * If we're activating for the claim code, we don't want to actually 2069 * set the metaslab up for a specific allocator. 2070 */ 2071 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2072 return (0); 2073 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2074 mg->mg_primaries : mg->mg_secondaries); 2075 2076 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2077 mutex_enter(&mg->mg_lock); 2078 if (arr[allocator] != NULL) { 2079 mutex_exit(&mg->mg_lock); 2080 return (EEXIST); 2081 } 2082 2083 arr[allocator] = msp; 2084 ASSERT3S(msp->ms_allocator, ==, -1); 2085 msp->ms_allocator = allocator; 2086 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2087 mutex_exit(&mg->mg_lock); 2088 2089 return (0); 2090 } 2091 2092 static int 2093 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2094 { 2095 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2096 2097 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2098 int error = metaslab_load(msp); 2099 if (error != 0) { 2100 metaslab_group_sort(msp->ms_group, msp, 0); 2101 return (error); 2102 } 2103 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2104 /* 2105 * The metaslab was activated for another allocator 2106 * while we were waiting, we should reselect. 
2107 */ 2108 return (EBUSY); 2109 } 2110 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2111 allocator, activation_weight)) != 0) { 2112 return (error); 2113 } 2114 2115 msp->ms_activation_weight = msp->ms_weight; 2116 metaslab_group_sort(msp->ms_group, msp, 2117 msp->ms_weight | activation_weight); 2118 } 2119 ASSERT(msp->ms_loaded); 2120 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2121 2122 return (0); 2123 } 2124 2125 static void 2126 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2127 uint64_t weight) 2128 { 2129 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2130 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2131 metaslab_group_sort(mg, msp, weight); 2132 return; 2133 } 2134 2135 mutex_enter(&mg->mg_lock); 2136 ASSERT3P(msp->ms_group, ==, mg); 2137 if (msp->ms_primary) { 2138 ASSERT3U(0, <=, msp->ms_allocator); 2139 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2140 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2141 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2142 mg->mg_primaries[msp->ms_allocator] = NULL; 2143 } else { 2144 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2145 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2146 mg->mg_secondaries[msp->ms_allocator] = NULL; 2147 } 2148 msp->ms_allocator = -1; 2149 metaslab_group_sort_impl(mg, msp, weight); 2150 mutex_exit(&mg->mg_lock); 2151 } 2152 2153 static void 2154 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2155 { 2156 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2157 2158 /* 2159 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2160 * this metaslab again. In that case, it had better be empty, 2161 * or we would be leaving space on the table. 2162 */ 2163 ASSERT(size >= SPA_MINBLOCKSIZE || 2164 range_tree_is_empty(msp->ms_allocatable)); 2165 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2166 2167 msp->ms_activation_weight = 0; 2168 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2169 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2170 } 2171 2172 /* 2173 * Segment-based metaslabs are activated once and remain active until 2174 * we either fail an allocation attempt (similar to space-based metaslabs) 2175 * or have exhausted the free space in zfs_metaslab_switch_threshold 2176 * buckets since the metaslab was activated. This function checks to see 2177 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the 2178 * metaslab and passivates it proactively. This allows us to select a 2179 * metaslab with a larger contiguous region, if any remain within this 2180 * metaslab group. If we're in sync pass > 1, then we continue using this 2181 * metaslab so that we don't dirty more blocks and cause more sync passes. 2182 */ 2183 void 2184 metaslab_segment_may_passivate(metaslab_t *msp) 2185 { 2186 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2187 2188 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2189 return; 2190 2191 /* 2192 * Since we are in the middle of a sync pass, the most accurate 2193 * information that is accessible to us is the in-core range tree 2194 * histogram; calculate the new weight based on that information.
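 * For example (illustrative values, not part of the original comment): if
 * the metaslab was activated when its largest free segments fell in the
 * 1M bucket (index 20) and zfs_metaslab_switch_threshold were 2, we would
 * passivate it once its largest remaining free segments dropped to the
 * 256K bucket (index 18) or below.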
2195 */ 2196 uint64_t weight = metaslab_weight_from_range_tree(msp); 2197 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2198 int current_idx = WEIGHT_GET_INDEX(weight); 2199 2200 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2201 metaslab_passivate(msp, weight); 2202 } 2203 2204 static void 2205 metaslab_preload(void *arg) 2206 { 2207 metaslab_t *msp = arg; 2208 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2209 2210 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2211 2212 mutex_enter(&msp->ms_lock); 2213 (void) metaslab_load(msp); 2214 msp->ms_selected_txg = spa_syncing_txg(spa); 2215 mutex_exit(&msp->ms_lock); 2216 } 2217 2218 static void 2219 metaslab_group_preload(metaslab_group_t *mg) 2220 { 2221 spa_t *spa = mg->mg_vd->vdev_spa; 2222 metaslab_t *msp; 2223 avl_tree_t *t = &mg->mg_metaslab_tree; 2224 int m = 0; 2225 2226 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2227 taskq_wait(mg->mg_taskq); 2228 return; 2229 } 2230 2231 mutex_enter(&mg->mg_lock); 2232 2233 /* 2234 * Load the next potential metaslabs 2235 */ 2236 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2237 ASSERT3P(msp->ms_group, ==, mg); 2238 2239 /* 2240 * We preload only the maximum number of metaslabs specified 2241 * by metaslab_preload_limit. If a metaslab is being forced 2242 * to condense then we preload it too. This will ensure 2243 * that force condensing happens in the next txg. 2244 */ 2245 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2246 continue; 2247 } 2248 2249 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2250 msp, TQ_SLEEP) != NULL); 2251 } 2252 mutex_exit(&mg->mg_lock); 2253 } 2254 2255 /* 2256 * Determine if the space map's on-disk footprint is past our tolerance 2257 * for inefficiency. We would like to use the following criteria to make 2258 * our decision: 2259 * 2260 * 1. The size of the space map object should not dramatically increase as a 2261 * result of writing out the free space range tree. 2262 * 2263 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2264 * times the size of the free space range tree representation 2265 * (e.g. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2266 * 2267 * 3. The on-disk size of the space map should actually decrease. 2268 * 2269 * Unfortunately, we cannot compute the on-disk size of the space map in this 2270 * context because we cannot accurately compute the effects of compression, etc. 2271 * Instead, we apply the heuristic described in the block comment for 2272 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2273 * is greater than a threshold number of blocks. 2274 */ 2275 static boolean_t 2276 metaslab_should_condense(metaslab_t *msp) 2277 { 2278 space_map_t *sm = msp->ms_sm; 2279 vdev_t *vd = msp->ms_group->mg_vd; 2280 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2281 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2282 2283 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2284 ASSERT(msp->ms_loaded); 2285 2286 /* 2287 * Allocations and frees in early passes are generally more space 2288 * efficient (in terms of blocks described in space map entries) 2289 * than the ones in later passes (e.g. we don't compress after 2290 * sync pass 5) and condensing a metaslab multiple times in a txg 2291 * could degrade performance. 2292 * 2293 * Thus we prefer condensing each metaslab at most once every txg at 2294 * the earliest sync pass possible.
If a metaslab is eligible for 2295 * condensing again after being considered for condensing within the 2296 * same txg, it will hopefully be dirty in the next txg where it will 2297 * be condensed at an earlier pass. 2298 */ 2299 if (msp->ms_condense_checked_txg == current_txg) 2300 return (B_FALSE); 2301 msp->ms_condense_checked_txg = current_txg; 2302 2303 /* 2304 * We always condense metaslabs that are empty and metaslabs for 2305 * which a condense request has been made. 2306 */ 2307 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2308 msp->ms_condense_wanted) 2309 return (B_TRUE); 2310 2311 uint64_t object_size = space_map_length(msp->ms_sm); 2312 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2313 msp->ms_allocatable, SM_NO_VDEVID); 2314 2315 dmu_object_info_t doi; 2316 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2317 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2318 2319 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2320 object_size > zfs_metaslab_condense_block_threshold * record_size); 2321 } 2322 2323 /* 2324 * Condense the on-disk space map representation to its minimized form. 2325 * The minimized form consists of a small number of allocations followed by 2326 * the entries of the free range tree. 2327 */ 2328 static void 2329 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2330 { 2331 range_tree_t *condense_tree; 2332 space_map_t *sm = msp->ms_sm; 2333 2334 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2335 ASSERT(msp->ms_loaded); 2336 2337 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2338 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2339 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2340 msp->ms_group->mg_vd->vdev_spa->spa_name, 2341 space_map_length(msp->ms_sm), 2342 avl_numnodes(&msp->ms_allocatable->rt_root), 2343 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2344 2345 msp->ms_condense_wanted = B_FALSE; 2346 2347 /* 2348 * Create a range tree that is 100% allocated. We remove segments 2349 * that have been freed in this txg, any deferred frees that exist, 2350 * and any allocations in the future. Removing segments should be 2351 * a relatively inexpensive operation since we expect these trees to 2352 * have a small number of nodes. 2353 */ 2354 condense_tree = range_tree_create(NULL, NULL); 2355 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2356 2357 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2358 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2359 2360 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2361 range_tree_walk(msp->ms_defer[t], 2362 range_tree_remove, condense_tree); 2363 } 2364 2365 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2366 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2367 range_tree_remove, condense_tree); 2368 } 2369 2370 /* 2371 * We're about to drop the metaslab's lock, thus allowing 2372 * other consumers to change its content. Set the 2373 * metaslab's ms_condensing flag to ensure that 2374 * allocations on this metaslab do not occur while we're 2375 * in the middle of committing it to disk. This is only critical 2376 * for ms_allocatable as all other range trees use per txg 2377 * views of their content.
2378 */ 2379 msp->ms_condensing = B_TRUE; 2380 2381 mutex_exit(&msp->ms_lock); 2382 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2383 2384 /* 2385 * While we would ideally like to create a space map representation 2386 * that consists only of allocation records, doing so can be 2387 * prohibitively expensive because the in-core free tree can be 2388 * large, and therefore computationally expensive to subtract 2389 * from the condense_tree. Instead we sync out two trees, a cheap 2390 * allocation only tree followed by the in-core free tree. While not 2391 * optimal, this is typically close to optimal, and much cheaper to 2392 * compute. 2393 */ 2394 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2395 range_tree_vacate(condense_tree, NULL, NULL); 2396 range_tree_destroy(condense_tree); 2397 2398 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2399 mutex_enter(&msp->ms_lock); 2400 msp->ms_condensing = B_FALSE; 2401 } 2402 2403 /* 2404 * Write a metaslab to disk in the context of the specified transaction group. 2405 */ 2406 void 2407 metaslab_sync(metaslab_t *msp, uint64_t txg) 2408 { 2409 metaslab_group_t *mg = msp->ms_group; 2410 vdev_t *vd = mg->mg_vd; 2411 spa_t *spa = vd->vdev_spa; 2412 objset_t *mos = spa_meta_objset(spa); 2413 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2414 dmu_tx_t *tx; 2415 uint64_t object = space_map_object(msp->ms_sm); 2416 2417 ASSERT(!vd->vdev_ishole); 2418 2419 /* 2420 * This metaslab has just been added so there's no work to do now. 2421 */ 2422 if (msp->ms_freeing == NULL) { 2423 ASSERT3P(alloctree, ==, NULL); 2424 return; 2425 } 2426 2427 ASSERT3P(alloctree, !=, NULL); 2428 ASSERT3P(msp->ms_freeing, !=, NULL); 2429 ASSERT3P(msp->ms_freed, !=, NULL); 2430 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2431 2432 /* 2433 * Normally, we don't want to process a metaslab if there are no 2434 * allocations or frees to perform. However, if the metaslab is being 2435 * forced to condense and it's loaded, we need to let it through. 2436 */ 2437 if (range_tree_is_empty(alloctree) && 2438 range_tree_is_empty(msp->ms_freeing) && 2439 range_tree_is_empty(msp->ms_checkpointing) && 2440 !(msp->ms_loaded && msp->ms_condense_wanted)) 2441 return; 2442 2443 2444 VERIFY(txg <= spa_final_dirty_txg(spa)); 2445 2446 /* 2447 * The only state that can actually be changing concurrently with 2448 * metaslab_sync() is the metaslab's ms_allocatable. No other 2449 * thread can be modifying this txg's alloc, freeing, 2450 * freed, or space_map_phys_t. We drop ms_lock whenever we 2451 * could call into the DMU, because the DMU can call down to us 2452 * (e.g. via zio_free()) at any time. 2453 * 2454 * The spa_vdev_remove_thread() can be reading metaslab state 2455 * concurrently, and it is locked out by the ms_sync_lock. Note 2456 * that the ms_lock is insufficient for this, because it is dropped 2457 * by space_map_write(). 
2458 */ 2459 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2460 2461 if (msp->ms_sm == NULL) { 2462 uint64_t new_object; 2463 2464 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2465 VERIFY3U(new_object, !=, 0); 2466 2467 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2468 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2469 ASSERT(msp->ms_sm != NULL); 2470 } 2471 2472 if (!range_tree_is_empty(msp->ms_checkpointing) && 2473 vd->vdev_checkpoint_sm == NULL) { 2474 ASSERT(spa_has_checkpoint(spa)); 2475 2476 uint64_t new_object = space_map_alloc(mos, 2477 vdev_standard_sm_blksz, tx); 2478 VERIFY3U(new_object, !=, 0); 2479 2480 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2481 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2482 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2483 2484 /* 2485 * We save the space map object as an entry in vdev_top_zap 2486 * so it can be retrieved when the pool is reopened after an 2487 * export or through zdb. 2488 */ 2489 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2490 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2491 sizeof (new_object), 1, &new_object, tx)); 2492 } 2493 2494 mutex_enter(&msp->ms_sync_lock); 2495 mutex_enter(&msp->ms_lock); 2496 2497 /* 2498 * Note: metaslab_condense() clears the space map's histogram. 2499 * Therefore we must verify and remove this histogram before 2500 * condensing. 2501 */ 2502 metaslab_group_histogram_verify(mg); 2503 metaslab_class_histogram_verify(mg->mg_class); 2504 metaslab_group_histogram_remove(mg, msp); 2505 2506 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2507 metaslab_condense(msp, txg, tx); 2508 } else { 2509 mutex_exit(&msp->ms_lock); 2510 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2511 SM_NO_VDEVID, tx); 2512 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2513 SM_NO_VDEVID, tx); 2514 mutex_enter(&msp->ms_lock); 2515 } 2516 2517 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2518 ASSERT(spa_has_checkpoint(spa)); 2519 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2520 2521 /* 2522 * Since we are doing writes to disk and the ms_checkpointing 2523 * tree won't be changing during that time, we drop the 2524 * ms_lock while writing to the checkpoint space map. 2525 */ 2526 mutex_exit(&msp->ms_lock); 2527 space_map_write(vd->vdev_checkpoint_sm, 2528 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2529 mutex_enter(&msp->ms_lock); 2530 space_map_update(vd->vdev_checkpoint_sm); 2531 2532 spa->spa_checkpoint_info.sci_dspace += 2533 range_tree_space(msp->ms_checkpointing); 2534 vd->vdev_stat.vs_checkpoint_space += 2535 range_tree_space(msp->ms_checkpointing); 2536 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2537 -vd->vdev_checkpoint_sm->sm_alloc); 2538 2539 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2540 } 2541 2542 if (msp->ms_loaded) { 2543 /* 2544 * When the space map is loaded, we have an accurate 2545 * histogram in the range tree. This gives us an opportunity 2546 * to bring the space map's histogram up-to-date so we clear 2547 * it first before updating it. 2548 */ 2549 space_map_histogram_clear(msp->ms_sm); 2550 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2551 2552 /* 2553 * Since we've cleared the histogram we need to add back 2554 * any free space that has already been processed, plus 2555 * any deferred space. This allows the on-disk histogram 2556 * to accurately reflect all free space even if some space 2557 * is not yet available for allocation (i.e. deferred). 
2558 */ 2559 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2560 2561 /* 2562 * Add back any deferred free space that has not been 2563 * added back into the in-core free tree yet. This will 2564 * ensure that we don't end up with a space map histogram 2565 * that is completely empty unless the metaslab is fully 2566 * allocated. 2567 */ 2568 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2569 space_map_histogram_add(msp->ms_sm, 2570 msp->ms_defer[t], tx); 2571 } 2572 } 2573 2574 /* 2575 * Always add the free space from this sync pass to the space 2576 * map histogram. We want to make sure that the on-disk histogram 2577 * accounts for all free space. If the space map is not loaded, 2578 * then we will lose some accuracy but will correct it the next 2579 * time we load the space map. 2580 */ 2581 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2582 2583 metaslab_group_histogram_add(mg, msp); 2584 metaslab_group_histogram_verify(mg); 2585 metaslab_class_histogram_verify(mg->mg_class); 2586 2587 /* 2588 * For sync pass 1, we avoid traversing this txg's free range tree 2589 * and instead will just swap the pointers for freeing and 2590 * freed. We can safely do this since the freed_tree is 2591 * guaranteed to be empty on the initial pass. 2592 */ 2593 if (spa_sync_pass(spa) == 1) { 2594 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2595 } else { 2596 range_tree_vacate(msp->ms_freeing, 2597 range_tree_add, msp->ms_freed); 2598 } 2599 range_tree_vacate(alloctree, NULL, NULL); 2600 2601 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2602 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2603 & TXG_MASK])); 2604 ASSERT0(range_tree_space(msp->ms_freeing)); 2605 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2606 2607 mutex_exit(&msp->ms_lock); 2608 2609 if (object != space_map_object(msp->ms_sm)) { 2610 object = space_map_object(msp->ms_sm); 2611 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2612 msp->ms_id, sizeof (uint64_t), &object, tx); 2613 } 2614 mutex_exit(&msp->ms_sync_lock); 2615 dmu_tx_commit(tx); 2616 } 2617 2618 /* 2619 * Called after a transaction group has completely synced to mark 2620 * all of the metaslab's free space as usable. 2621 */ 2622 void 2623 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2624 { 2625 metaslab_group_t *mg = msp->ms_group; 2626 vdev_t *vd = mg->mg_vd; 2627 spa_t *spa = vd->vdev_spa; 2628 range_tree_t **defer_tree; 2629 int64_t alloc_delta, defer_delta; 2630 boolean_t defer_allowed = B_TRUE; 2631 2632 ASSERT(!vd->vdev_ishole); 2633 2634 mutex_enter(&msp->ms_lock); 2635 2636 /* 2637 * If this metaslab is just becoming available, initialize its 2638 * range trees and add its capacity to the vdev. 
2639 */ 2640 if (msp->ms_freed == NULL) { 2641 for (int t = 0; t < TXG_SIZE; t++) { 2642 ASSERT(msp->ms_allocating[t] == NULL); 2643 2644 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2645 } 2646 2647 ASSERT3P(msp->ms_freeing, ==, NULL); 2648 msp->ms_freeing = range_tree_create(NULL, NULL); 2649 2650 ASSERT3P(msp->ms_freed, ==, NULL); 2651 msp->ms_freed = range_tree_create(NULL, NULL); 2652 2653 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2654 ASSERT(msp->ms_defer[t] == NULL); 2655 2656 msp->ms_defer[t] = range_tree_create(NULL, NULL); 2657 } 2658 2659 ASSERT3P(msp->ms_checkpointing, ==, NULL); 2660 msp->ms_checkpointing = range_tree_create(NULL, NULL); 2661 2662 vdev_space_update(vd, 0, 0, msp->ms_size); 2663 } 2664 ASSERT0(range_tree_space(msp->ms_freeing)); 2665 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2666 2667 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2668 2669 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2670 metaslab_class_get_alloc(spa_normal_class(spa)); 2671 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2672 defer_allowed = B_FALSE; 2673 } 2674 2675 defer_delta = 0; 2676 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2677 if (defer_allowed) { 2678 defer_delta = range_tree_space(msp->ms_freed) - 2679 range_tree_space(*defer_tree); 2680 } else { 2681 defer_delta -= range_tree_space(*defer_tree); 2682 } 2683 2684 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2685 2686 /* 2687 * If there's a metaslab_load() in progress, wait for it to complete 2688 * so that we have a consistent view of the in-core space map. 2689 */ 2690 metaslab_load_wait(msp); 2691 2692 /* 2693 * Move the frees from the defer_tree back to the free 2694 * range tree (if it's loaded). Swap the freed_tree and 2695 * the defer_tree -- this is safe to do because we've 2696 * just emptied out the defer_tree. 2697 */ 2698 range_tree_vacate(*defer_tree, 2699 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 2700 if (defer_allowed) { 2701 range_tree_swap(&msp->ms_freed, defer_tree); 2702 } else { 2703 range_tree_vacate(msp->ms_freed, 2704 msp->ms_loaded ? range_tree_add : NULL, 2705 msp->ms_allocatable); 2706 } 2707 space_map_update(msp->ms_sm); 2708 2709 msp->ms_deferspace += defer_delta; 2710 ASSERT3S(msp->ms_deferspace, >=, 0); 2711 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2712 if (msp->ms_deferspace != 0) { 2713 /* 2714 * Keep syncing this metaslab until all deferred frees 2715 * are back in circulation. 2716 */ 2717 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2718 } 2719 2720 if (msp->ms_new) { 2721 msp->ms_new = B_FALSE; 2722 mutex_enter(&mg->mg_lock); 2723 mg->mg_ms_ready++; 2724 mutex_exit(&mg->mg_lock); 2725 } 2726 /* 2727 * Calculate the new weights before unloading any metaslabs. 2728 * This will give us the most accurate weighting. 2729 */ 2730 metaslab_group_sort(mg, msp, metaslab_weight(msp) | 2731 (msp->ms_weight & METASLAB_ACTIVE_MASK)); 2732 2733 /* 2734 * If the metaslab is loaded and we've not tried to load or allocate 2735 * from it in 'metaslab_unload_delay' txgs, then unload it. 
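 * (We also require that no initialization is in progress; if the metaslab
 * is still assigned to an allocator it is passivated before being
 * unloaded, and metaslab_debug_unload keeps it loaded regardless.)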
2736 */ 2737 if (msp->ms_loaded && 2738 msp->ms_initializing == 0 && 2739 msp->ms_selected_txg + metaslab_unload_delay < txg) { 2740 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2741 VERIFY0(range_tree_space( 2742 msp->ms_allocating[(txg + t) & TXG_MASK])); 2743 } 2744 if (msp->ms_allocator != -1) { 2745 metaslab_passivate(msp, msp->ms_weight & 2746 ~METASLAB_ACTIVE_MASK); 2747 } 2748 2749 if (!metaslab_debug_unload) 2750 metaslab_unload(msp); 2751 } 2752 2753 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2754 ASSERT0(range_tree_space(msp->ms_freeing)); 2755 ASSERT0(range_tree_space(msp->ms_freed)); 2756 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2757 2758 mutex_exit(&msp->ms_lock); 2759 } 2760 2761 void 2762 metaslab_sync_reassess(metaslab_group_t *mg) 2763 { 2764 spa_t *spa = mg->mg_class->mc_spa; 2765 2766 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2767 metaslab_group_alloc_update(mg); 2768 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2769 2770 /* 2771 * Preload the next potential metaslabs but only on active 2772 * metaslab groups. We can get into a state where the metaslab 2773 * is no longer active since we dirty metaslabs as we remove a 2774 * a device, thus potentially making the metaslab group eligible 2775 * for preloading. 2776 */ 2777 if (mg->mg_activation_count > 0) { 2778 metaslab_group_preload(mg); 2779 } 2780 spa_config_exit(spa, SCL_ALLOC, FTAG); 2781 } 2782 2783 static uint64_t 2784 metaslab_distance(metaslab_t *msp, dva_t *dva) 2785 { 2786 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2787 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2788 uint64_t start = msp->ms_id; 2789 2790 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2791 return (1ULL << 63); 2792 2793 if (offset < start) 2794 return ((start - offset) << ms_shift); 2795 if (offset > start) 2796 return ((offset - start) << ms_shift); 2797 return (0); 2798 } 2799 2800 /* 2801 * ========================================================================== 2802 * Metaslab allocation tracing facility 2803 * ========================================================================== 2804 */ 2805 kstat_t *metaslab_trace_ksp; 2806 kstat_named_t metaslab_trace_over_limit; 2807 2808 void 2809 metaslab_alloc_trace_init(void) 2810 { 2811 ASSERT(metaslab_alloc_trace_cache == NULL); 2812 metaslab_alloc_trace_cache = kmem_cache_create( 2813 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 2814 0, NULL, NULL, NULL, NULL, NULL, 0); 2815 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 2816 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 2817 if (metaslab_trace_ksp != NULL) { 2818 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 2819 kstat_named_init(&metaslab_trace_over_limit, 2820 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 2821 kstat_install(metaslab_trace_ksp); 2822 } 2823 } 2824 2825 void 2826 metaslab_alloc_trace_fini(void) 2827 { 2828 if (metaslab_trace_ksp != NULL) { 2829 kstat_delete(metaslab_trace_ksp); 2830 metaslab_trace_ksp = NULL; 2831 } 2832 kmem_cache_destroy(metaslab_alloc_trace_cache); 2833 metaslab_alloc_trace_cache = NULL; 2834 } 2835 2836 /* 2837 * Add an allocation trace element to the allocation tracing list. 
2838 */ 2839 static void 2840 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 2841 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 2842 int allocator) 2843 { 2844 if (!metaslab_trace_enabled) 2845 return; 2846 2847 /* 2848 * When the tracing list reaches its maximum we remove 2849 * the second element in the list before adding a new one. 2850 * By removing the second element we preserve the original 2851 * entry as a clue to what allocations steps have already been 2852 * performed. 2853 */ 2854 if (zal->zal_size == metaslab_trace_max_entries) { 2855 metaslab_alloc_trace_t *mat_next; 2856 #ifdef DEBUG 2857 panic("too many entries in allocation list"); 2858 #endif 2859 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 2860 zal->zal_size--; 2861 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 2862 list_remove(&zal->zal_list, mat_next); 2863 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 2864 } 2865 2866 metaslab_alloc_trace_t *mat = 2867 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2868 list_link_init(&mat->mat_list_node); 2869 mat->mat_mg = mg; 2870 mat->mat_msp = msp; 2871 mat->mat_size = psize; 2872 mat->mat_dva_id = dva_id; 2873 mat->mat_offset = offset; 2874 mat->mat_weight = 0; 2875 mat->mat_allocator = allocator; 2876 2877 if (msp != NULL) 2878 mat->mat_weight = msp->ms_weight; 2879 2880 /* 2881 * The list is part of the zio so locking is not required. Only 2882 * a single thread will perform allocations for a given zio. 2883 */ 2884 list_insert_tail(&zal->zal_list, mat); 2885 zal->zal_size++; 2886 2887 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 2888 } 2889 2890 void 2891 metaslab_trace_init(zio_alloc_list_t *zal) 2892 { 2893 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 2894 offsetof(metaslab_alloc_trace_t, mat_list_node)); 2895 zal->zal_size = 0; 2896 } 2897 2898 void 2899 metaslab_trace_fini(zio_alloc_list_t *zal) 2900 { 2901 metaslab_alloc_trace_t *mat; 2902 2903 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 2904 kmem_cache_free(metaslab_alloc_trace_cache, mat); 2905 list_destroy(&zal->zal_list); 2906 zal->zal_size = 0; 2907 } 2908 2909 /* 2910 * ========================================================================== 2911 * Metaslab block operations 2912 * ========================================================================== 2913 */ 2914 2915 static void 2916 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 2917 int allocator) 2918 { 2919 if (!(flags & METASLAB_ASYNC_ALLOC) || 2920 (flags & METASLAB_DONT_THROTTLE)) 2921 return; 2922 2923 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2924 if (!mg->mg_class->mc_alloc_throttle_enabled) 2925 return; 2926 2927 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 2928 } 2929 2930 static void 2931 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 2932 { 2933 uint64_t max = mg->mg_max_alloc_queue_depth; 2934 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2935 while (cur < max) { 2936 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 2937 cur, cur + 1) == cur) { 2938 atomic_inc_64( 2939 &mg->mg_class->mc_alloc_max_slots[allocator]); 2940 return; 2941 } 2942 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2943 } 2944 } 2945 2946 void 2947 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 2948 int allocator, boolean_t io_complete) 2949 { 2950 if (!(flags & METASLAB_ASYNC_ALLOC) || 
2951 (flags & METASLAB_DONT_THROTTLE)) 2952 return; 2953 2954 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2955 if (!mg->mg_class->mc_alloc_throttle_enabled) 2956 return; 2957 2958 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 2959 if (io_complete) 2960 metaslab_group_increment_qdepth(mg, allocator); 2961 } 2962 2963 void 2964 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 2965 int allocator) 2966 { 2967 #ifdef ZFS_DEBUG 2968 const dva_t *dva = bp->blk_dva; 2969 int ndvas = BP_GET_NDVAS(bp); 2970 2971 for (int d = 0; d < ndvas; d++) { 2972 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2973 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2974 VERIFY(zfs_refcount_not_held( 2975 &mg->mg_alloc_queue_depth[allocator], tag)); 2976 } 2977 #endif 2978 } 2979 2980 static uint64_t 2981 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 2982 { 2983 uint64_t start; 2984 range_tree_t *rt = msp->ms_allocatable; 2985 metaslab_class_t *mc = msp->ms_group->mg_class; 2986 2987 VERIFY(!msp->ms_condensing); 2988 VERIFY0(msp->ms_initializing); 2989 2990 start = mc->mc_ops->msop_alloc(msp, size); 2991 if (start != -1ULL) { 2992 metaslab_group_t *mg = msp->ms_group; 2993 vdev_t *vd = mg->mg_vd; 2994 2995 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 2996 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2997 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 2998 range_tree_remove(rt, start, size); 2999 3000 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3001 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 3002 3003 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 3004 3005 /* Track the last successful allocation */ 3006 msp->ms_alloc_txg = txg; 3007 metaslab_verify_space(msp, txg); 3008 } 3009 3010 /* 3011 * Now that we've attempted the allocation we need to update the 3012 * metaslab's maximum block size since it may have changed. 3013 */ 3014 msp->ms_max_size = metaslab_block_maxsize(msp); 3015 return (start); 3016 } 3017 3018 /* 3019 * Find the metaslab with the highest weight that is less than what we've 3020 * already tried. In the common case, this means that we will examine each 3021 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3022 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3023 * activated by another thread, and we fail to allocate from the metaslab we 3024 * have selected, we may not try the newly-activated metaslab, and instead 3025 * activate another metaslab. This is not optimal, but generally does not cause 3026 * any problems (a possible exception being if every metaslab is completely full 3027 * except for the the newly-activated metaslab which we fail to examine). 
3028 */ 3029 static metaslab_t * 3030 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3031 dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, 3032 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3033 { 3034 avl_index_t idx; 3035 avl_tree_t *t = &mg->mg_metaslab_tree; 3036 metaslab_t *msp = avl_find(t, search, &idx); 3037 if (msp == NULL) 3038 msp = avl_nearest(t, idx, AVL_AFTER); 3039 3040 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3041 int i; 3042 if (!metaslab_should_allocate(msp, asize)) { 3043 metaslab_trace_add(zal, mg, msp, asize, d, 3044 TRACE_TOO_SMALL, allocator); 3045 continue; 3046 } 3047 3048 /* 3049 * If the selected metaslab is condensing or being 3050 * initialized, skip it. 3051 */ 3052 if (msp->ms_condensing || msp->ms_initializing > 0) 3053 continue; 3054 3055 *was_active = msp->ms_allocator != -1; 3056 /* 3057 * If we're activating as primary, this is our first allocation 3058 * from this disk, so we don't need to check how close we are. 3059 * If the metaslab under consideration was already active, 3060 * we're getting desperate enough to steal another allocator's 3061 * metaslab, so we still don't care about distances. 3062 */ 3063 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3064 break; 3065 3066 uint64_t target_distance = min_distance 3067 + (space_map_allocated(msp->ms_sm) != 0 ? 0 : 3068 min_distance >> 1); 3069 3070 for (i = 0; i < d; i++) { 3071 if (metaslab_distance(msp, &dva[i]) < target_distance) 3072 break; 3073 } 3074 if (i == d) 3075 break; 3076 } 3077 3078 if (msp != NULL) { 3079 search->ms_weight = msp->ms_weight; 3080 search->ms_start = msp->ms_start + 1; 3081 search->ms_allocator = msp->ms_allocator; 3082 search->ms_primary = msp->ms_primary; 3083 } 3084 return (msp); 3085 } 3086 3087 /* ARGSUSED */ 3088 static uint64_t 3089 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3090 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3091 int allocator) 3092 { 3093 metaslab_t *msp = NULL; 3094 uint64_t offset = -1ULL; 3095 uint64_t activation_weight; 3096 3097 activation_weight = METASLAB_WEIGHT_PRIMARY; 3098 for (int i = 0; i < d; i++) { 3099 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3100 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3101 activation_weight = METASLAB_WEIGHT_SECONDARY; 3102 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3103 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3104 activation_weight = METASLAB_WEIGHT_CLAIM; 3105 break; 3106 } 3107 } 3108 3109 /* 3110 * If we don't have enough metaslabs active to fill the entire array, we 3111 * just use the 0th slot. 3112 */ 3113 if (mg->mg_ms_ready < mg->mg_allocators * 3) 3114 allocator = 0; 3115 3116 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3117 3118 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3119 search->ms_weight = UINT64_MAX; 3120 search->ms_start = 0; 3121 /* 3122 * At the end of the metaslab tree are the already-active metaslabs, 3123 * first the primaries, then the secondaries. When we resume searching 3124 * through the tree, we need to consider ms_allocator and ms_primary so 3125 * we start in the location right after where we left off, and don't 3126 * accidentally loop forever considering the same metaslabs. 
3127 */ 3128 search->ms_allocator = -1; 3129 search->ms_primary = B_TRUE; 3130 for (;;) { 3131 boolean_t was_active = B_FALSE; 3132 3133 mutex_enter(&mg->mg_lock); 3134 3135 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3136 mg->mg_primaries[allocator] != NULL) { 3137 msp = mg->mg_primaries[allocator]; 3138 was_active = B_TRUE; 3139 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3140 mg->mg_secondaries[allocator] != NULL) { 3141 msp = mg->mg_secondaries[allocator]; 3142 was_active = B_TRUE; 3143 } else { 3144 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3145 min_distance, asize, allocator, zal, search, 3146 &was_active); 3147 } 3148 3149 mutex_exit(&mg->mg_lock); 3150 if (msp == NULL) { 3151 kmem_free(search, sizeof (*search)); 3152 return (-1ULL); 3153 } 3154 3155 mutex_enter(&msp->ms_lock); 3156 /* 3157 * Ensure that the metaslab we have selected is still 3158 * capable of handling our request. It's possible that 3159 * another thread may have changed the weight while we 3160 * were blocked on the metaslab lock. We check the 3161 * active status first to see if we need to reselect 3162 * a new metaslab. 3163 */ 3164 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3165 mutex_exit(&msp->ms_lock); 3166 continue; 3167 } 3168 3169 /* 3170 * If the metaslab is freshly activated for an allocator that 3171 * isn't the one we're allocating from, or if it's a primary and 3172 * we're seeking a secondary (or vice versa), we go back and 3173 * select a new metaslab. 3174 */ 3175 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 3176 (msp->ms_allocator != -1) && 3177 (msp->ms_allocator != allocator || ((activation_weight == 3178 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 3179 mutex_exit(&msp->ms_lock); 3180 continue; 3181 } 3182 3183 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 3184 activation_weight != METASLAB_WEIGHT_CLAIM) { 3185 metaslab_passivate(msp, msp->ms_weight & 3186 ~METASLAB_WEIGHT_CLAIM); 3187 mutex_exit(&msp->ms_lock); 3188 continue; 3189 } 3190 3191 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 3192 mutex_exit(&msp->ms_lock); 3193 continue; 3194 } 3195 3196 msp->ms_selected_txg = txg; 3197 3198 /* 3199 * Now that we have the lock, recheck to see if we should 3200 * continue to use this metaslab for this allocation. The 3201 * the metaslab is now loaded so metaslab_should_allocate() can 3202 * accurately determine if the allocation attempt should 3203 * proceed. 3204 */ 3205 if (!metaslab_should_allocate(msp, asize)) { 3206 /* Passivate this metaslab and select a new one. */ 3207 metaslab_trace_add(zal, mg, msp, asize, d, 3208 TRACE_TOO_SMALL, allocator); 3209 goto next; 3210 } 3211 3212 /* 3213 * If this metaslab is currently condensing then pick again as 3214 * we can't manipulate this metaslab until it's committed 3215 * to disk. If this metaslab is being initialized, we shouldn't 3216 * allocate from it since the allocated region might be 3217 * overwritten after allocation. 
3218 */ 3219 if (msp->ms_condensing) { 3220 metaslab_trace_add(zal, mg, msp, asize, d, 3221 TRACE_CONDENSING, allocator); 3222 metaslab_passivate(msp, msp->ms_weight & 3223 ~METASLAB_ACTIVE_MASK); 3224 mutex_exit(&msp->ms_lock); 3225 continue; 3226 } else if (msp->ms_initializing > 0) { 3227 metaslab_trace_add(zal, mg, msp, asize, d, 3228 TRACE_INITIALIZING, allocator); 3229 metaslab_passivate(msp, msp->ms_weight & 3230 ~METASLAB_ACTIVE_MASK); 3231 mutex_exit(&msp->ms_lock); 3232 continue; 3233 } 3234 3235 offset = metaslab_block_alloc(msp, asize, txg); 3236 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 3237 3238 if (offset != -1ULL) { 3239 /* Proactively passivate the metaslab, if needed */ 3240 metaslab_segment_may_passivate(msp); 3241 break; 3242 } 3243 next: 3244 ASSERT(msp->ms_loaded); 3245 3246 /* 3247 * We were unable to allocate from this metaslab so determine 3248 * a new weight for this metaslab. Now that we have loaded 3249 * the metaslab we can provide a better hint to the metaslab 3250 * selector. 3251 * 3252 * For space-based metaslabs, we use the maximum block size. 3253 * This information is only available when the metaslab 3254 * is loaded and is more accurate than the generic free 3255 * space weight that was calculated by metaslab_weight(). 3256 * This information allows us to quickly compare the maximum 3257 * available allocation in the metaslab to the allocation 3258 * size being requested. 3259 * 3260 * For segment-based metaslabs, determine the new weight 3261 * based on the highest bucket in the range tree. We 3262 * explicitly use the loaded segment weight (i.e. the range 3263 * tree histogram) since it contains the space that is 3264 * currently available for allocation and is accurate 3265 * even within a sync pass. 3266 */ 3267 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3268 uint64_t weight = metaslab_block_maxsize(msp); 3269 WEIGHT_SET_SPACEBASED(weight); 3270 metaslab_passivate(msp, weight); 3271 } else { 3272 metaslab_passivate(msp, 3273 metaslab_weight_from_range_tree(msp)); 3274 } 3275 3276 /* 3277 * We have just failed an allocation attempt, check 3278 * that metaslab_should_allocate() agrees. Otherwise, 3279 * we may end up in an infinite loop retrying the same 3280 * metaslab. 3281 */ 3282 ASSERT(!metaslab_should_allocate(msp, asize)); 3283 mutex_exit(&msp->ms_lock); 3284 } 3285 mutex_exit(&msp->ms_lock); 3286 kmem_free(search, sizeof (*search)); 3287 return (offset); 3288 } 3289 3290 static uint64_t 3291 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3292 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3293 int allocator) 3294 { 3295 uint64_t offset; 3296 ASSERT(mg->mg_initialized); 3297 3298 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, 3299 min_distance, dva, d, allocator); 3300 3301 mutex_enter(&mg->mg_lock); 3302 if (offset == -1ULL) { 3303 mg->mg_failed_allocations++; 3304 metaslab_trace_add(zal, mg, NULL, asize, d, 3305 TRACE_GROUP_FAILURE, allocator); 3306 if (asize == SPA_GANGBLOCKSIZE) { 3307 /* 3308 * This metaslab group was unable to allocate 3309 * the minimum gang block size so it must be out of 3310 * space. We must notify the allocation throttle 3311 * to start skipping allocation attempts to this 3312 * metaslab group until more space becomes available. 3313 * Note: this failure cannot be caused by the 3314 * allocation throttle since the allocation throttle 3315 * is only responsible for skipping devices and 3316 * not failing block allocations. 
3317 */ 3318 mg->mg_no_free_space = B_TRUE; 3319 } 3320 } 3321 mg->mg_allocations++; 3322 mutex_exit(&mg->mg_lock); 3323 return (offset); 3324 } 3325 3326 /* 3327 * If we have to write a ditto block (i.e. more than one DVA for a given BP) 3328 * on the same vdev as an existing DVA of this BP, then try to allocate it 3329 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the 3330 * existing DVAs. 3331 */ 3332 int ditto_same_vdev_distance_shift = 3; 3333 3334 /* 3335 * Allocate a block for the specified i/o. 3336 */ 3337 int 3338 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3339 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3340 zio_alloc_list_t *zal, int allocator) 3341 { 3342 metaslab_group_t *mg, *rotor; 3343 vdev_t *vd; 3344 boolean_t try_hard = B_FALSE; 3345 3346 ASSERT(!DVA_IS_VALID(&dva[d])); 3347 3348 /* 3349 * For testing, make some blocks above a certain size be gang blocks. 3350 */ 3351 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3352 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3353 allocator); 3354 return (SET_ERROR(ENOSPC)); 3355 } 3356 3357 /* 3358 * Start at the rotor and loop through all mgs until we find something. 3359 * Note that there's no locking on mc_rotor or mc_aliquot because 3360 * nothing actually breaks if we miss a few updates -- we just won't 3361 * allocate quite as evenly. It all balances out over time. 3362 * 3363 * If we are doing ditto or log blocks, try to spread them across 3364 * consecutive vdevs. If we're forced to reuse a vdev before we've 3365 * allocated all of our ditto blocks, then try and spread them out on 3366 * that vdev as much as possible. If it turns out to not be possible, 3367 * gradually lower our standards until anything becomes acceptable. 3368 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3369 * gives us hope of containing our fault domains to something we're 3370 * able to reason about. Otherwise, any two top-level vdev failures 3371 * will guarantee the loss of data. With consecutive allocation, 3372 * only two adjacent top-level vdev failures will result in data loss. 3373 * 3374 * If we are doing gang blocks (hintdva is non-NULL), try to keep 3375 * ourselves on the same vdev as our gang block header. That 3376 * way, we can hope for locality in vdev_cache, plus it makes our 3377 * fault domains something tractable. 3378 */ 3379 if (hintdva) { 3380 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3381 3382 /* 3383 * It's possible the vdev we're using as the hint no 3384 * longer exists or its mg has been closed (e.g. by 3385 * device removal). Consult the rotor when 3386 * all else fails. 3387 */ 3388 if (vd != NULL && vd->vdev_mg != NULL) { 3389 mg = vd->vdev_mg; 3390 3391 if (flags & METASLAB_HINTBP_AVOID && 3392 mg->mg_next != NULL) 3393 mg = mg->mg_next; 3394 } else { 3395 mg = mc->mc_rotor; 3396 } 3397 } else if (d != 0) { 3398 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3399 mg = vd->vdev_mg->mg_next; 3400 } else { 3401 mg = mc->mc_rotor; 3402 } 3403 3404 /* 3405 * If the hint put us into the wrong metaslab class, or into a 3406 * metaslab group that has been passivated, just follow the rotor. 3407 */ 3408 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3409 mg = mc->mc_rotor; 3410 3411 rotor = mg; 3412 top: 3413 do { 3414 boolean_t allocatable; 3415 3416 ASSERT(mg->mg_activation_count == 1); 3417 vd = mg->mg_vd; 3418 3419 /* 3420 * Don't allocate from faulted devices. 
3421 */ 3422 if (try_hard) { 3423 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3424 allocatable = vdev_allocatable(vd); 3425 spa_config_exit(spa, SCL_ZIO, FTAG); 3426 } else { 3427 allocatable = vdev_allocatable(vd); 3428 } 3429 3430 /* 3431 * Determine if the selected metaslab group is eligible 3432 * for allocations. If we're ganging then don't allow 3433 * this metaslab group to skip allocations since that would 3434 * inadvertently return ENOSPC and suspend the pool 3435 * even though space is still available. 3436 */ 3437 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3438 allocatable = metaslab_group_allocatable(mg, rotor, 3439 psize, allocator); 3440 } 3441 3442 if (!allocatable) { 3443 metaslab_trace_add(zal, mg, NULL, psize, d, 3444 TRACE_NOT_ALLOCATABLE, allocator); 3445 goto next; 3446 } 3447 3448 ASSERT(mg->mg_initialized); 3449 3450 /* 3451 * Avoid writing single-copy data to a failing, 3452 * non-redundant vdev, unless we've already tried all 3453 * other vdevs. 3454 */ 3455 if ((vd->vdev_stat.vs_write_errors > 0 || 3456 vd->vdev_state < VDEV_STATE_HEALTHY) && 3457 d == 0 && !try_hard && vd->vdev_children == 0) { 3458 metaslab_trace_add(zal, mg, NULL, psize, d, 3459 TRACE_VDEV_ERROR, allocator); 3460 goto next; 3461 } 3462 3463 ASSERT(mg->mg_class == mc); 3464 3465 /* 3466 * If we don't need to try hard, then require that the 3467 * block be 1/8th of the device away from any other DVAs 3468 * in this BP. If we are trying hard, allow any offset 3469 * to be used (distance=0). 3470 */ 3471 uint64_t distance = 0; 3472 if (!try_hard) { 3473 distance = vd->vdev_asize >> 3474 ditto_same_vdev_distance_shift; 3475 if (distance <= (1ULL << vd->vdev_ms_shift)) 3476 distance = 0; 3477 } 3478 3479 uint64_t asize = vdev_psize_to_asize(vd, psize); 3480 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3481 3482 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3483 distance, dva, d, allocator); 3484 3485 if (offset != -1ULL) { 3486 /* 3487 * If we've just selected this metaslab group, 3488 * figure out whether the corresponding vdev is 3489 * over- or under-used relative to the pool, 3490 * and set an allocation bias to even it out. 3491 */ 3492 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3493 vdev_stat_t *vs = &vd->vdev_stat; 3494 int64_t vu, cu; 3495 3496 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3497 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3498 3499 /* 3500 * Calculate how much more or less we should 3501 * try to allocate from this device during 3502 * this iteration around the rotor. 3503 * For example, if a device is 80% full 3504 * and the pool is 20% full then we should 3505 * reduce allocations by 60% on this device. 3506 * 3507 * mg_bias = (20 - 80) * 512K / 100 = -307K 3508 * 3509 * This reduces allocations by 307K for this 3510 * iteration. 3511 */ 3512 mg->mg_bias = ((cu - vu) * 3513 (int64_t)mg->mg_aliquot) / 100; 3514 } else if (!metaslab_bias_enabled) { 3515 mg->mg_bias = 0; 3516 } 3517 3518 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3519 mg->mg_aliquot + mg->mg_bias) { 3520 mc->mc_rotor = mg->mg_next; 3521 mc->mc_aliquot = 0; 3522 } 3523 3524 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3525 DVA_SET_OFFSET(&dva[d], offset); 3526 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3527 DVA_SET_ASIZE(&dva[d], asize); 3528 3529 return (0); 3530 } 3531 next: 3532 mc->mc_rotor = mg->mg_next; 3533 mc->mc_aliquot = 0; 3534 } while ((mg = mg->mg_next) != rotor); 3535 3536 /* 3537 * If we haven't tried hard, do so now. 
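 * Trying hard relaxes the constraints applied above: the DVA distance
 * requirement drops to zero, the metaslab group allocatable check is
 * skipped, and single-copy data may be written to a failing,
 * non-redundant vdev.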
	 */
	if (!try_hard) {
		try_hard = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
	return (SET_ERROR(ENOSPC));
}

void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
    boolean_t checkpoint)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));

	metaslab_check_free_impl(vd, offset, asize);

	mutex_enter(&msp->ms_lock);
	if (range_tree_is_empty(msp->ms_freeing) &&
	    range_tree_is_empty(msp->ms_checkpointing)) {
		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
	}

	if (checkpoint) {
		ASSERT(spa_has_checkpoint(spa));
		range_tree_add(msp->ms_checkpointing, offset, asize);
	} else {
		range_tree_add(msp->ms_freeing, offset, asize);
	}
	mutex_exit(&msp->ms_lock);
}

/* ARGSUSED */
void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	boolean_t *checkpoint = arg;

	ASSERT3P(checkpoint, !=, NULL);

	if (vd->vdev_ops->vdev_op_remap != NULL)
		vdev_indirect_mark_obsolete(vd, offset, size);
	else
		metaslab_free_impl(vd, offset, size, *checkpoint);
}

static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
    boolean_t checkpoint)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
		return;

	if (spa->spa_vdev_removal != NULL &&
	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
	    vdev_is_concrete(vd)) {
		/*
		 * Note: we check if the vdev is concrete because when
		 * we complete the removal, we first change the vdev to be
		 * an indirect vdev (in open context), and then (in syncing
		 * context) clear spa_vdev_removal.
		 */
		free_from_removing_vdev(vd, offset, size);
	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
		vdev_indirect_mark_obsolete(vd, offset, size);
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_free_impl_cb, &checkpoint);
	} else {
		metaslab_free_concrete(vd, offset, size, checkpoint);
	}
}

typedef struct remap_blkptr_cb_arg {
	blkptr_t *rbca_bp;
	spa_remap_cb_t rbca_cb;
	vdev_t *rbca_remap_vd;
	uint64_t rbca_remap_offset;
	void *rbca_cb_arg;
} remap_blkptr_cb_arg_t;

void
remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	remap_blkptr_cb_arg_t *rbca = arg;
	blkptr_t *bp = rbca->rbca_bp;

	/*
	 * We cannot remap split blocks.
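	 * (A split block is a DVA whose allocated segment was divided across
	 * more than one entry of the indirect mapping by device removal; the
	 * size check below detects this because the callback is invoked once
	 * per mapping segment.)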
	 */
	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
		return;
	ASSERT0(inner_offset);

	if (rbca->rbca_cb != NULL) {
		/*
		 * At this point we know that we are not handling split
		 * blocks and we invoke the callback on the previous
		 * vdev which must be indirect.
		 */
		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);

		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);

		/* set up remap_blkptr_cb_arg for the next call */
		rbca->rbca_remap_vd = vd;
		rbca->rbca_remap_offset = offset;
	}

	/*
	 * The phys birth time is that of dva[0]. This ensures that we know
	 * when each dva was written, so that resilver can determine which
	 * blocks need to be scrubbed (i.e. those written during the time
	 * the vdev was offline). It also ensures that the key used in
	 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
	 * we didn't change the phys_birth, a lookup in the ARC for a
	 * remapped BP could find the data that was previously stored at
	 * this vdev + offset.
	 */
	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	    DVA_GET_VDEV(&bp->blk_dva[0]));
	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
}

/*
 * If the block pointer contains any indirect DVAs, modify them to refer to
 * concrete DVAs. Note that this will sometimes not be possible, leaving
 * the indirect DVA in place. This happens if the indirect DVA spans multiple
 * segments in the mapping (i.e. it is a "split block").
 *
 * If the BP was remapped, calls the callback on the original dva (note the
 * callback can be called multiple times if the original indirect DVA refers
 * to another indirect DVA, etc).
 *
 * Returns TRUE if the BP was remapped.
 */
boolean_t
spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
{
	remap_blkptr_cb_arg_t rbca;

	if (!zfs_remap_blkptr_enable)
		return (B_FALSE);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
		return (B_FALSE);

	/*
	 * Dedup BPs cannot be remapped, because ddt_phys_select() depends
	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
	 */
	if (BP_GET_DEDUP(bp))
		return (B_FALSE);

	/*
	 * Gang blocks cannot be remapped, because
	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
	 * the BP used to read the gang block header (GBH) being the same
	 * as the DVA[0] that we allocated for the GBH.
	 */
	if (BP_IS_GANG(bp))
		return (B_FALSE);

	/*
	 * Embedded BPs have no DVA to remap.
	 */
	if (BP_GET_NDVAS(bp) < 1)
		return (B_FALSE);

	/*
	 * Note: we only remap dva[0]. If we remapped other dvas, we
	 * would no longer know what their phys birth txg is.
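	 * (A block pointer carries a single phys_birth field, and after
	 * remapping it describes dva[0] only.)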
	 */
	dva_t *dva = &bp->blk_dva[0];

	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));

	if (vd->vdev_ops->vdev_op_remap == NULL)
		return (B_FALSE);

	rbca.rbca_bp = bp;
	rbca.rbca_cb = callback;
	rbca.rbca_remap_vd = vd;
	rbca.rbca_remap_offset = offset;
	rbca.rbca_cb_arg = arg;

	/*
	 * remap_blkptr_cb() will be called in order for each level of
	 * indirection, until a concrete vdev is reached or a split block is
	 * encountered. rbca_remap_vd and rbca_remap_offset are updated within
	 * the callback as we go from one indirect vdev to the next (either
	 * concrete or indirect again) in that order.
	 */
	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);

	/* Check if the DVA wasn't remapped because it is a split block */
	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Undo the allocation of a DVA which happened in the given transaction group.
 */
void
metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	vdev_t *vd;
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_unalloc_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	ASSERT(!vd->vdev_removing);
	ASSERT(vdev_is_concrete(vd));
	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
	    offset, size);

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
	    msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	range_tree_add(msp->ms_allocatable, offset, size);
	mutex_exit(&msp->ms_lock);
}

/*
 * Free the block represented by the given DVA.
 */
void
metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, vdev);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (DVA_GET_GANG(dva)) {
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
	}

	metaslab_free_impl(vd, offset, size, checkpoint);
}

/*
 * Reserve some allocation slots. The reservation system must be called
 * before we call into the allocator.
 * If there aren't any available slots
 * then the I/O will be throttled until an I/O completes and its slots are
 * freed up. The function returns true if it was successful in placing
 * the reservation.
 */
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
    zio_t *zio, int flags)
{
	uint64_t available_slots = 0;
	boolean_t slot_reserved = B_FALSE;
	uint64_t max = mc->mc_alloc_max_slots[allocator];

	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);

	uint64_t reserved_slots =
	    zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
	if (reserved_slots < max)
		available_slots = max - reserved_slots;

	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
		/*
		 * We reserve the slots individually so that we can unreserve
		 * them individually when an I/O completes.
		 */
		for (int d = 0; d < slots; d++) {
			reserved_slots =
			    zfs_refcount_add(&mc->mc_alloc_slots[allocator],
			    zio);
		}
		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
		slot_reserved = B_TRUE;
	}

	mutex_exit(&mc->mc_lock);
	return (slot_reserved);
}

void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
    int allocator, zio_t *zio)
{
	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);
	for (int d = 0; d < slots; d++) {
		(void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
		    zio);
	}
	mutex_exit(&mc->mc_lock);
}

static int
metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
		return (ENXIO);

	ASSERT3P(vd->vdev_ms, !=, NULL);
	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
	/*
	 * No need to fail in that case; someone else has activated the
	 * metaslab, but that doesn't preclude us from using it.
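	 * (metaslab_activate() returns EBUSY when another thread has already
	 * activated this metaslab.)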
	 */
	if (error == EBUSY)
		error = 0;

	if (error == 0 &&
	    !range_tree_contains(msp->ms_allocatable, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	VERIFY(!msp->ms_condensing);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
	    msp->ms_size);
	range_tree_remove(msp->ms_allocatable, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
		    offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

typedef struct metaslab_claim_cb_arg_t {
	uint64_t mcca_txg;
	int mcca_error;
} metaslab_claim_cb_arg_t;

/* ARGSUSED */
static void
metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	metaslab_claim_cb_arg_t *mcca_arg = arg;

	if (mcca_arg->mcca_error == 0) {
		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
		    size, mcca_arg->mcca_txg);
	}
}

int
metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
{
	if (vd->vdev_ops->vdev_op_remap != NULL) {
		metaslab_claim_cb_arg_t arg;

		/*
		 * Only zdb(1M) can claim on indirect vdevs. This is used
		 * to detect leaks of mapped space (that are not accounted
		 * for in the obsolete counts, spacemap, or bpobj).
		 */
		ASSERT(!spa_writeable(vd->vdev_spa));
		arg.mcca_error = 0;
		arg.mcca_txg = txg;

		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_claim_impl_cb, &arg);

		if (arg.mcca_error == 0) {
			arg.mcca_error = metaslab_claim_concrete(vd,
			    offset, size, txg);
		}
		return (arg.mcca_error);
	} else {
		return (metaslab_claim_concrete(vd, offset, size, txg));
	}
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
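 * Claiming such a block removes it from ms_allocatable again, so it cannot
 * be handed out a second time while the intent log is replayed.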
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(DVA_IS_VALID(dva));

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	return (metaslab_claim_impl(vd, offset, size, txg));
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
    zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
	ASSERT3P(zal, !=, NULL);

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags, zal, allocator);
		if (error != 0) {
			for (d--; d >= 0; d--) {
				metaslab_unalloc_dva(spa, &dva[d], txg);
				metaslab_group_alloc_decrement(spa,
				    DVA_GET_VDEV(&dva[d]), zio, flags,
				    allocator, B_FALSE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		} else {
			/*
			 * Update the metaslab group's queue depth
			 * based on the newly allocated dva.
			 */
			metaslab_group_alloc_increment(spa,
			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	/*
	 * If we have a checkpoint for the pool we need to make sure that
	 * the blocks that we free that are part of the checkpoint won't be
	 * reused until the checkpoint is discarded or we revert to it.
	 *
	 * The checkpoint flag is passed down the metaslab_free code path
	 * and is set whenever we want to add a block to the checkpoint's
	 * accounting. That is, we "checkpoint" blocks that existed at the
	 * time the checkpoint was created and are therefore referenced by
	 * the checkpointed uberblock.
	 *
	 * Note that we don't checkpoint any blocks if the current
	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
	 * normally as they will be referenced by the checkpointed uberblock.
	 */
	boolean_t checkpoint = B_FALSE;
	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
		/*
		 * At this point, if the block is part of the checkpoint
		 * there is no way it was created in the current txg.
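		 * Such frees are therefore added to ms_checkpointing further
		 * down the metaslab_free_dva() path, rather than being made
		 * available for reuse.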
		 */
		ASSERT(!now);
		ASSERT3U(spa_syncing_txg(spa), ==, txg);
		checkpoint = B_TRUE;
	}

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++) {
		if (now) {
			metaslab_unalloc_dva(spa, &dva[d], txg);
		} else {
			ASSERT3U(txg, ==, spa_syncing_txg(spa));
			metaslab_free_dva(spa, &dva[d], checkpoint);
		}
	}

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}

/* ARGSUSED */
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	metaslab_check_free_impl(vd, offset, size);
}

static void
metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	if (vd->vdev_ops->vdev_op_remap != NULL) {
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_check_free_impl_cb, NULL);
		return;
	}

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	if (msp->ms_loaded)
		range_tree_verify(msp->ms_allocatable, offset, size);

	range_tree_verify(msp->ms_freeing, offset, size);
	range_tree_verify(msp->ms_checkpointing, offset, size);
	range_tree_verify(msp->ms_freed, offset, size);
	for (int j = 0; j < TXG_DEFER_SIZE; j++)
		range_tree_verify(msp->ms_defer[j], offset, size);
	mutex_exit(&msp->ms_lock);
}

void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);

		if (DVA_GET_GANG(&bp->blk_dva[i]))
			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

		ASSERT3P(vd, !=, NULL);

		metaslab_check_free_impl(vd, offset, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
}