1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/space_map.h> 32 #include <sys/metaslab_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio.h> 35 #include <sys/spa_impl.h> 36 #include <sys/zfeature.h> 37 #include <sys/vdev_indirect_mapping.h> 38 #include <sys/zap.h> 39 40 #define GANG_ALLOCATION(flags) \ 41 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 42 43 uint64_t metaslab_aliquot = 512ULL << 10; 44 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 45 46 /* 47 * Since we can touch multiple metaslabs (and their respective space maps) 48 * with each transaction group, we benefit from having a smaller space map 49 * block size since it allows us to issue more I/O operations scattered 50 * around the disk. 51 */ 52 int zfs_metaslab_sm_blksz = (1 << 12); 53 54 /* 55 * The in-core space map representation is more compact than its on-disk form. 56 * The zfs_condense_pct determines how much more compact the in-core 57 * space map representation must be before we compact it on-disk. 58 * Values should be greater than or equal to 100. 59 */ 60 int zfs_condense_pct = 200; 61 62 /* 63 * Condensing a metaslab is not guaranteed to actually reduce the amount of 64 * space used on disk. In particular, a space map uses data in increments of 65 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 66 * same number of blocks after condensing. Since the goal of condensing is to 67 * reduce the number of IOPs required to read the space map, we only want to 68 * condense when we can be sure we will reduce the number of blocks used by the 69 * space map. Unfortunately, we cannot precisely compute whether or not this is 70 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 71 * we apply the following heuristic: do not condense a spacemap unless the 72 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 73 * blocks. 74 */ 75 int zfs_metaslab_condense_block_threshold = 4; 76 77 /* 78 * The zfs_mg_noalloc_threshold defines which metaslab groups should 79 * be eligible for allocation. The value is defined as a percentage of 80 * free space. Metaslab groups that have more free space than 81 * zfs_mg_noalloc_threshold are always eligible for allocations. 
Once 82 * a metaslab group's free space is less than or equal to the 83 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 84 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 85 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 86 * groups are allowed to accept allocations. Gang blocks are always 87 * eligible to allocate on any metaslab group. The default value of 0 means 88 * no metaslab group will be excluded based on this criterion. 89 */ 90 int zfs_mg_noalloc_threshold = 0; 91 92 /* 93 * Metaslab groups are considered eligible for allocations if their 94 * fragmenation metric (measured as a percentage) is less than or equal to 95 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 96 * then it will be skipped unless all metaslab groups within the metaslab 97 * class have also crossed this threshold. 98 */ 99 int zfs_mg_fragmentation_threshold = 85; 100 101 /* 102 * Allow metaslabs to keep their active state as long as their fragmentation 103 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 104 * active metaslab that exceeds this threshold will no longer keep its active 105 * status allowing better metaslabs to be selected. 106 */ 107 int zfs_metaslab_fragmentation_threshold = 70; 108 109 /* 110 * When set will load all metaslabs when pool is first opened. 111 */ 112 int metaslab_debug_load = 0; 113 114 /* 115 * When set will prevent metaslabs from being unloaded. 116 */ 117 int metaslab_debug_unload = 0; 118 119 /* 120 * Minimum size which forces the dynamic allocator to change 121 * it's allocation strategy. Once the space map cannot satisfy 122 * an allocation of this size then it switches to using more 123 * aggressive strategy (i.e search by size rather than offset). 124 */ 125 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 126 127 /* 128 * The minimum free space, in percent, which must be available 129 * in a space map to continue allocations in a first-fit fashion. 130 * Once the space map's free space drops below this level we dynamically 131 * switch to using best-fit allocations. 132 */ 133 int metaslab_df_free_pct = 4; 134 135 /* 136 * A metaslab is considered "free" if it contains a contiguous 137 * segment which is greater than metaslab_min_alloc_size. 138 */ 139 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 140 141 /* 142 * Percentage of all cpus that can be used by the metaslab taskq. 143 */ 144 int metaslab_load_pct = 50; 145 146 /* 147 * Determines how many txgs a metaslab may remain loaded without having any 148 * allocations from it. As long as a metaslab continues to be used we will 149 * keep it loaded. 150 */ 151 int metaslab_unload_delay = TXG_SIZE * 2; 152 153 /* 154 * Max number of metaslabs per group to preload. 155 */ 156 int metaslab_preload_limit = SPA_DVAS_PER_BP; 157 158 /* 159 * Enable/disable preloading of metaslab. 160 */ 161 boolean_t metaslab_preload_enabled = B_TRUE; 162 163 /* 164 * Enable/disable fragmentation weighting on metaslabs. 165 */ 166 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 167 168 /* 169 * Enable/disable lba weighting (i.e. outer tracks are given preference). 170 */ 171 boolean_t metaslab_lba_weighting_enabled = B_TRUE; 172 173 /* 174 * Enable/disable metaslab group biasing. 175 */ 176 boolean_t metaslab_bias_enabled = B_TRUE; 177 178 /* 179 * Enable/disable remapping of indirect DVAs to their concrete vdevs. 
180 */ 181 boolean_t zfs_remap_blkptr_enable = B_TRUE; 182 183 /* 184 * Enable/disable segment-based metaslab selection. 185 */ 186 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; 187 188 /* 189 * When using segment-based metaslab selection, we will continue 190 * allocating from the active metaslab until we have exhausted 191 * zfs_metaslab_switch_threshold of its buckets. 192 */ 193 int zfs_metaslab_switch_threshold = 2; 194 195 /* 196 * Internal switch to enable/disable the metaslab allocation tracing 197 * facility. 198 */ 199 boolean_t metaslab_trace_enabled = B_TRUE; 200 201 /* 202 * Maximum entries that the metaslab allocation tracing facility will keep 203 * in a given list when running in non-debug mode. We limit the number 204 * of entries in non-debug mode to prevent us from using up too much memory. 205 * The limit should be sufficiently large that we don't expect any allocation 206 * to every exceed this value. In debug mode, the system will panic if this 207 * limit is ever reached allowing for further investigation. 208 */ 209 uint64_t metaslab_trace_max_entries = 5000; 210 211 static uint64_t metaslab_weight(metaslab_t *); 212 static void metaslab_set_fragmentation(metaslab_t *); 213 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 214 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 215 static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 216 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); 217 218 kmem_cache_t *metaslab_alloc_trace_cache; 219 220 /* 221 * ========================================================================== 222 * Metaslab classes 223 * ========================================================================== 224 */ 225 metaslab_class_t * 226 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 227 { 228 metaslab_class_t *mc; 229 230 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 231 232 mc->mc_spa = spa; 233 mc->mc_rotor = NULL; 234 mc->mc_ops = ops; 235 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 236 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * 237 sizeof (refcount_t), KM_SLEEP); 238 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * 239 sizeof (uint64_t), KM_SLEEP); 240 for (int i = 0; i < spa->spa_alloc_count; i++) 241 refcount_create_tracked(&mc->mc_alloc_slots[i]); 242 243 return (mc); 244 } 245 246 void 247 metaslab_class_destroy(metaslab_class_t *mc) 248 { 249 ASSERT(mc->mc_rotor == NULL); 250 ASSERT(mc->mc_alloc == 0); 251 ASSERT(mc->mc_deferred == 0); 252 ASSERT(mc->mc_space == 0); 253 ASSERT(mc->mc_dspace == 0); 254 255 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) 256 refcount_destroy(&mc->mc_alloc_slots[i]); 257 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * 258 sizeof (refcount_t)); 259 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * 260 sizeof (uint64_t)); 261 mutex_destroy(&mc->mc_lock); 262 kmem_free(mc, sizeof (metaslab_class_t)); 263 } 264 265 int 266 metaslab_class_validate(metaslab_class_t *mc) 267 { 268 metaslab_group_t *mg; 269 vdev_t *vd; 270 271 /* 272 * Must hold one of the spa_config locks. 
273 */ 274 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 275 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 276 277 if ((mg = mc->mc_rotor) == NULL) 278 return (0); 279 280 do { 281 vd = mg->mg_vd; 282 ASSERT(vd->vdev_mg != NULL); 283 ASSERT3P(vd->vdev_top, ==, vd); 284 ASSERT3P(mg->mg_class, ==, mc); 285 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 286 } while ((mg = mg->mg_next) != mc->mc_rotor); 287 288 return (0); 289 } 290 291 void 292 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 293 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 294 { 295 atomic_add_64(&mc->mc_alloc, alloc_delta); 296 atomic_add_64(&mc->mc_deferred, defer_delta); 297 atomic_add_64(&mc->mc_space, space_delta); 298 atomic_add_64(&mc->mc_dspace, dspace_delta); 299 } 300 301 uint64_t 302 metaslab_class_get_alloc(metaslab_class_t *mc) 303 { 304 return (mc->mc_alloc); 305 } 306 307 uint64_t 308 metaslab_class_get_deferred(metaslab_class_t *mc) 309 { 310 return (mc->mc_deferred); 311 } 312 313 uint64_t 314 metaslab_class_get_space(metaslab_class_t *mc) 315 { 316 return (mc->mc_space); 317 } 318 319 uint64_t 320 metaslab_class_get_dspace(metaslab_class_t *mc) 321 { 322 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 323 } 324 325 void 326 metaslab_class_histogram_verify(metaslab_class_t *mc) 327 { 328 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 329 uint64_t *mc_hist; 330 int i; 331 332 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 333 return; 334 335 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 336 KM_SLEEP); 337 338 for (int c = 0; c < rvd->vdev_children; c++) { 339 vdev_t *tvd = rvd->vdev_child[c]; 340 metaslab_group_t *mg = tvd->vdev_mg; 341 342 /* 343 * Skip any holes, uninitialized top-levels, or 344 * vdevs that are not in this metalab class. 345 */ 346 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 347 mg->mg_class != mc) { 348 continue; 349 } 350 351 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 352 mc_hist[i] += mg->mg_histogram[i]; 353 } 354 355 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 356 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 357 358 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 359 } 360 361 /* 362 * Calculate the metaslab class's fragmentation metric. The metric 363 * is weighted based on the space contribution of each metaslab group. 364 * The return value will be a number between 0 and 100 (inclusive), or 365 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 366 * zfs_frag_table for more information about the metric. 367 */ 368 uint64_t 369 metaslab_class_fragmentation(metaslab_class_t *mc) 370 { 371 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 372 uint64_t fragmentation = 0; 373 374 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 375 376 for (int c = 0; c < rvd->vdev_children; c++) { 377 vdev_t *tvd = rvd->vdev_child[c]; 378 metaslab_group_t *mg = tvd->vdev_mg; 379 380 /* 381 * Skip any holes, uninitialized top-levels, 382 * or vdevs that are not in this metalab class. 383 */ 384 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 385 mg->mg_class != mc) { 386 continue; 387 } 388 389 /* 390 * If a metaslab group does not contain a fragmentation 391 * metric then just bail out. 
392 */ 393 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 394 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 395 return (ZFS_FRAG_INVALID); 396 } 397 398 /* 399 * Determine how much this metaslab_group is contributing 400 * to the overall pool fragmentation metric. 401 */ 402 fragmentation += mg->mg_fragmentation * 403 metaslab_group_get_space(mg); 404 } 405 fragmentation /= metaslab_class_get_space(mc); 406 407 ASSERT3U(fragmentation, <=, 100); 408 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 409 return (fragmentation); 410 } 411 412 /* 413 * Calculate the amount of expandable space that is available in 414 * this metaslab class. If a device is expanded then its expandable 415 * space will be the amount of allocatable space that is currently not 416 * part of this metaslab class. 417 */ 418 uint64_t 419 metaslab_class_expandable_space(metaslab_class_t *mc) 420 { 421 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 422 uint64_t space = 0; 423 424 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 425 for (int c = 0; c < rvd->vdev_children; c++) { 426 uint64_t tspace; 427 vdev_t *tvd = rvd->vdev_child[c]; 428 metaslab_group_t *mg = tvd->vdev_mg; 429 430 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 431 mg->mg_class != mc) { 432 continue; 433 } 434 435 /* 436 * Calculate if we have enough space to add additional 437 * metaslabs. We report the expandable space in terms 438 * of the metaslab size since that's the unit of expansion. 439 * Adjust by efi system partition size. 440 */ 441 tspace = tvd->vdev_max_asize - tvd->vdev_asize; 442 if (tspace > mc->mc_spa->spa_bootsize) { 443 tspace -= mc->mc_spa->spa_bootsize; 444 } 445 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); 446 } 447 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 448 return (space); 449 } 450 451 static int 452 metaslab_compare(const void *x1, const void *x2) 453 { 454 const metaslab_t *m1 = x1; 455 const metaslab_t *m2 = x2; 456 457 int sort1 = 0; 458 int sort2 = 0; 459 if (m1->ms_allocator != -1 && m1->ms_primary) 460 sort1 = 1; 461 else if (m1->ms_allocator != -1 && !m1->ms_primary) 462 sort1 = 2; 463 if (m2->ms_allocator != -1 && m2->ms_primary) 464 sort2 = 1; 465 else if (m2->ms_allocator != -1 && !m2->ms_primary) 466 sort2 = 2; 467 468 /* 469 * Sort inactive metaslabs first, then primaries, then secondaries. When 470 * selecting a metaslab to allocate from, an allocator first tries its 471 * primary, then secondary active metaslab. If it doesn't have active 472 * metaslabs, or can't allocate from them, it searches for an inactive 473 * metaslab to activate. If it can't find a suitable one, it will steal 474 * a primary or secondary metaslab from another allocator. 475 */ 476 if (sort1 < sort2) 477 return (-1); 478 if (sort1 > sort2) 479 return (1); 480 481 if (m1->ms_weight < m2->ms_weight) 482 return (1); 483 if (m1->ms_weight > m2->ms_weight) 484 return (-1); 485 486 /* 487 * If the weights are identical, use the offset to force uniqueness. 488 */ 489 if (m1->ms_start < m2->ms_start) 490 return (-1); 491 if (m1->ms_start > m2->ms_start) 492 return (1); 493 494 ASSERT3P(m1, ==, m2); 495 496 return (0); 497 } 498 499 /* 500 * Verify that the space accounting on disk matches the in-core range_trees. 
501 */ 502 void 503 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 504 { 505 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 506 uint64_t allocated = 0; 507 uint64_t sm_free_space, msp_free_space; 508 509 ASSERT(MUTEX_HELD(&msp->ms_lock)); 510 511 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 512 return; 513 514 /* 515 * We can only verify the metaslab space when we're called 516 * from syncing context with a loaded metaslab that has an allocated 517 * space map. Calling this in non-syncing context does not 518 * provide a consistent view of the metaslab since we're performing 519 * allocations in the future. 520 */ 521 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 522 !msp->ms_loaded) 523 return; 524 525 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - 526 space_map_alloc_delta(msp->ms_sm); 527 528 /* 529 * Account for future allocations since we would have already 530 * deducted that space from the ms_freetree. 531 */ 532 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 533 allocated += 534 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 535 } 536 537 msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + 538 msp->ms_deferspace + range_tree_space(msp->ms_freed); 539 540 VERIFY3U(sm_free_space, ==, msp_free_space); 541 } 542 543 /* 544 * ========================================================================== 545 * Metaslab groups 546 * ========================================================================== 547 */ 548 /* 549 * Update the allocatable flag and the metaslab group's capacity. 550 * The allocatable flag is set to true if the capacity is below 551 * the zfs_mg_noalloc_threshold or has a fragmentation value that is 552 * greater than zfs_mg_fragmentation_threshold. If a metaslab group 553 * transitions from allocatable to non-allocatable or vice versa then the 554 * metaslab group's class is updated to reflect the transition. 555 */ 556 static void 557 metaslab_group_alloc_update(metaslab_group_t *mg) 558 { 559 vdev_t *vd = mg->mg_vd; 560 metaslab_class_t *mc = mg->mg_class; 561 vdev_stat_t *vs = &vd->vdev_stat; 562 boolean_t was_allocatable; 563 boolean_t was_initialized; 564 565 ASSERT(vd == vd->vdev_top); 566 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, 567 SCL_ALLOC); 568 569 mutex_enter(&mg->mg_lock); 570 was_allocatable = mg->mg_allocatable; 571 was_initialized = mg->mg_initialized; 572 573 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 574 (vs->vs_space + 1); 575 576 mutex_enter(&mc->mc_lock); 577 578 /* 579 * If the metaslab group was just added then it won't 580 * have any space until we finish syncing out this txg. 581 * At that point we will consider it initialized and available 582 * for allocations. We also don't consider non-activated 583 * metaslab groups (e.g. vdevs that are in the middle of being removed) 584 * to be initialized, because they can't be used for allocation. 585 */ 586 mg->mg_initialized = metaslab_group_initialized(mg); 587 if (!was_initialized && mg->mg_initialized) { 588 mc->mc_groups++; 589 } else if (was_initialized && !mg->mg_initialized) { 590 ASSERT3U(mc->mc_groups, >, 0); 591 mc->mc_groups--; 592 } 593 if (mg->mg_initialized) 594 mg->mg_no_free_space = B_FALSE; 595 596 /* 597 * A metaslab group is considered allocatable if it has plenty 598 * of free space or is not heavily fragmented. We only take 599 * fragmentation into account if the metaslab group has a valid 600 * fragmentation metric (i.e. a value between 0 and 100). 
601 */ 602 mg->mg_allocatable = (mg->mg_activation_count > 0 && 603 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 604 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 605 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 606 607 /* 608 * The mc_alloc_groups maintains a count of the number of 609 * groups in this metaslab class that are still above the 610 * zfs_mg_noalloc_threshold. This is used by the allocating 611 * threads to determine if they should avoid allocations to 612 * a given group. The allocator will avoid allocations to a group 613 * if that group has reached or is below the zfs_mg_noalloc_threshold 614 * and there are still other groups that are above the threshold. 615 * When a group transitions from allocatable to non-allocatable or 616 * vice versa we update the metaslab class to reflect that change. 617 * When the mc_alloc_groups value drops to 0 that means that all 618 * groups have reached the zfs_mg_noalloc_threshold making all groups 619 * eligible for allocations. This effectively means that all devices 620 * are balanced again. 621 */ 622 if (was_allocatable && !mg->mg_allocatable) 623 mc->mc_alloc_groups--; 624 else if (!was_allocatable && mg->mg_allocatable) 625 mc->mc_alloc_groups++; 626 mutex_exit(&mc->mc_lock); 627 628 mutex_exit(&mg->mg_lock); 629 } 630 631 metaslab_group_t * 632 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 633 { 634 metaslab_group_t *mg; 635 636 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 637 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 638 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 639 KM_SLEEP); 640 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 641 KM_SLEEP); 642 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 643 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 644 mg->mg_vd = vd; 645 mg->mg_class = mc; 646 mg->mg_activation_count = 0; 647 mg->mg_initialized = B_FALSE; 648 mg->mg_no_free_space = B_TRUE; 649 mg->mg_allocators = allocators; 650 651 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), 652 KM_SLEEP); 653 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * 654 sizeof (uint64_t), KM_SLEEP); 655 for (int i = 0; i < allocators; i++) { 656 refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); 657 mg->mg_cur_max_alloc_queue_depth[i] = 0; 658 } 659 660 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 661 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 662 663 return (mg); 664 } 665 666 void 667 metaslab_group_destroy(metaslab_group_t *mg) 668 { 669 ASSERT(mg->mg_prev == NULL); 670 ASSERT(mg->mg_next == NULL); 671 /* 672 * We may have gone below zero with the activation count 673 * either because we never activated in the first place or 674 * because we're done, and possibly removing the vdev. 
675 */ 676 ASSERT(mg->mg_activation_count <= 0); 677 678 taskq_destroy(mg->mg_taskq); 679 avl_destroy(&mg->mg_metaslab_tree); 680 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); 681 kmem_free(mg->mg_secondaries, mg->mg_allocators * 682 sizeof (metaslab_t *)); 683 mutex_destroy(&mg->mg_lock); 684 685 for (int i = 0; i < mg->mg_allocators; i++) { 686 refcount_destroy(&mg->mg_alloc_queue_depth[i]); 687 mg->mg_cur_max_alloc_queue_depth[i] = 0; 688 } 689 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * 690 sizeof (refcount_t)); 691 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * 692 sizeof (uint64_t)); 693 694 kmem_free(mg, sizeof (metaslab_group_t)); 695 } 696 697 void 698 metaslab_group_activate(metaslab_group_t *mg) 699 { 700 metaslab_class_t *mc = mg->mg_class; 701 metaslab_group_t *mgprev, *mgnext; 702 703 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 704 705 ASSERT(mc->mc_rotor != mg); 706 ASSERT(mg->mg_prev == NULL); 707 ASSERT(mg->mg_next == NULL); 708 ASSERT(mg->mg_activation_count <= 0); 709 710 if (++mg->mg_activation_count <= 0) 711 return; 712 713 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 714 metaslab_group_alloc_update(mg); 715 716 if ((mgprev = mc->mc_rotor) == NULL) { 717 mg->mg_prev = mg; 718 mg->mg_next = mg; 719 } else { 720 mgnext = mgprev->mg_next; 721 mg->mg_prev = mgprev; 722 mg->mg_next = mgnext; 723 mgprev->mg_next = mg; 724 mgnext->mg_prev = mg; 725 } 726 mc->mc_rotor = mg; 727 } 728 729 /* 730 * Passivate a metaslab group and remove it from the allocation rotor. 731 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 732 * a metaslab group. This function will momentarily drop spa_config_locks 733 * that are lower than the SCL_ALLOC lock (see comment below). 734 */ 735 void 736 metaslab_group_passivate(metaslab_group_t *mg) 737 { 738 metaslab_class_t *mc = mg->mg_class; 739 spa_t *spa = mc->mc_spa; 740 metaslab_group_t *mgprev, *mgnext; 741 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 742 743 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 744 (SCL_ALLOC | SCL_ZIO)); 745 746 if (--mg->mg_activation_count != 0) { 747 ASSERT(mc->mc_rotor != mg); 748 ASSERT(mg->mg_prev == NULL); 749 ASSERT(mg->mg_next == NULL); 750 ASSERT(mg->mg_activation_count < 0); 751 return; 752 } 753 754 /* 755 * The spa_config_lock is an array of rwlocks, ordered as 756 * follows (from highest to lowest): 757 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 758 * SCL_ZIO > SCL_FREE > SCL_VDEV 759 * (For more information about the spa_config_lock see spa_misc.c) 760 * The higher the lock, the broader its coverage. When we passivate 761 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 762 * config locks. However, the metaslab group's taskq might be trying 763 * to preload metaslabs so we must drop the SCL_ZIO lock and any 764 * lower locks to allow the I/O to complete. At a minimum, 765 * we continue to hold the SCL_ALLOC lock, which prevents any future 766 * allocations from taking place and any changes to the vdev tree. 
767 */ 768 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 769 taskq_wait(mg->mg_taskq); 770 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 771 metaslab_group_alloc_update(mg); 772 for (int i = 0; i < mg->mg_allocators; i++) { 773 metaslab_t *msp = mg->mg_primaries[i]; 774 if (msp != NULL) { 775 mutex_enter(&msp->ms_lock); 776 metaslab_passivate(msp, 777 metaslab_weight_from_range_tree(msp)); 778 mutex_exit(&msp->ms_lock); 779 } 780 msp = mg->mg_secondaries[i]; 781 if (msp != NULL) { 782 mutex_enter(&msp->ms_lock); 783 metaslab_passivate(msp, 784 metaslab_weight_from_range_tree(msp)); 785 mutex_exit(&msp->ms_lock); 786 } 787 } 788 789 mgprev = mg->mg_prev; 790 mgnext = mg->mg_next; 791 792 if (mg == mgnext) { 793 mc->mc_rotor = NULL; 794 } else { 795 mc->mc_rotor = mgnext; 796 mgprev->mg_next = mgnext; 797 mgnext->mg_prev = mgprev; 798 } 799 800 mg->mg_prev = NULL; 801 mg->mg_next = NULL; 802 } 803 804 boolean_t 805 metaslab_group_initialized(metaslab_group_t *mg) 806 { 807 vdev_t *vd = mg->mg_vd; 808 vdev_stat_t *vs = &vd->vdev_stat; 809 810 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 811 } 812 813 uint64_t 814 metaslab_group_get_space(metaslab_group_t *mg) 815 { 816 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 817 } 818 819 void 820 metaslab_group_histogram_verify(metaslab_group_t *mg) 821 { 822 uint64_t *mg_hist; 823 vdev_t *vd = mg->mg_vd; 824 uint64_t ashift = vd->vdev_ashift; 825 int i; 826 827 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 828 return; 829 830 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 831 KM_SLEEP); 832 833 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 834 SPACE_MAP_HISTOGRAM_SIZE + ashift); 835 836 for (int m = 0; m < vd->vdev_ms_count; m++) { 837 metaslab_t *msp = vd->vdev_ms[m]; 838 839 if (msp->ms_sm == NULL) 840 continue; 841 842 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 843 mg_hist[i + ashift] += 844 msp->ms_sm->sm_phys->smp_histogram[i]; 845 } 846 847 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 848 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 849 850 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 851 } 852 853 static void 854 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 855 { 856 metaslab_class_t *mc = mg->mg_class; 857 uint64_t ashift = mg->mg_vd->vdev_ashift; 858 859 ASSERT(MUTEX_HELD(&msp->ms_lock)); 860 if (msp->ms_sm == NULL) 861 return; 862 863 mutex_enter(&mg->mg_lock); 864 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 865 mg->mg_histogram[i + ashift] += 866 msp->ms_sm->sm_phys->smp_histogram[i]; 867 mc->mc_histogram[i + ashift] += 868 msp->ms_sm->sm_phys->smp_histogram[i]; 869 } 870 mutex_exit(&mg->mg_lock); 871 } 872 873 void 874 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 875 { 876 metaslab_class_t *mc = mg->mg_class; 877 uint64_t ashift = mg->mg_vd->vdev_ashift; 878 879 ASSERT(MUTEX_HELD(&msp->ms_lock)); 880 if (msp->ms_sm == NULL) 881 return; 882 883 mutex_enter(&mg->mg_lock); 884 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 885 ASSERT3U(mg->mg_histogram[i + ashift], >=, 886 msp->ms_sm->sm_phys->smp_histogram[i]); 887 ASSERT3U(mc->mc_histogram[i + ashift], >=, 888 msp->ms_sm->sm_phys->smp_histogram[i]); 889 890 mg->mg_histogram[i + ashift] -= 891 msp->ms_sm->sm_phys->smp_histogram[i]; 892 mc->mc_histogram[i + ashift] -= 893 msp->ms_sm->sm_phys->smp_histogram[i]; 894 } 895 mutex_exit(&mg->mg_lock); 896 } 897 898 static void 899 metaslab_group_add(metaslab_group_t *mg, 
metaslab_t *msp) 900 { 901 ASSERT(msp->ms_group == NULL); 902 mutex_enter(&mg->mg_lock); 903 msp->ms_group = mg; 904 msp->ms_weight = 0; 905 avl_add(&mg->mg_metaslab_tree, msp); 906 mutex_exit(&mg->mg_lock); 907 908 mutex_enter(&msp->ms_lock); 909 metaslab_group_histogram_add(mg, msp); 910 mutex_exit(&msp->ms_lock); 911 } 912 913 static void 914 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 915 { 916 mutex_enter(&msp->ms_lock); 917 metaslab_group_histogram_remove(mg, msp); 918 mutex_exit(&msp->ms_lock); 919 920 mutex_enter(&mg->mg_lock); 921 ASSERT(msp->ms_group == mg); 922 avl_remove(&mg->mg_metaslab_tree, msp); 923 msp->ms_group = NULL; 924 mutex_exit(&mg->mg_lock); 925 } 926 927 static void 928 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 929 { 930 ASSERT(MUTEX_HELD(&mg->mg_lock)); 931 ASSERT(msp->ms_group == mg); 932 avl_remove(&mg->mg_metaslab_tree, msp); 933 msp->ms_weight = weight; 934 avl_add(&mg->mg_metaslab_tree, msp); 935 936 } 937 938 static void 939 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 940 { 941 /* 942 * Although in principle the weight can be any value, in 943 * practice we do not use values in the range [1, 511]. 944 */ 945 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 946 ASSERT(MUTEX_HELD(&msp->ms_lock)); 947 948 mutex_enter(&mg->mg_lock); 949 metaslab_group_sort_impl(mg, msp, weight); 950 mutex_exit(&mg->mg_lock); 951 } 952 953 /* 954 * Calculate the fragmentation for a given metaslab group. We can use 955 * a simple average here since all metaslabs within the group must have 956 * the same size. The return value will be a value between 0 and 100 957 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 958 * group have a fragmentation metric. 959 */ 960 uint64_t 961 metaslab_group_fragmentation(metaslab_group_t *mg) 962 { 963 vdev_t *vd = mg->mg_vd; 964 uint64_t fragmentation = 0; 965 uint64_t valid_ms = 0; 966 967 for (int m = 0; m < vd->vdev_ms_count; m++) { 968 metaslab_t *msp = vd->vdev_ms[m]; 969 970 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 971 continue; 972 973 valid_ms++; 974 fragmentation += msp->ms_fragmentation; 975 } 976 977 if (valid_ms <= vd->vdev_ms_count / 2) 978 return (ZFS_FRAG_INVALID); 979 980 fragmentation /= valid_ms; 981 ASSERT3U(fragmentation, <=, 100); 982 return (fragmentation); 983 } 984 985 /* 986 * Determine if a given metaslab group should skip allocations. A metaslab 987 * group should avoid allocations if its free capacity is less than the 988 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 989 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 990 * that can still handle allocations. If the allocation throttle is enabled 991 * then we skip allocations to devices that have reached their maximum 992 * allocation queue depth unless the selected metaslab group is the only 993 * eligible group remaining. 994 */ 995 static boolean_t 996 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 997 uint64_t psize, int allocator) 998 { 999 spa_t *spa = mg->mg_vd->vdev_spa; 1000 metaslab_class_t *mc = mg->mg_class; 1001 1002 /* 1003 * We can only consider skipping this metaslab group if it's 1004 * in the normal metaslab class and there are other metaslab 1005 * groups to select from. Otherwise, we always consider it eligible 1006 * for allocations. 
1007 */ 1008 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) 1009 return (B_TRUE); 1010 1011 /* 1012 * If the metaslab group's mg_allocatable flag is set (see comments 1013 * in metaslab_group_alloc_update() for more information) and 1014 * the allocation throttle is disabled then allow allocations to this 1015 * device. However, if the allocation throttle is enabled then 1016 * check if we have reached our allocation limit (mg_alloc_queue_depth) 1017 * to determine if we should allow allocations to this metaslab group. 1018 * If all metaslab groups are no longer considered allocatable 1019 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 1020 * gang block size then we allow allocations on this metaslab group 1021 * regardless of the mg_allocatable or throttle settings. 1022 */ 1023 if (mg->mg_allocatable) { 1024 metaslab_group_t *mgp; 1025 int64_t qdepth; 1026 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; 1027 1028 if (!mc->mc_alloc_throttle_enabled) 1029 return (B_TRUE); 1030 1031 /* 1032 * If this metaslab group does not have any free space, then 1033 * there is no point in looking further. 1034 */ 1035 if (mg->mg_no_free_space) 1036 return (B_FALSE); 1037 1038 qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]); 1039 1040 /* 1041 * If this metaslab group is below its qmax or it's 1042 * the only allocatable metasable group, then attempt 1043 * to allocate from it. 1044 */ 1045 if (qdepth < qmax || mc->mc_alloc_groups == 1) 1046 return (B_TRUE); 1047 ASSERT3U(mc->mc_alloc_groups, >, 1); 1048 1049 /* 1050 * Since this metaslab group is at or over its qmax, we 1051 * need to determine if there are metaslab groups after this 1052 * one that might be able to handle this allocation. This is 1053 * racy since we can't hold the locks for all metaslab 1054 * groups at the same time when we make this check. 1055 */ 1056 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { 1057 qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; 1058 1059 qdepth = refcount_count( 1060 &mgp->mg_alloc_queue_depth[allocator]); 1061 1062 /* 1063 * If there is another metaslab group that 1064 * might be able to handle the allocation, then 1065 * we return false so that we skip this group. 1066 */ 1067 if (qdepth < qmax && !mgp->mg_no_free_space) 1068 return (B_FALSE); 1069 } 1070 1071 /* 1072 * We didn't find another group to handle the allocation 1073 * so we can't skip this metaslab group even though 1074 * we are at or over our qmax. 1075 */ 1076 return (B_TRUE); 1077 1078 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 1079 return (B_TRUE); 1080 } 1081 return (B_FALSE); 1082 } 1083 1084 /* 1085 * ========================================================================== 1086 * Range tree callbacks 1087 * ========================================================================== 1088 */ 1089 1090 /* 1091 * Comparison function for the private size-ordered tree. Tree is sorted 1092 * by size, larger sizes at the end of the tree. 
1093 */ 1094 static int 1095 metaslab_rangesize_compare(const void *x1, const void *x2) 1096 { 1097 const range_seg_t *r1 = x1; 1098 const range_seg_t *r2 = x2; 1099 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1100 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1101 1102 if (rs_size1 < rs_size2) 1103 return (-1); 1104 if (rs_size1 > rs_size2) 1105 return (1); 1106 1107 if (r1->rs_start < r2->rs_start) 1108 return (-1); 1109 1110 if (r1->rs_start > r2->rs_start) 1111 return (1); 1112 1113 return (0); 1114 } 1115 1116 /* 1117 * Create any block allocator specific components. The current allocators 1118 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 1119 */ 1120 static void 1121 metaslab_rt_create(range_tree_t *rt, void *arg) 1122 { 1123 metaslab_t *msp = arg; 1124 1125 ASSERT3P(rt->rt_arg, ==, msp); 1126 ASSERT(msp->ms_allocatable == NULL); 1127 1128 avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, 1129 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 1130 } 1131 1132 /* 1133 * Destroy the block allocator specific components. 1134 */ 1135 static void 1136 metaslab_rt_destroy(range_tree_t *rt, void *arg) 1137 { 1138 metaslab_t *msp = arg; 1139 1140 ASSERT3P(rt->rt_arg, ==, msp); 1141 ASSERT3P(msp->ms_allocatable, ==, rt); 1142 ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size)); 1143 1144 avl_destroy(&msp->ms_allocatable_by_size); 1145 } 1146 1147 static void 1148 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 1149 { 1150 metaslab_t *msp = arg; 1151 1152 ASSERT3P(rt->rt_arg, ==, msp); 1153 ASSERT3P(msp->ms_allocatable, ==, rt); 1154 VERIFY(!msp->ms_condensing); 1155 avl_add(&msp->ms_allocatable_by_size, rs); 1156 } 1157 1158 static void 1159 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 1160 { 1161 metaslab_t *msp = arg; 1162 1163 ASSERT3P(rt->rt_arg, ==, msp); 1164 ASSERT3P(msp->ms_allocatable, ==, rt); 1165 VERIFY(!msp->ms_condensing); 1166 avl_remove(&msp->ms_allocatable_by_size, rs); 1167 } 1168 1169 static void 1170 metaslab_rt_vacate(range_tree_t *rt, void *arg) 1171 { 1172 metaslab_t *msp = arg; 1173 1174 ASSERT3P(rt->rt_arg, ==, msp); 1175 ASSERT3P(msp->ms_allocatable, ==, rt); 1176 1177 /* 1178 * Normally one would walk the tree freeing nodes along the way. 1179 * Since the nodes are shared with the range trees we can avoid 1180 * walking all nodes and just reinitialize the avl tree. The nodes 1181 * will be freed by the range tree, so we don't want to free them here. 1182 */ 1183 avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, 1184 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 1185 } 1186 1187 static range_tree_ops_t metaslab_rt_ops = { 1188 metaslab_rt_create, 1189 metaslab_rt_destroy, 1190 metaslab_rt_add, 1191 metaslab_rt_remove, 1192 metaslab_rt_vacate 1193 }; 1194 1195 /* 1196 * ========================================================================== 1197 * Common allocator routines 1198 * ========================================================================== 1199 */ 1200 1201 /* 1202 * Return the maximum contiguous segment within the metaslab. 
1203 */ 1204 uint64_t 1205 metaslab_block_maxsize(metaslab_t *msp) 1206 { 1207 avl_tree_t *t = &msp->ms_allocatable_by_size; 1208 range_seg_t *rs; 1209 1210 if (t == NULL || (rs = avl_last(t)) == NULL) 1211 return (0ULL); 1212 1213 return (rs->rs_end - rs->rs_start); 1214 } 1215 1216 static range_seg_t * 1217 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) 1218 { 1219 range_seg_t *rs, rsearch; 1220 avl_index_t where; 1221 1222 rsearch.rs_start = start; 1223 rsearch.rs_end = start + size; 1224 1225 rs = avl_find(t, &rsearch, &where); 1226 if (rs == NULL) { 1227 rs = avl_nearest(t, where, AVL_AFTER); 1228 } 1229 1230 return (rs); 1231 } 1232 1233 /* 1234 * This is a helper function that can be used by the allocator to find 1235 * a suitable block to allocate. This will search the specified AVL 1236 * tree looking for a block that matches the specified criteria. 1237 */ 1238 static uint64_t 1239 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1240 uint64_t align) 1241 { 1242 range_seg_t *rs = metaslab_block_find(t, *cursor, size); 1243 1244 while (rs != NULL) { 1245 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1246 1247 if (offset + size <= rs->rs_end) { 1248 *cursor = offset + size; 1249 return (offset); 1250 } 1251 rs = AVL_NEXT(t, rs); 1252 } 1253 1254 /* 1255 * If we know we've searched the whole map (*cursor == 0), give up. 1256 * Otherwise, reset the cursor to the beginning and try again. 1257 */ 1258 if (*cursor == 0) 1259 return (-1ULL); 1260 1261 *cursor = 0; 1262 return (metaslab_block_picker(t, cursor, size, align)); 1263 } 1264 1265 /* 1266 * ========================================================================== 1267 * The first-fit block allocator 1268 * ========================================================================== 1269 */ 1270 static uint64_t 1271 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1272 { 1273 /* 1274 * Find the largest power of 2 block size that evenly divides the 1275 * requested size. This is used to try to allocate blocks with similar 1276 * alignment from the same area of the metaslab (i.e. same cursor 1277 * bucket) but it does not guarantee that other allocations sizes 1278 * may exist in the same region. 1279 */ 1280 uint64_t align = size & -size; 1281 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1282 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1283 1284 return (metaslab_block_picker(t, cursor, size, align)); 1285 } 1286 1287 static metaslab_ops_t metaslab_ff_ops = { 1288 metaslab_ff_alloc 1289 }; 1290 1291 /* 1292 * ========================================================================== 1293 * Dynamic block allocator - 1294 * Uses the first fit allocation scheme until space get low and then 1295 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1296 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1297 * ========================================================================== 1298 */ 1299 static uint64_t 1300 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1301 { 1302 /* 1303 * Find the largest power of 2 block size that evenly divides the 1304 * requested size. This is used to try to allocate blocks with similar 1305 * alignment from the same area of the metaslab (i.e. same cursor 1306 * bucket) but it does not guarantee that other allocations sizes 1307 * may exist in the same region. 
1308 */ 1309 uint64_t align = size & -size; 1310 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1311 range_tree_t *rt = msp->ms_allocatable; 1312 avl_tree_t *t = &rt->rt_root; 1313 uint64_t max_size = metaslab_block_maxsize(msp); 1314 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1315 1316 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1317 ASSERT3U(avl_numnodes(t), ==, 1318 avl_numnodes(&msp->ms_allocatable_by_size)); 1319 1320 if (max_size < size) 1321 return (-1ULL); 1322 1323 /* 1324 * If we're running low on space switch to using the size 1325 * sorted AVL tree (best-fit). 1326 */ 1327 if (max_size < metaslab_df_alloc_threshold || 1328 free_pct < metaslab_df_free_pct) { 1329 t = &msp->ms_allocatable_by_size; 1330 *cursor = 0; 1331 } 1332 1333 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1334 } 1335 1336 static metaslab_ops_t metaslab_df_ops = { 1337 metaslab_df_alloc 1338 }; 1339 1340 /* 1341 * ========================================================================== 1342 * Cursor fit block allocator - 1343 * Select the largest region in the metaslab, set the cursor to the beginning 1344 * of the range and the cursor_end to the end of the range. As allocations 1345 * are made advance the cursor. Continue allocating from the cursor until 1346 * the range is exhausted and then find a new range. 1347 * ========================================================================== 1348 */ 1349 static uint64_t 1350 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1351 { 1352 range_tree_t *rt = msp->ms_allocatable; 1353 avl_tree_t *t = &msp->ms_allocatable_by_size; 1354 uint64_t *cursor = &msp->ms_lbas[0]; 1355 uint64_t *cursor_end = &msp->ms_lbas[1]; 1356 uint64_t offset = 0; 1357 1358 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1359 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1360 1361 ASSERT3U(*cursor_end, >=, *cursor); 1362 1363 if ((*cursor + size) > *cursor_end) { 1364 range_seg_t *rs; 1365 1366 rs = avl_last(&msp->ms_allocatable_by_size); 1367 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1368 return (-1ULL); 1369 1370 *cursor = rs->rs_start; 1371 *cursor_end = rs->rs_end; 1372 } 1373 1374 offset = *cursor; 1375 *cursor += size; 1376 1377 return (offset); 1378 } 1379 1380 static metaslab_ops_t metaslab_cf_ops = { 1381 metaslab_cf_alloc 1382 }; 1383 1384 /* 1385 * ========================================================================== 1386 * New dynamic fit allocator - 1387 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1388 * contiguous blocks. If no region is found then just use the largest segment 1389 * that remains. 1390 * ========================================================================== 1391 */ 1392 1393 /* 1394 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1395 * to request from the allocator. 
1396 */ 1397 uint64_t metaslab_ndf_clump_shift = 4; 1398 1399 static uint64_t 1400 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1401 { 1402 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1403 avl_index_t where; 1404 range_seg_t *rs, rsearch; 1405 uint64_t hbit = highbit64(size); 1406 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1407 uint64_t max_size = metaslab_block_maxsize(msp); 1408 1409 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1410 ASSERT3U(avl_numnodes(t), ==, 1411 avl_numnodes(&msp->ms_allocatable_by_size)); 1412 1413 if (max_size < size) 1414 return (-1ULL); 1415 1416 rsearch.rs_start = *cursor; 1417 rsearch.rs_end = *cursor + size; 1418 1419 rs = avl_find(t, &rsearch, &where); 1420 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1421 t = &msp->ms_allocatable_by_size; 1422 1423 rsearch.rs_start = 0; 1424 rsearch.rs_end = MIN(max_size, 1425 1ULL << (hbit + metaslab_ndf_clump_shift)); 1426 rs = avl_find(t, &rsearch, &where); 1427 if (rs == NULL) 1428 rs = avl_nearest(t, where, AVL_AFTER); 1429 ASSERT(rs != NULL); 1430 } 1431 1432 if ((rs->rs_end - rs->rs_start) >= size) { 1433 *cursor = rs->rs_start + size; 1434 return (rs->rs_start); 1435 } 1436 return (-1ULL); 1437 } 1438 1439 static metaslab_ops_t metaslab_ndf_ops = { 1440 metaslab_ndf_alloc 1441 }; 1442 1443 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1444 1445 /* 1446 * ========================================================================== 1447 * Metaslabs 1448 * ========================================================================== 1449 */ 1450 1451 /* 1452 * Wait for any in-progress metaslab loads to complete. 1453 */ 1454 void 1455 metaslab_load_wait(metaslab_t *msp) 1456 { 1457 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1458 1459 while (msp->ms_loading) { 1460 ASSERT(!msp->ms_loaded); 1461 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1462 } 1463 } 1464 1465 int 1466 metaslab_load(metaslab_t *msp) 1467 { 1468 int error = 0; 1469 boolean_t success = B_FALSE; 1470 1471 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1472 ASSERT(!msp->ms_loaded); 1473 ASSERT(!msp->ms_loading); 1474 1475 msp->ms_loading = B_TRUE; 1476 /* 1477 * Nobody else can manipulate a loading metaslab, so it's now safe 1478 * to drop the lock. This way we don't have to hold the lock while 1479 * reading the spacemap from disk. 1480 */ 1481 mutex_exit(&msp->ms_lock); 1482 1483 /* 1484 * If the space map has not been allocated yet, then treat 1485 * all the space in the metaslab as free and add it to ms_allocatable. 1486 */ 1487 if (msp->ms_sm != NULL) { 1488 error = space_map_load(msp->ms_sm, msp->ms_allocatable, 1489 SM_FREE); 1490 } else { 1491 range_tree_add(msp->ms_allocatable, 1492 msp->ms_start, msp->ms_size); 1493 } 1494 1495 success = (error == 0); 1496 1497 mutex_enter(&msp->ms_lock); 1498 msp->ms_loading = B_FALSE; 1499 1500 if (success) { 1501 ASSERT3P(msp->ms_group, !=, NULL); 1502 msp->ms_loaded = B_TRUE; 1503 1504 /* 1505 * If the metaslab already has a spacemap, then we need to 1506 * remove all segments from the defer tree; otherwise, the 1507 * metaslab is completely empty and we can skip this. 
1508 */ 1509 if (msp->ms_sm != NULL) { 1510 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1511 range_tree_walk(msp->ms_defer[t], 1512 range_tree_remove, msp->ms_allocatable); 1513 } 1514 } 1515 msp->ms_max_size = metaslab_block_maxsize(msp); 1516 } 1517 cv_broadcast(&msp->ms_load_cv); 1518 return (error); 1519 } 1520 1521 void 1522 metaslab_unload(metaslab_t *msp) 1523 { 1524 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1525 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 1526 msp->ms_loaded = B_FALSE; 1527 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1528 msp->ms_max_size = 0; 1529 } 1530 1531 int 1532 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1533 metaslab_t **msp) 1534 { 1535 vdev_t *vd = mg->mg_vd; 1536 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1537 metaslab_t *ms; 1538 int error; 1539 1540 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1541 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1542 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1543 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1544 ms->ms_id = id; 1545 ms->ms_start = id << vd->vdev_ms_shift; 1546 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1547 ms->ms_allocator = -1; 1548 ms->ms_new = B_TRUE; 1549 1550 /* 1551 * We only open space map objects that already exist. All others 1552 * will be opened when we finally allocate an object for it. 1553 */ 1554 if (object != 0) { 1555 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1556 ms->ms_size, vd->vdev_ashift); 1557 1558 if (error != 0) { 1559 kmem_free(ms, sizeof (metaslab_t)); 1560 return (error); 1561 } 1562 1563 ASSERT(ms->ms_sm != NULL); 1564 } 1565 1566 /* 1567 * We create the main range tree here, but we don't create the 1568 * other range trees until metaslab_sync_done(). This serves 1569 * two purposes: it allows metaslab_sync_done() to detect the 1570 * addition of new space; and for debugging, it ensures that we'd 1571 * data fault on any attempt to use this metaslab before it's ready. 1572 */ 1573 ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms); 1574 metaslab_group_add(mg, ms); 1575 1576 metaslab_set_fragmentation(ms); 1577 1578 /* 1579 * If we're opening an existing pool (txg == 0) or creating 1580 * a new one (txg == TXG_INITIAL), all space is available now. 1581 * If we're adding space to an existing pool, the new space 1582 * does not become available until after this txg has synced. 1583 * The metaslab's weight will also be initialized when we sync 1584 * out this txg. This ensures that we don't attempt to allocate 1585 * from it before we have initialized it completely. 1586 */ 1587 if (txg <= TXG_INITIAL) 1588 metaslab_sync_done(ms, 0); 1589 1590 /* 1591 * If metaslab_debug_load is set and we're initializing a metaslab 1592 * that has an allocated space map object then load the its space 1593 * map so that can verify frees. 
1594 */ 1595 if (metaslab_debug_load && ms->ms_sm != NULL) { 1596 mutex_enter(&ms->ms_lock); 1597 VERIFY0(metaslab_load(ms)); 1598 mutex_exit(&ms->ms_lock); 1599 } 1600 1601 if (txg != 0) { 1602 vdev_dirty(vd, 0, NULL, txg); 1603 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1604 } 1605 1606 *msp = ms; 1607 1608 return (0); 1609 } 1610 1611 void 1612 metaslab_fini(metaslab_t *msp) 1613 { 1614 metaslab_group_t *mg = msp->ms_group; 1615 1616 metaslab_group_remove(mg, msp); 1617 1618 mutex_enter(&msp->ms_lock); 1619 VERIFY(msp->ms_group == NULL); 1620 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1621 0, -msp->ms_size); 1622 space_map_close(msp->ms_sm); 1623 1624 metaslab_unload(msp); 1625 range_tree_destroy(msp->ms_allocatable); 1626 range_tree_destroy(msp->ms_freeing); 1627 range_tree_destroy(msp->ms_freed); 1628 1629 for (int t = 0; t < TXG_SIZE; t++) { 1630 range_tree_destroy(msp->ms_allocating[t]); 1631 } 1632 1633 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1634 range_tree_destroy(msp->ms_defer[t]); 1635 } 1636 ASSERT0(msp->ms_deferspace); 1637 1638 range_tree_destroy(msp->ms_checkpointing); 1639 1640 mutex_exit(&msp->ms_lock); 1641 cv_destroy(&msp->ms_load_cv); 1642 mutex_destroy(&msp->ms_lock); 1643 mutex_destroy(&msp->ms_sync_lock); 1644 ASSERT3U(msp->ms_allocator, ==, -1); 1645 1646 kmem_free(msp, sizeof (metaslab_t)); 1647 } 1648 1649 #define FRAGMENTATION_TABLE_SIZE 17 1650 1651 /* 1652 * This table defines a segment size based fragmentation metric that will 1653 * allow each metaslab to derive its own fragmentation value. This is done 1654 * by calculating the space in each bucket of the spacemap histogram and 1655 * multiplying that by the fragmetation metric in this table. Doing 1656 * this for all buckets and dividing it by the total amount of free 1657 * space in this metaslab (i.e. the total free space in all buckets) gives 1658 * us the fragmentation metric. This means that a high fragmentation metric 1659 * equates to most of the free space being comprised of small segments. 1660 * Conversely, if the metric is low, then most of the free space is in 1661 * large segments. A 10% change in fragmentation equates to approximately 1662 * double the number of segments. 1663 * 1664 * This table defines 0% fragmented space using 16MB segments. Testing has 1665 * shown that segments that are greater than or equal to 16MB do not suffer 1666 * from drastic performance problems. Using this value, we derive the rest 1667 * of the table. Since the fragmentation value is never stored on disk, it 1668 * is possible to change these calculations in the future. 1669 */ 1670 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1671 100, /* 512B */ 1672 100, /* 1K */ 1673 98, /* 2K */ 1674 95, /* 4K */ 1675 90, /* 8K */ 1676 80, /* 16K */ 1677 70, /* 32K */ 1678 60, /* 64K */ 1679 50, /* 128K */ 1680 40, /* 256K */ 1681 30, /* 512K */ 1682 20, /* 1M */ 1683 15, /* 2M */ 1684 10, /* 4M */ 1685 5, /* 8M */ 1686 0 /* 16M */ 1687 }; 1688 1689 /* 1690 * Calclate the metaslab's fragmentation metric. A return value 1691 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1692 * not support this metric. Otherwise, the return value should be in the 1693 * range [0, 100]. 
1694 */ 1695 static void 1696 metaslab_set_fragmentation(metaslab_t *msp) 1697 { 1698 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1699 uint64_t fragmentation = 0; 1700 uint64_t total = 0; 1701 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1702 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1703 1704 if (!feature_enabled) { 1705 msp->ms_fragmentation = ZFS_FRAG_INVALID; 1706 return; 1707 } 1708 1709 /* 1710 * A null space map means that the entire metaslab is free 1711 * and thus is not fragmented. 1712 */ 1713 if (msp->ms_sm == NULL) { 1714 msp->ms_fragmentation = 0; 1715 return; 1716 } 1717 1718 /* 1719 * If this metaslab's space map has not been upgraded, flag it 1720 * so that we upgrade next time we encounter it. 1721 */ 1722 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1723 uint64_t txg = spa_syncing_txg(spa); 1724 vdev_t *vd = msp->ms_group->mg_vd; 1725 1726 /* 1727 * If we've reached the final dirty txg, then we must 1728 * be shutting down the pool. We don't want to dirty 1729 * any data past this point so skip setting the condense 1730 * flag. We can retry this action the next time the pool 1731 * is imported. 1732 */ 1733 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 1734 msp->ms_condense_wanted = B_TRUE; 1735 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1736 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1737 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 1738 vd->vdev_id); 1739 } 1740 msp->ms_fragmentation = ZFS_FRAG_INVALID; 1741 return; 1742 } 1743 1744 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1745 uint64_t space = 0; 1746 uint8_t shift = msp->ms_sm->sm_shift; 1747 1748 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1749 FRAGMENTATION_TABLE_SIZE - 1); 1750 1751 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1752 continue; 1753 1754 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1755 total += space; 1756 1757 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1758 fragmentation += space * zfs_frag_table[idx]; 1759 } 1760 1761 if (total > 0) 1762 fragmentation /= total; 1763 ASSERT3U(fragmentation, <=, 100); 1764 1765 msp->ms_fragmentation = fragmentation; 1766 } 1767 1768 /* 1769 * Compute a weight -- a selection preference value -- for the given metaslab. 1770 * This is based on the amount of free space, the level of fragmentation, 1771 * the LBA range, and whether the metaslab is loaded. 1772 */ 1773 static uint64_t 1774 metaslab_space_weight(metaslab_t *msp) 1775 { 1776 metaslab_group_t *mg = msp->ms_group; 1777 vdev_t *vd = mg->mg_vd; 1778 uint64_t weight, space; 1779 1780 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1781 ASSERT(!vd->vdev_removing); 1782 1783 /* 1784 * The baseline weight is the metaslab's free space. 1785 */ 1786 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1787 1788 if (metaslab_fragmentation_factor_enabled && 1789 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1790 /* 1791 * Use the fragmentation information to inversely scale 1792 * down the baseline weight. We need to ensure that we 1793 * don't exclude this metaslab completely when it's 100% 1794 * fragmented. To avoid this we reduce the fragmented value 1795 * by 1. 1796 */ 1797 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1798 1799 /* 1800 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1801 * this metaslab again. The fragmentation metric may have 1802 * decreased the space to something smaller than 1803 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1804 * so that we can consume any remaining space. 
1805 */ 1806 if (space > 0 && space < SPA_MINBLOCKSIZE) 1807 space = SPA_MINBLOCKSIZE; 1808 } 1809 weight = space; 1810 1811 /* 1812 * Modern disks have uniform bit density and constant angular velocity. 1813 * Therefore, the outer recording zones are faster (higher bandwidth) 1814 * than the inner zones by the ratio of outer to inner track diameter, 1815 * which is typically around 2:1. We account for this by assigning 1816 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1817 * In effect, this means that we'll select the metaslab with the most 1818 * free bandwidth rather than simply the one with the most free space. 1819 */ 1820 if (metaslab_lba_weighting_enabled) { 1821 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1822 ASSERT(weight >= space && weight <= 2 * space); 1823 } 1824 1825 /* 1826 * If this metaslab is one we're actively using, adjust its 1827 * weight to make it preferable to any inactive metaslab so 1828 * we'll polish it off. If the fragmentation on this metaslab 1829 * has exceeded our threshold, then don't mark it active. 1830 */ 1831 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1832 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1833 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1834 } 1835 1836 WEIGHT_SET_SPACEBASED(weight); 1837 return (weight); 1838 } 1839 1840 /* 1841 * Return the weight of the specified metaslab, according to the segment-based 1842 * weighting algorithm. The metaslab must be loaded. This function can 1843 * be called within a sync pass since it relies only on the metaslab's 1844 * range tree which is always accurate when the metaslab is loaded. 1845 */ 1846 static uint64_t 1847 metaslab_weight_from_range_tree(metaslab_t *msp) 1848 { 1849 uint64_t weight = 0; 1850 uint32_t segments = 0; 1851 1852 ASSERT(msp->ms_loaded); 1853 1854 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 1855 i--) { 1856 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 1857 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1858 1859 segments <<= 1; 1860 segments += msp->ms_allocatable->rt_histogram[i]; 1861 1862 /* 1863 * The range tree provides more precision than the space map 1864 * and must be downgraded so that all values fit within the 1865 * space map's histogram. This allows us to compare loaded 1866 * vs. unloaded metaslabs to determine which metaslab is 1867 * considered "best". 1868 */ 1869 if (i > max_idx) 1870 continue; 1871 1872 if (segments != 0) { 1873 WEIGHT_SET_COUNT(weight, segments); 1874 WEIGHT_SET_INDEX(weight, i); 1875 WEIGHT_SET_ACTIVE(weight, 0); 1876 break; 1877 } 1878 } 1879 return (weight); 1880 } 1881 1882 /* 1883 * Calculate the weight based on the on-disk histogram. This should only 1884 * be called after a sync pass has completely finished since the on-disk 1885 * information is updated in metaslab_sync(). 1886 */ 1887 static uint64_t 1888 metaslab_weight_from_spacemap(metaslab_t *msp) 1889 { 1890 uint64_t weight = 0; 1891 1892 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 1893 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { 1894 WEIGHT_SET_COUNT(weight, 1895 msp->ms_sm->sm_phys->smp_histogram[i]); 1896 WEIGHT_SET_INDEX(weight, i + 1897 msp->ms_sm->sm_shift); 1898 WEIGHT_SET_ACTIVE(weight, 0); 1899 break; 1900 } 1901 } 1902 return (weight); 1903 } 1904 1905 /* 1906 * Compute a segment-based weight for the specified metaslab. The weight 1907 * is determined by the highest bucket in the histogram.
The information 1908 * for the highest bucket is encoded into the weight value. 1909 */ 1910 static uint64_t 1911 metaslab_segment_weight(metaslab_t *msp) 1912 { 1913 metaslab_group_t *mg = msp->ms_group; 1914 uint64_t weight = 0; 1915 uint8_t shift = mg->mg_vd->vdev_ashift; 1916 1917 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1918 1919 /* 1920 * The metaslab is completely free. 1921 */ 1922 if (space_map_allocated(msp->ms_sm) == 0) { 1923 int idx = highbit64(msp->ms_size) - 1; 1924 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1925 1926 if (idx < max_idx) { 1927 WEIGHT_SET_COUNT(weight, 1ULL); 1928 WEIGHT_SET_INDEX(weight, idx); 1929 } else { 1930 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 1931 WEIGHT_SET_INDEX(weight, max_idx); 1932 } 1933 WEIGHT_SET_ACTIVE(weight, 0); 1934 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 1935 1936 return (weight); 1937 } 1938 1939 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 1940 1941 /* 1942 * If the metaslab is fully allocated then just make the weight 0. 1943 */ 1944 if (space_map_allocated(msp->ms_sm) == msp->ms_size) 1945 return (0); 1946 /* 1947 * If the metaslab is already loaded, then use the range tree to 1948 * determine the weight. Otherwise, we rely on the space map information 1949 * to generate the weight. 1950 */ 1951 if (msp->ms_loaded) { 1952 weight = metaslab_weight_from_range_tree(msp); 1953 } else { 1954 weight = metaslab_weight_from_spacemap(msp); 1955 } 1956 1957 /* 1958 * If the metaslab was active the last time we calculated its weight 1959 * then keep it active. We want to consume the entire region that 1960 * is associated with this weight. 1961 */ 1962 if (msp->ms_activation_weight != 0 && weight != 0) 1963 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 1964 return (weight); 1965 } 1966 1967 /* 1968 * Determine if we should attempt to allocate from this metaslab. If the 1969 * metaslab has a maximum size then we can quickly determine if the desired 1970 * allocation size can be satisfied. Otherwise, if we're using segment-based 1971 * weighting then we can determine the maximum allocation that this metaslab 1972 * can accommodate based on the index encoded in the weight. If we're using 1973 * space-based weights then rely on the entire weight (excluding the weight 1974 * type bit). 1975 */ 1976 boolean_t 1977 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 1978 { 1979 boolean_t should_allocate; 1980 1981 if (msp->ms_max_size != 0) 1982 return (msp->ms_max_size >= asize); 1983 1984 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 1985 /* 1986 * The metaslab segment weight indicates segments in the 1987 * range [2^i, 2^(i+1)), where i is the index in the weight. 1988 * Since the asize might be in the middle of the range, we 1989 * should attempt the allocation if asize < 2^(i+1). 1990 */ 1991 should_allocate = (asize < 1992 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 1993 } else { 1994 should_allocate = (asize <= 1995 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 1996 } 1997 return (should_allocate); 1998 } 1999 2000 static uint64_t 2001 metaslab_weight(metaslab_t *msp) 2002 { 2003 vdev_t *vd = msp->ms_group->mg_vd; 2004 spa_t *spa = vd->vdev_spa; 2005 uint64_t weight; 2006 2007 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2008 2009 /* 2010 * If this vdev is in the process of being removed, there is nothing 2011 * for us to do here. 
2012 */ 2013 if (vd->vdev_removing) 2014 return (0); 2015 2016 metaslab_set_fragmentation(msp); 2017 2018 /* 2019 * Update the maximum size if the metaslab is loaded. This will 2020 * ensure that we get an accurate maximum size if newly freed space 2021 * has been added back into the free tree. 2022 */ 2023 if (msp->ms_loaded) 2024 msp->ms_max_size = metaslab_block_maxsize(msp); 2025 2026 /* 2027 * Segment-based weighting requires space map histogram support. 2028 */ 2029 if (zfs_metaslab_segment_weight_enabled && 2030 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2031 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2032 sizeof (space_map_phys_t))) { 2033 weight = metaslab_segment_weight(msp); 2034 } else { 2035 weight = metaslab_space_weight(msp); 2036 } 2037 return (weight); 2038 } 2039 2040 static int 2041 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2042 int allocator, uint64_t activation_weight) 2043 { 2044 /* 2045 * If we're activating for the claim code, we don't want to actually 2046 * set the metaslab up for a specific allocator. 2047 */ 2048 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2049 return (0); 2050 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2051 mg->mg_primaries : mg->mg_secondaries); 2052 2053 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2054 mutex_enter(&mg->mg_lock); 2055 if (arr[allocator] != NULL) { 2056 mutex_exit(&mg->mg_lock); 2057 return (EEXIST); 2058 } 2059 2060 arr[allocator] = msp; 2061 ASSERT3S(msp->ms_allocator, ==, -1); 2062 msp->ms_allocator = allocator; 2063 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2064 mutex_exit(&mg->mg_lock); 2065 2066 return (0); 2067 } 2068 2069 static int 2070 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2071 { 2072 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2073 2074 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2075 int error = 0; 2076 metaslab_load_wait(msp); 2077 if (!msp->ms_loaded) { 2078 if ((error = metaslab_load(msp)) != 0) { 2079 metaslab_group_sort(msp->ms_group, msp, 0); 2080 return (error); 2081 } 2082 } 2083 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2084 /* 2085 * The metaslab was activated for another allocator 2086 * while we were waiting, we should reselect. 
2087 */ 2088 return (EBUSY); 2089 } 2090 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2091 allocator, activation_weight)) != 0) { 2092 return (error); 2093 } 2094 2095 msp->ms_activation_weight = msp->ms_weight; 2096 metaslab_group_sort(msp->ms_group, msp, 2097 msp->ms_weight | activation_weight); 2098 } 2099 ASSERT(msp->ms_loaded); 2100 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2101 2102 return (0); 2103 } 2104 2105 static void 2106 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2107 uint64_t weight) 2108 { 2109 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2110 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2111 metaslab_group_sort(mg, msp, weight); 2112 return; 2113 } 2114 2115 mutex_enter(&mg->mg_lock); 2116 ASSERT3P(msp->ms_group, ==, mg); 2117 if (msp->ms_primary) { 2118 ASSERT3U(0, <=, msp->ms_allocator); 2119 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2120 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2121 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2122 mg->mg_primaries[msp->ms_allocator] = NULL; 2123 } else { 2124 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2125 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2126 mg->mg_secondaries[msp->ms_allocator] = NULL; 2127 } 2128 msp->ms_allocator = -1; 2129 metaslab_group_sort_impl(mg, msp, weight); 2130 mutex_exit(&mg->mg_lock); 2131 } 2132 2133 static void 2134 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2135 { 2136 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2137 2138 /* 2139 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2140 * this metaslab again. In that case, it had better be empty, 2141 * or we would be leaving space on the table. 2142 */ 2143 ASSERT(size >= SPA_MINBLOCKSIZE || 2144 range_tree_is_empty(msp->ms_allocatable)); 2145 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2146 2147 msp->ms_activation_weight = 0; 2148 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2149 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2150 } 2151 2152 /* 2153 * Segment-based metaslabs are activated once and remain active until 2154 * we either fail an allocation attempt (similar to space-based metaslabs) 2155 * or have exhausted the free space in zfs_metaslab_switch_threshold 2156 * buckets since the metaslab was activated. This function checks to see 2157 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the 2158 * metaslab and passivates it proactively. This will allow us to select a 2159 * metaslab with a larger contiguous region, if any remains within this 2160 * metaslab group. If we're in sync pass > 1, then we continue using this 2161 * metaslab so that we don't dirty more blocks and cause more sync passes. 2162 */ 2163 void 2164 metaslab_segment_may_passivate(metaslab_t *msp) 2165 { 2166 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2167 2168 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2169 return; 2170 2171 /* 2172 * Since we are in the middle of a sync pass, the most accurate 2173 * information that is accessible to us is the in-core range tree 2174 * histogram; calculate the new weight based on that information.
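 * As a sketch of the check below, assuming zfs_metaslab_switch_threshold is
 * set to 2: a metaslab activated while its largest free segments were in the
 * 1M bucket (index 20) is passivated once the largest remaining segments
 * have dropped to the 256K bucket (index 18) or below, i.e. once
 * current_idx <= activation_idx - 2.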
2175 */ 2176 uint64_t weight = metaslab_weight_from_range_tree(msp); 2177 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2178 int current_idx = WEIGHT_GET_INDEX(weight); 2179 2180 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2181 metaslab_passivate(msp, weight); 2182 } 2183 2184 static void 2185 metaslab_preload(void *arg) 2186 { 2187 metaslab_t *msp = arg; 2188 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2189 2190 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2191 2192 mutex_enter(&msp->ms_lock); 2193 metaslab_load_wait(msp); 2194 if (!msp->ms_loaded) 2195 (void) metaslab_load(msp); 2196 msp->ms_selected_txg = spa_syncing_txg(spa); 2197 mutex_exit(&msp->ms_lock); 2198 } 2199 2200 static void 2201 metaslab_group_preload(metaslab_group_t *mg) 2202 { 2203 spa_t *spa = mg->mg_vd->vdev_spa; 2204 metaslab_t *msp; 2205 avl_tree_t *t = &mg->mg_metaslab_tree; 2206 int m = 0; 2207 2208 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2209 taskq_wait(mg->mg_taskq); 2210 return; 2211 } 2212 2213 mutex_enter(&mg->mg_lock); 2214 2215 /* 2216 * Load the next potential metaslabs 2217 */ 2218 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2219 ASSERT3P(msp->ms_group, ==, mg); 2220 2221 /* 2222 * We preload only the maximum number of metaslabs specified 2223 * by metaslab_preload_limit. If a metaslab is being forced 2224 * to condense then we preload it too. This will ensure 2225 * that force condensing happens in the next txg. 2226 */ 2227 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2228 continue; 2229 } 2230 2231 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2232 msp, TQ_SLEEP) != NULL); 2233 } 2234 mutex_exit(&mg->mg_lock); 2235 } 2236 2237 /* 2238 * Determine if the space map's on-disk footprint is past our tolerance 2239 * for inefficiency. We would like to use the following criteria to make 2240 * our decision: 2241 * 2242 * 1. The size of the space map object should not dramatically increase as a 2243 * result of writing out the free space range tree. 2244 * 2245 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2246 * times the size of the free space range tree representation 2247 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2248 * 2249 * 3. The on-disk size of the space map should actually decrease. 2250 * 2251 * Unfortunately, we cannot compute the on-disk size of the space map in this 2252 * context because we cannot accurately compute the effects of compression, etc. 2253 * Instead, we apply the heuristic described in the block comment for 2254 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2255 * is greater than a threshold number of blocks. 2256 */ 2257 static boolean_t 2258 metaslab_should_condense(metaslab_t *msp) 2259 { 2260 space_map_t *sm = msp->ms_sm; 2261 vdev_t *vd = msp->ms_group->mg_vd; 2262 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2263 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2264 2265 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2266 ASSERT(msp->ms_loaded); 2267 2268 /* 2269 * Allocations and frees in early passes are generally more space 2270 * efficient (in terms of blocks described in space map entries) 2271 * than the ones in later passes (e.g. we don't compress after 2272 * sync pass 5) and condensing a metaslab multiple times in a txg 2273 * could degrade performance. 2274 * 2275 * Thus we prefer condensing each metaslab at most once every txg at 2276 * the earliest sync pass possible.
If a metaslab is eligible for 2277 * condensing again after being considered for condensing within the 2278 * same txg, it will hopefully be dirty in the next txg where it will 2279 * be condensed at an earlier pass. 2280 */ 2281 if (msp->ms_condense_checked_txg == current_txg) 2282 return (B_FALSE); 2283 msp->ms_condense_checked_txg = current_txg; 2284 2285 /* 2286 * We always condense metaslabs that are empty and metaslabs for 2287 * which a condense request has been made. 2288 */ 2289 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2290 msp->ms_condense_wanted) 2291 return (B_TRUE); 2292 2293 uint64_t object_size = space_map_length(msp->ms_sm); 2294 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2295 msp->ms_allocatable, SM_NO_VDEVID); 2296 2297 dmu_object_info_t doi; 2298 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2299 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2300 2301 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2302 object_size > zfs_metaslab_condense_block_threshold * record_size); 2303 } 2304 2305 /* 2306 * Condense the on-disk space map representation to its minimized form. 2307 * The minimized form consists of a small number of allocations followed by 2308 * the entries of the free range tree. 2309 */ 2310 static void 2311 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2312 { 2313 range_tree_t *condense_tree; 2314 space_map_t *sm = msp->ms_sm; 2315 2316 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2317 ASSERT(msp->ms_loaded); 2318 2319 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2320 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2321 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2322 msp->ms_group->mg_vd->vdev_spa->spa_name, 2323 space_map_length(msp->ms_sm), 2324 avl_numnodes(&msp->ms_allocatable->rt_root), 2325 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2326 2327 msp->ms_condense_wanted = B_FALSE; 2328 2329 /* 2330 * Create a range tree that is 100% allocated. We remove segments 2331 * that have been freed in this txg, any deferred frees that exist, 2332 * and any allocation in the future. Removing segments should be 2333 * a relatively inexpensive operation since we expect these trees to 2334 * have a small number of nodes. 2335 */ 2336 condense_tree = range_tree_create(NULL, NULL); 2337 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2338 2339 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2340 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2341 2342 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2343 range_tree_walk(msp->ms_defer[t], 2344 range_tree_remove, condense_tree); 2345 } 2346 2347 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2348 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2349 range_tree_remove, condense_tree); 2350 } 2351 2352 /* 2353 * We're about to drop the metaslab's lock thus allowing 2354 * other consumers to change its content. Set the 2355 * metaslab's ms_condensing flag to ensure that 2356 * allocations on this metaslab do not occur while we're 2357 * in the middle of committing it to disk. This is only critical 2358 * for ms_allocatable as all other range trees use per txg 2359 * views of their content.
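 * (The allocation paths honor this flag: metaslab_block_alloc() verifies
 * that the metaslab is not condensing, and the metaslab selection code
 * either skips or passivates a condensing metaslab instead of allocating
 * from it.)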
2360 */ 2361 msp->ms_condensing = B_TRUE; 2362 2363 mutex_exit(&msp->ms_lock); 2364 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2365 2366 /* 2367 * While we would ideally like to create a space map representation 2368 * that consists only of allocation records, doing so can be 2369 * prohibitively expensive because the in-core free tree can be 2370 * large, and therefore computationally expensive to subtract 2371 * from the condense_tree. Instead we sync out two trees, a cheap 2372 * allocation only tree followed by the in-core free tree. While not 2373 * optimal, this is typically close to optimal, and much cheaper to 2374 * compute. 2375 */ 2376 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2377 range_tree_vacate(condense_tree, NULL, NULL); 2378 range_tree_destroy(condense_tree); 2379 2380 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2381 mutex_enter(&msp->ms_lock); 2382 msp->ms_condensing = B_FALSE; 2383 } 2384 2385 /* 2386 * Write a metaslab to disk in the context of the specified transaction group. 2387 */ 2388 void 2389 metaslab_sync(metaslab_t *msp, uint64_t txg) 2390 { 2391 metaslab_group_t *mg = msp->ms_group; 2392 vdev_t *vd = mg->mg_vd; 2393 spa_t *spa = vd->vdev_spa; 2394 objset_t *mos = spa_meta_objset(spa); 2395 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2396 dmu_tx_t *tx; 2397 uint64_t object = space_map_object(msp->ms_sm); 2398 2399 ASSERT(!vd->vdev_ishole); 2400 2401 /* 2402 * This metaslab has just been added so there's no work to do now. 2403 */ 2404 if (msp->ms_freeing == NULL) { 2405 ASSERT3P(alloctree, ==, NULL); 2406 return; 2407 } 2408 2409 ASSERT3P(alloctree, !=, NULL); 2410 ASSERT3P(msp->ms_freeing, !=, NULL); 2411 ASSERT3P(msp->ms_freed, !=, NULL); 2412 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2413 2414 /* 2415 * Normally, we don't want to process a metaslab if there are no 2416 * allocations or frees to perform. However, if the metaslab is being 2417 * forced to condense and it's loaded, we need to let it through. 2418 */ 2419 if (range_tree_is_empty(alloctree) && 2420 range_tree_is_empty(msp->ms_freeing) && 2421 range_tree_is_empty(msp->ms_checkpointing) && 2422 !(msp->ms_loaded && msp->ms_condense_wanted)) 2423 return; 2424 2425 2426 VERIFY(txg <= spa_final_dirty_txg(spa)); 2427 2428 /* 2429 * The only state that can actually be changing concurrently with 2430 * metaslab_sync() is the metaslab's ms_allocatable. No other 2431 * thread can be modifying this txg's alloc, freeing, 2432 * freed, or space_map_phys_t. We drop ms_lock whenever we 2433 * could call into the DMU, because the DMU can call down to us 2434 * (e.g. via zio_free()) at any time. 2435 * 2436 * The spa_vdev_remove_thread() can be reading metaslab state 2437 * concurrently, and it is locked out by the ms_sync_lock. Note 2438 * that the ms_lock is insufficient for this, because it is dropped 2439 * by space_map_write(). 
2440 */ 2441 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2442 2443 if (msp->ms_sm == NULL) { 2444 uint64_t new_object; 2445 2446 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2447 VERIFY3U(new_object, !=, 0); 2448 2449 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2450 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2451 ASSERT(msp->ms_sm != NULL); 2452 } 2453 2454 if (!range_tree_is_empty(msp->ms_checkpointing) && 2455 vd->vdev_checkpoint_sm == NULL) { 2456 ASSERT(spa_has_checkpoint(spa)); 2457 2458 uint64_t new_object = space_map_alloc(mos, 2459 vdev_standard_sm_blksz, tx); 2460 VERIFY3U(new_object, !=, 0); 2461 2462 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2463 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2464 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2465 2466 /* 2467 * We save the space map object as an entry in vdev_top_zap 2468 * so it can be retrieved when the pool is reopened after an 2469 * export or through zdb. 2470 */ 2471 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2472 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2473 sizeof (new_object), 1, &new_object, tx)); 2474 } 2475 2476 mutex_enter(&msp->ms_sync_lock); 2477 mutex_enter(&msp->ms_lock); 2478 2479 /* 2480 * Note: metaslab_condense() clears the space map's histogram. 2481 * Therefore we must verify and remove this histogram before 2482 * condensing. 2483 */ 2484 metaslab_group_histogram_verify(mg); 2485 metaslab_class_histogram_verify(mg->mg_class); 2486 metaslab_group_histogram_remove(mg, msp); 2487 2488 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2489 metaslab_condense(msp, txg, tx); 2490 } else { 2491 mutex_exit(&msp->ms_lock); 2492 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2493 SM_NO_VDEVID, tx); 2494 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2495 SM_NO_VDEVID, tx); 2496 mutex_enter(&msp->ms_lock); 2497 } 2498 2499 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2500 ASSERT(spa_has_checkpoint(spa)); 2501 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2502 2503 /* 2504 * Since we are doing writes to disk and the ms_checkpointing 2505 * tree won't be changing during that time, we drop the 2506 * ms_lock while writing to the checkpoint space map. 2507 */ 2508 mutex_exit(&msp->ms_lock); 2509 space_map_write(vd->vdev_checkpoint_sm, 2510 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2511 mutex_enter(&msp->ms_lock); 2512 space_map_update(vd->vdev_checkpoint_sm); 2513 2514 spa->spa_checkpoint_info.sci_dspace += 2515 range_tree_space(msp->ms_checkpointing); 2516 vd->vdev_stat.vs_checkpoint_space += 2517 range_tree_space(msp->ms_checkpointing); 2518 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2519 -vd->vdev_checkpoint_sm->sm_alloc); 2520 2521 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2522 } 2523 2524 if (msp->ms_loaded) { 2525 /* 2526 * When the space map is loaded, we have an accurate 2527 * histogram in the range tree. This gives us an opportunity 2528 * to bring the space map's histogram up-to-date so we clear 2529 * it first before updating it. 2530 */ 2531 space_map_histogram_clear(msp->ms_sm); 2532 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2533 2534 /* 2535 * Since we've cleared the histogram we need to add back 2536 * any free space that has already been processed, plus 2537 * any deferred space. This allows the on-disk histogram 2538 * to accurately reflect all free space even if some space 2539 * is not yet available for allocation (i.e. deferred). 
2540 */ 2541 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2542 2543 /* 2544 * Add back any deferred free space that has not been 2545 * added back into the in-core free tree yet. This will 2546 * ensure that we don't end up with a space map histogram 2547 * that is completely empty unless the metaslab is fully 2548 * allocated. 2549 */ 2550 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2551 space_map_histogram_add(msp->ms_sm, 2552 msp->ms_defer[t], tx); 2553 } 2554 } 2555 2556 /* 2557 * Always add the free space from this sync pass to the space 2558 * map histogram. We want to make sure that the on-disk histogram 2559 * accounts for all free space. If the space map is not loaded, 2560 * then we will lose some accuracy but will correct it the next 2561 * time we load the space map. 2562 */ 2563 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2564 2565 metaslab_group_histogram_add(mg, msp); 2566 metaslab_group_histogram_verify(mg); 2567 metaslab_class_histogram_verify(mg->mg_class); 2568 2569 /* 2570 * For sync pass 1, we avoid traversing this txg's free range tree 2571 * and instead will just swap the pointers for freeing and 2572 * freed. We can safely do this since the freed_tree is 2573 * guaranteed to be empty on the initial pass. 2574 */ 2575 if (spa_sync_pass(spa) == 1) { 2576 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2577 } else { 2578 range_tree_vacate(msp->ms_freeing, 2579 range_tree_add, msp->ms_freed); 2580 } 2581 range_tree_vacate(alloctree, NULL, NULL); 2582 2583 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2584 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2585 & TXG_MASK])); 2586 ASSERT0(range_tree_space(msp->ms_freeing)); 2587 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2588 2589 mutex_exit(&msp->ms_lock); 2590 2591 if (object != space_map_object(msp->ms_sm)) { 2592 object = space_map_object(msp->ms_sm); 2593 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2594 msp->ms_id, sizeof (uint64_t), &object, tx); 2595 } 2596 mutex_exit(&msp->ms_sync_lock); 2597 dmu_tx_commit(tx); 2598 } 2599 2600 /* 2601 * Called after a transaction group has completely synced to mark 2602 * all of the metaslab's free space as usable. 2603 */ 2604 void 2605 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2606 { 2607 metaslab_group_t *mg = msp->ms_group; 2608 vdev_t *vd = mg->mg_vd; 2609 spa_t *spa = vd->vdev_spa; 2610 range_tree_t **defer_tree; 2611 int64_t alloc_delta, defer_delta; 2612 boolean_t defer_allowed = B_TRUE; 2613 2614 ASSERT(!vd->vdev_ishole); 2615 2616 mutex_enter(&msp->ms_lock); 2617 2618 /* 2619 * If this metaslab is just becoming available, initialize its 2620 * range trees and add its capacity to the vdev. 
2621 */ 2622 if (msp->ms_freed == NULL) { 2623 for (int t = 0; t < TXG_SIZE; t++) { 2624 ASSERT(msp->ms_allocating[t] == NULL); 2625 2626 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2627 } 2628 2629 ASSERT3P(msp->ms_freeing, ==, NULL); 2630 msp->ms_freeing = range_tree_create(NULL, NULL); 2631 2632 ASSERT3P(msp->ms_freed, ==, NULL); 2633 msp->ms_freed = range_tree_create(NULL, NULL); 2634 2635 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2636 ASSERT(msp->ms_defer[t] == NULL); 2637 2638 msp->ms_defer[t] = range_tree_create(NULL, NULL); 2639 } 2640 2641 ASSERT3P(msp->ms_checkpointing, ==, NULL); 2642 msp->ms_checkpointing = range_tree_create(NULL, NULL); 2643 2644 vdev_space_update(vd, 0, 0, msp->ms_size); 2645 } 2646 ASSERT0(range_tree_space(msp->ms_freeing)); 2647 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2648 2649 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2650 2651 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2652 metaslab_class_get_alloc(spa_normal_class(spa)); 2653 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2654 defer_allowed = B_FALSE; 2655 } 2656 2657 defer_delta = 0; 2658 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2659 if (defer_allowed) { 2660 defer_delta = range_tree_space(msp->ms_freed) - 2661 range_tree_space(*defer_tree); 2662 } else { 2663 defer_delta -= range_tree_space(*defer_tree); 2664 } 2665 2666 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2667 2668 /* 2669 * If there's a metaslab_load() in progress, wait for it to complete 2670 * so that we have a consistent view of the in-core space map. 2671 */ 2672 metaslab_load_wait(msp); 2673 2674 /* 2675 * Move the frees from the defer_tree back to the free 2676 * range tree (if it's loaded). Swap the freed_tree and 2677 * the defer_tree -- this is safe to do because we've 2678 * just emptied out the defer_tree. 2679 */ 2680 range_tree_vacate(*defer_tree, 2681 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 2682 if (defer_allowed) { 2683 range_tree_swap(&msp->ms_freed, defer_tree); 2684 } else { 2685 range_tree_vacate(msp->ms_freed, 2686 msp->ms_loaded ? range_tree_add : NULL, 2687 msp->ms_allocatable); 2688 } 2689 space_map_update(msp->ms_sm); 2690 2691 msp->ms_deferspace += defer_delta; 2692 ASSERT3S(msp->ms_deferspace, >=, 0); 2693 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2694 if (msp->ms_deferspace != 0) { 2695 /* 2696 * Keep syncing this metaslab until all deferred frees 2697 * are back in circulation. 2698 */ 2699 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2700 } 2701 2702 if (msp->ms_new) { 2703 msp->ms_new = B_FALSE; 2704 mutex_enter(&mg->mg_lock); 2705 mg->mg_ms_ready++; 2706 mutex_exit(&mg->mg_lock); 2707 } 2708 /* 2709 * Calculate the new weights before unloading any metaslabs. 2710 * This will give us the most accurate weighting. 2711 */ 2712 metaslab_group_sort(mg, msp, metaslab_weight(msp) | 2713 (msp->ms_weight & METASLAB_ACTIVE_MASK)); 2714 2715 /* 2716 * If the metaslab is loaded and we've not tried to load or allocate 2717 * from it in 'metaslab_unload_delay' txgs, then unload it. 
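 * For example, with metaslab_unload_delay set to 8 txgs, a metaslab whose
 * ms_selected_txg is 100 becomes eligible for unloading starting with
 * txg 109 (since 100 + 8 < 109), provided it stayed idle the whole time.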
2718 */ 2719 if (msp->ms_loaded && 2720 msp->ms_selected_txg + metaslab_unload_delay < txg) { 2721 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2722 VERIFY0(range_tree_space( 2723 msp->ms_allocating[(txg + t) & TXG_MASK])); 2724 } 2725 if (msp->ms_allocator != -1) { 2726 metaslab_passivate(msp, msp->ms_weight & 2727 ~METASLAB_ACTIVE_MASK); 2728 } 2729 2730 if (!metaslab_debug_unload) 2731 metaslab_unload(msp); 2732 } 2733 2734 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2735 ASSERT0(range_tree_space(msp->ms_freeing)); 2736 ASSERT0(range_tree_space(msp->ms_freed)); 2737 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2738 2739 mutex_exit(&msp->ms_lock); 2740 } 2741 2742 void 2743 metaslab_sync_reassess(metaslab_group_t *mg) 2744 { 2745 spa_t *spa = mg->mg_class->mc_spa; 2746 2747 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2748 metaslab_group_alloc_update(mg); 2749 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2750 2751 /* 2752 * Preload the next potential metaslabs but only on active 2753 * metaslab groups. We can get into a state where the metaslab 2754 * is no longer active since we dirty metaslabs as we remove a 2755 * device, thus potentially making the metaslab group eligible 2756 * for preloading. 2757 */ 2758 if (mg->mg_activation_count > 0) { 2759 metaslab_group_preload(mg); 2760 } 2761 spa_config_exit(spa, SCL_ALLOC, FTAG); 2762 } 2763 2764 static uint64_t 2765 metaslab_distance(metaslab_t *msp, dva_t *dva) 2766 { 2767 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2768 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2769 uint64_t start = msp->ms_id; 2770 2771 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2772 return (1ULL << 63); 2773 2774 if (offset < start) 2775 return ((start - offset) << ms_shift); 2776 if (offset > start) 2777 return ((offset - start) << ms_shift); 2778 return (0); 2779 } 2780 2781 /* 2782 * ========================================================================== 2783 * Metaslab allocation tracing facility 2784 * ========================================================================== 2785 */ 2786 kstat_t *metaslab_trace_ksp; 2787 kstat_named_t metaslab_trace_over_limit; 2788 2789 void 2790 metaslab_alloc_trace_init(void) 2791 { 2792 ASSERT(metaslab_alloc_trace_cache == NULL); 2793 metaslab_alloc_trace_cache = kmem_cache_create( 2794 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 2795 0, NULL, NULL, NULL, NULL, NULL, 0); 2796 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 2797 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 2798 if (metaslab_trace_ksp != NULL) { 2799 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 2800 kstat_named_init(&metaslab_trace_over_limit, 2801 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 2802 kstat_install(metaslab_trace_ksp); 2803 } 2804 } 2805 2806 void 2807 metaslab_alloc_trace_fini(void) 2808 { 2809 if (metaslab_trace_ksp != NULL) { 2810 kstat_delete(metaslab_trace_ksp); 2811 metaslab_trace_ksp = NULL; 2812 } 2813 kmem_cache_destroy(metaslab_alloc_trace_cache); 2814 metaslab_alloc_trace_cache = NULL; 2815 } 2816 2817 /* 2818 * Add an allocation trace element to the allocation tracing list.
2819 */ 2820 static void 2821 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 2822 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 2823 int allocator) 2824 { 2825 if (!metaslab_trace_enabled) 2826 return; 2827 2828 /* 2829 * When the tracing list reaches its maximum we remove 2830 * the second element in the list before adding a new one. 2831 * By removing the second element we preserve the original 2832 * entry as a clue to what allocations steps have already been 2833 * performed. 2834 */ 2835 if (zal->zal_size == metaslab_trace_max_entries) { 2836 metaslab_alloc_trace_t *mat_next; 2837 #ifdef DEBUG 2838 panic("too many entries in allocation list"); 2839 #endif 2840 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 2841 zal->zal_size--; 2842 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 2843 list_remove(&zal->zal_list, mat_next); 2844 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 2845 } 2846 2847 metaslab_alloc_trace_t *mat = 2848 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2849 list_link_init(&mat->mat_list_node); 2850 mat->mat_mg = mg; 2851 mat->mat_msp = msp; 2852 mat->mat_size = psize; 2853 mat->mat_dva_id = dva_id; 2854 mat->mat_offset = offset; 2855 mat->mat_weight = 0; 2856 mat->mat_allocator = allocator; 2857 2858 if (msp != NULL) 2859 mat->mat_weight = msp->ms_weight; 2860 2861 /* 2862 * The list is part of the zio so locking is not required. Only 2863 * a single thread will perform allocations for a given zio. 2864 */ 2865 list_insert_tail(&zal->zal_list, mat); 2866 zal->zal_size++; 2867 2868 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 2869 } 2870 2871 void 2872 metaslab_trace_init(zio_alloc_list_t *zal) 2873 { 2874 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 2875 offsetof(metaslab_alloc_trace_t, mat_list_node)); 2876 zal->zal_size = 0; 2877 } 2878 2879 void 2880 metaslab_trace_fini(zio_alloc_list_t *zal) 2881 { 2882 metaslab_alloc_trace_t *mat; 2883 2884 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 2885 kmem_cache_free(metaslab_alloc_trace_cache, mat); 2886 list_destroy(&zal->zal_list); 2887 zal->zal_size = 0; 2888 } 2889 2890 /* 2891 * ========================================================================== 2892 * Metaslab block operations 2893 * ========================================================================== 2894 */ 2895 2896 static void 2897 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 2898 int allocator) 2899 { 2900 if (!(flags & METASLAB_ASYNC_ALLOC) || 2901 (flags & METASLAB_DONT_THROTTLE)) 2902 return; 2903 2904 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2905 if (!mg->mg_class->mc_alloc_throttle_enabled) 2906 return; 2907 2908 (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 2909 } 2910 2911 static void 2912 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 2913 { 2914 uint64_t max = mg->mg_max_alloc_queue_depth; 2915 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2916 while (cur < max) { 2917 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 2918 cur, cur + 1) == cur) { 2919 atomic_inc_64( 2920 &mg->mg_class->mc_alloc_max_slots[allocator]); 2921 return; 2922 } 2923 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2924 } 2925 } 2926 2927 void 2928 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 2929 int allocator, boolean_t io_complete) 2930 { 2931 if (!(flags & METASLAB_ASYNC_ALLOC) || 2932 
(flags & METASLAB_DONT_THROTTLE)) 2933 return; 2934 2935 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2936 if (!mg->mg_class->mc_alloc_throttle_enabled) 2937 return; 2938 2939 (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 2940 if (io_complete) 2941 metaslab_group_increment_qdepth(mg, allocator); 2942 } 2943 2944 void 2945 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 2946 int allocator) 2947 { 2948 #ifdef ZFS_DEBUG 2949 const dva_t *dva = bp->blk_dva; 2950 int ndvas = BP_GET_NDVAS(bp); 2951 2952 for (int d = 0; d < ndvas; d++) { 2953 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2954 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2955 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], 2956 tag)); 2957 } 2958 #endif 2959 } 2960 2961 static uint64_t 2962 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 2963 { 2964 uint64_t start; 2965 range_tree_t *rt = msp->ms_allocatable; 2966 metaslab_class_t *mc = msp->ms_group->mg_class; 2967 2968 VERIFY(!msp->ms_condensing); 2969 2970 start = mc->mc_ops->msop_alloc(msp, size); 2971 if (start != -1ULL) { 2972 metaslab_group_t *mg = msp->ms_group; 2973 vdev_t *vd = mg->mg_vd; 2974 2975 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 2976 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2977 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 2978 range_tree_remove(rt, start, size); 2979 2980 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 2981 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2982 2983 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 2984 2985 /* Track the last successful allocation */ 2986 msp->ms_alloc_txg = txg; 2987 metaslab_verify_space(msp, txg); 2988 } 2989 2990 /* 2991 * Now that we've attempted the allocation we need to update the 2992 * metaslab's maximum block size since it may have changed. 2993 */ 2994 msp->ms_max_size = metaslab_block_maxsize(msp); 2995 return (start); 2996 } 2997 2998 /* 2999 * Find the metaslab with the highest weight that is less than what we've 3000 * already tried. In the common case, this means that we will examine each 3001 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3002 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3003 * activated by another thread, and we fail to allocate from the metaslab we 3004 * have selected, we may not try the newly-activated metaslab, and instead 3005 * activate another metaslab. This is not optimal, but generally does not cause 3006 * any problems (a possible exception being if every metaslab is completely full 3007 * except for the newly-activated metaslab which we fail to examine). 3008 */ 3009 static metaslab_t * 3010 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3011 dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, 3012 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3013 { 3014 avl_index_t idx; 3015 avl_tree_t *t = &mg->mg_metaslab_tree; 3016 metaslab_t *msp = avl_find(t, search, &idx); 3017 if (msp == NULL) 3018 msp = avl_nearest(t, idx, AVL_AFTER); 3019 3020 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3021 int i; 3022 if (!metaslab_should_allocate(msp, asize)) { 3023 metaslab_trace_add(zal, mg, msp, asize, d, 3024 TRACE_TOO_SMALL, allocator); 3025 continue; 3026 } 3027 3028 /* 3029 * If the selected metaslab is condensing, skip it.
3030 */ 3031 if (msp->ms_condensing) 3032 continue; 3033 3034 *was_active = msp->ms_allocator != -1; 3035 /* 3036 * If we're activating as primary, this is our first allocation 3037 * from this disk, so we don't need to check how close we are. 3038 * If the metaslab under consideration was already active, 3039 * we're getting desperate enough to steal another allocator's 3040 * metaslab, so we still don't care about distances. 3041 */ 3042 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3043 break; 3044 3045 uint64_t target_distance = min_distance 3046 + (space_map_allocated(msp->ms_sm) != 0 ? 0 : 3047 min_distance >> 1); 3048 3049 for (i = 0; i < d; i++) { 3050 if (metaslab_distance(msp, &dva[i]) < target_distance) 3051 break; 3052 } 3053 if (i == d) 3054 break; 3055 } 3056 3057 if (msp != NULL) { 3058 search->ms_weight = msp->ms_weight; 3059 search->ms_start = msp->ms_start + 1; 3060 search->ms_allocator = msp->ms_allocator; 3061 search->ms_primary = msp->ms_primary; 3062 } 3063 return (msp); 3064 } 3065 3066 /* ARGSUSED */ 3067 static uint64_t 3068 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3069 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3070 int allocator) 3071 { 3072 metaslab_t *msp = NULL; 3073 uint64_t offset = -1ULL; 3074 uint64_t activation_weight; 3075 boolean_t tertiary = B_FALSE; 3076 3077 activation_weight = METASLAB_WEIGHT_PRIMARY; 3078 for (int i = 0; i < d; i++) { 3079 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3080 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3081 activation_weight = METASLAB_WEIGHT_SECONDARY; 3082 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3083 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3084 tertiary = B_TRUE; 3085 break; 3086 } 3087 } 3088 3089 /* 3090 * If we don't have enough metaslabs active to fill the entire array, we 3091 * just use the 0th slot. 3092 */ 3093 if (mg->mg_ms_ready < mg->mg_allocators * 2) { 3094 tertiary = B_FALSE; 3095 allocator = 0; 3096 } 3097 3098 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3099 3100 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3101 search->ms_weight = UINT64_MAX; 3102 search->ms_start = 0; 3103 /* 3104 * At the end of the metaslab tree are the already-active metaslabs, 3105 * first the primaries, then the secondaries. When we resume searching 3106 * through the tree, we need to consider ms_allocator and ms_primary so 3107 * we start in the location right after where we left off, and don't 3108 * accidentally loop forever considering the same metaslabs. 3109 */ 3110 search->ms_allocator = -1; 3111 search->ms_primary = B_TRUE; 3112 for (;;) { 3113 boolean_t was_active = B_FALSE; 3114 3115 mutex_enter(&mg->mg_lock); 3116 3117 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3118 mg->mg_primaries[allocator] != NULL) { 3119 msp = mg->mg_primaries[allocator]; 3120 was_active = B_TRUE; 3121 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3122 mg->mg_secondaries[allocator] != NULL && !tertiary) { 3123 msp = mg->mg_secondaries[allocator]; 3124 was_active = B_TRUE; 3125 } else { 3126 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3127 min_distance, asize, allocator, zal, search, 3128 &was_active); 3129 } 3130 3131 mutex_exit(&mg->mg_lock); 3132 if (msp == NULL) { 3133 kmem_free(search, sizeof (*search)); 3134 return (-1ULL); 3135 } 3136 3137 mutex_enter(&msp->ms_lock); 3138 /* 3139 * Ensure that the metaslab we have selected is still 3140 * capable of handling our request. 
It's possible that 3141 * another thread may have changed the weight while we 3142 * were blocked on the metaslab lock. We check the 3143 * active status first to see if we need to reselect 3144 * a new metaslab. 3145 */ 3146 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3147 mutex_exit(&msp->ms_lock); 3148 continue; 3149 } 3150 3151 /* 3152 * If the metaslab is freshly activated for an allocator that 3153 * isn't the one we're allocating from, or if it's a primary and 3154 * we're seeking a secondary (or vice versa), we go back and 3155 * select a new metaslab. 3156 */ 3157 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 3158 (msp->ms_allocator != -1) && 3159 (msp->ms_allocator != allocator || ((activation_weight == 3160 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 3161 mutex_exit(&msp->ms_lock); 3162 continue; 3163 } 3164 3165 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 3166 metaslab_passivate(msp, msp->ms_weight & 3167 ~METASLAB_WEIGHT_CLAIM); 3168 mutex_exit(&msp->ms_lock); 3169 continue; 3170 } 3171 3172 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 3173 mutex_exit(&msp->ms_lock); 3174 continue; 3175 } 3176 3177 msp->ms_selected_txg = txg; 3178 3179 /* 3180 * Now that we have the lock, recheck to see if we should 3181 * continue to use this metaslab for this allocation. The 3182 * metaslab is now loaded so metaslab_should_allocate() can 3183 * accurately determine if the allocation attempt should 3184 * proceed. 3185 */ 3186 if (!metaslab_should_allocate(msp, asize)) { 3187 /* Passivate this metaslab and select a new one. */ 3188 metaslab_trace_add(zal, mg, msp, asize, d, 3189 TRACE_TOO_SMALL, allocator); 3190 goto next; 3191 } 3192 3193 /* 3194 * If this metaslab is currently condensing then pick again as 3195 * we can't manipulate this metaslab until it's committed 3196 * to disk. 3197 */ 3198 if (msp->ms_condensing) { 3199 metaslab_trace_add(zal, mg, msp, asize, d, 3200 TRACE_CONDENSING, allocator); 3201 metaslab_passivate(msp, msp->ms_weight & 3202 ~METASLAB_ACTIVE_MASK); 3203 mutex_exit(&msp->ms_lock); 3204 continue; 3205 } 3206 3207 offset = metaslab_block_alloc(msp, asize, txg); 3208 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 3209 3210 if (offset != -1ULL) { 3211 /* Proactively passivate the metaslab, if needed */ 3212 metaslab_segment_may_passivate(msp); 3213 break; 3214 } 3215 next: 3216 ASSERT(msp->ms_loaded); 3217 3218 /* 3219 * We were unable to allocate from this metaslab so determine 3220 * a new weight for this metaslab. Now that we have loaded 3221 * the metaslab we can provide a better hint to the metaslab 3222 * selector. 3223 * 3224 * For space-based metaslabs, we use the maximum block size. 3225 * This information is only available when the metaslab 3226 * is loaded and is more accurate than the generic free 3227 * space weight that was calculated by metaslab_weight(). 3228 * This information allows us to quickly compare the maximum 3229 * available allocation in the metaslab to the allocation 3230 * size being requested. 3231 * 3232 * For segment-based metaslabs, determine the new weight 3233 * based on the highest bucket in the range tree. We 3234 * explicitly use the loaded segment weight (i.e. the range 3235 * tree histogram) since it contains the space that is 3236 * currently available for allocation and is accurate 3237 * even within a sync pass.
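 * For instance (illustrative only): if the largest contiguous free region
 * left in a space-based metaslab is 32K, we passivate it with a 32K weight,
 * and a later 128K request will skip it in metaslab_should_allocate()
 * rather than retrying it.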
3238 */ 3239 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3240 uint64_t weight = metaslab_block_maxsize(msp); 3241 WEIGHT_SET_SPACEBASED(weight); 3242 metaslab_passivate(msp, weight); 3243 } else { 3244 metaslab_passivate(msp, 3245 metaslab_weight_from_range_tree(msp)); 3246 } 3247 3248 /* 3249 * We have just failed an allocation attempt, check 3250 * that metaslab_should_allocate() agrees. Otherwise, 3251 * we may end up in an infinite loop retrying the same 3252 * metaslab. 3253 */ 3254 ASSERT(!metaslab_should_allocate(msp, asize)); 3255 mutex_exit(&msp->ms_lock); 3256 } 3257 mutex_exit(&msp->ms_lock); 3258 kmem_free(search, sizeof (*search)); 3259 return (offset); 3260 } 3261 3262 static uint64_t 3263 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3264 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3265 int allocator) 3266 { 3267 uint64_t offset; 3268 ASSERT(mg->mg_initialized); 3269 3270 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, 3271 min_distance, dva, d, allocator); 3272 3273 mutex_enter(&mg->mg_lock); 3274 if (offset == -1ULL) { 3275 mg->mg_failed_allocations++; 3276 metaslab_trace_add(zal, mg, NULL, asize, d, 3277 TRACE_GROUP_FAILURE, allocator); 3278 if (asize == SPA_GANGBLOCKSIZE) { 3279 /* 3280 * This metaslab group was unable to allocate 3281 * the minimum gang block size so it must be out of 3282 * space. We must notify the allocation throttle 3283 * to start skipping allocation attempts to this 3284 * metaslab group until more space becomes available. 3285 * Note: this failure cannot be caused by the 3286 * allocation throttle since the allocation throttle 3287 * is only responsible for skipping devices and 3288 * not failing block allocations. 3289 */ 3290 mg->mg_no_free_space = B_TRUE; 3291 } 3292 } 3293 mg->mg_allocations++; 3294 mutex_exit(&mg->mg_lock); 3295 return (offset); 3296 } 3297 3298 /* 3299 * If we have to write a ditto block (i.e. more than one DVA for a given BP) 3300 * on the same vdev as an existing DVA of this BP, then try to allocate it 3301 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the 3302 * existing DVAs. 3303 */ 3304 int ditto_same_vdev_distance_shift = 3; 3305 3306 /* 3307 * Allocate a block for the specified i/o. 3308 */ 3309 int 3310 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3311 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3312 zio_alloc_list_t *zal, int allocator) 3313 { 3314 metaslab_group_t *mg, *rotor; 3315 vdev_t *vd; 3316 boolean_t try_hard = B_FALSE; 3317 3318 ASSERT(!DVA_IS_VALID(&dva[d])); 3319 3320 /* 3321 * For testing, make some blocks above a certain size be gang blocks. 3322 */ 3323 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3324 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3325 allocator); 3326 return (SET_ERROR(ENOSPC)); 3327 } 3328 3329 /* 3330 * Start at the rotor and loop through all mgs until we find something. 3331 * Note that there's no locking on mc_rotor or mc_aliquot because 3332 * nothing actually breaks if we miss a few updates -- we just won't 3333 * allocate quite as evenly. It all balances out over time. 3334 * 3335 * If we are doing ditto or log blocks, try to spread them across 3336 * consecutive vdevs. If we're forced to reuse a vdev before we've 3337 * allocated all of our ditto blocks, then try and spread them out on 3338 * that vdev as much as possible. 
If it turns out to not be possible, 3339 * gradually lower our standards until anything becomes acceptable. 3340 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3341 * gives us hope of containing our fault domains to something we're 3342 * able to reason about. Otherwise, any two top-level vdev failures 3343 * will guarantee the loss of data. With consecutive allocation, 3344 * only two adjacent top-level vdev failures will result in data loss. 3345 * 3346 * If we are doing gang blocks (hintdva is non-NULL), try to keep 3347 * ourselves on the same vdev as our gang block header. That 3348 * way, we can hope for locality in vdev_cache, plus it makes our 3349 * fault domains something tractable. 3350 */ 3351 if (hintdva) { 3352 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3353 3354 /* 3355 * It's possible the vdev we're using as the hint no 3356 * longer exists or its mg has been closed (e.g. by 3357 * device removal). Consult the rotor when 3358 * all else fails. 3359 */ 3360 if (vd != NULL && vd->vdev_mg != NULL) { 3361 mg = vd->vdev_mg; 3362 3363 if (flags & METASLAB_HINTBP_AVOID && 3364 mg->mg_next != NULL) 3365 mg = mg->mg_next; 3366 } else { 3367 mg = mc->mc_rotor; 3368 } 3369 } else if (d != 0) { 3370 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3371 mg = vd->vdev_mg->mg_next; 3372 } else { 3373 mg = mc->mc_rotor; 3374 } 3375 3376 /* 3377 * If the hint put us into the wrong metaslab class, or into a 3378 * metaslab group that has been passivated, just follow the rotor. 3379 */ 3380 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3381 mg = mc->mc_rotor; 3382 3383 rotor = mg; 3384 top: 3385 do { 3386 boolean_t allocatable; 3387 3388 ASSERT(mg->mg_activation_count == 1); 3389 vd = mg->mg_vd; 3390 3391 /* 3392 * Don't allocate from faulted devices. 3393 */ 3394 if (try_hard) { 3395 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3396 allocatable = vdev_allocatable(vd); 3397 spa_config_exit(spa, SCL_ZIO, FTAG); 3398 } else { 3399 allocatable = vdev_allocatable(vd); 3400 } 3401 3402 /* 3403 * Determine if the selected metaslab group is eligible 3404 * for allocations. If we're ganging then don't allow 3405 * this metaslab group to skip allocations since that would 3406 * inadvertently return ENOSPC and suspend the pool 3407 * even though space is still available. 3408 */ 3409 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3410 allocatable = metaslab_group_allocatable(mg, rotor, 3411 psize, allocator); 3412 } 3413 3414 if (!allocatable) { 3415 metaslab_trace_add(zal, mg, NULL, psize, d, 3416 TRACE_NOT_ALLOCATABLE, allocator); 3417 goto next; 3418 } 3419 3420 ASSERT(mg->mg_initialized); 3421 3422 /* 3423 * Avoid writing single-copy data to a failing, 3424 * non-redundant vdev, unless we've already tried all 3425 * other vdevs. 3426 */ 3427 if ((vd->vdev_stat.vs_write_errors > 0 || 3428 vd->vdev_state < VDEV_STATE_HEALTHY) && 3429 d == 0 && !try_hard && vd->vdev_children == 0) { 3430 metaslab_trace_add(zal, mg, NULL, psize, d, 3431 TRACE_VDEV_ERROR, allocator); 3432 goto next; 3433 } 3434 3435 ASSERT(mg->mg_class == mc); 3436 3437 /* 3438 * If we don't need to try hard, then require that the 3439 * block be 1/8th of the device away from any other DVAs 3440 * in this BP. If we are trying hard, allow any offset 3441 * to be used (distance=0). 
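 * As a rough illustration, on a 1TB top-level vdev this works out to a
 * minimum separation of 128GB between DVAs of the same block pointer; if
 * that distance would be no larger than a single metaslab, it is simply
 * treated as 0.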
3442 */ 3443 uint64_t distance = 0; 3444 if (!try_hard) { 3445 distance = vd->vdev_asize >> 3446 ditto_same_vdev_distance_shift; 3447 if (distance <= (1ULL << vd->vdev_ms_shift)) 3448 distance = 0; 3449 } 3450 3451 uint64_t asize = vdev_psize_to_asize(vd, psize); 3452 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3453 3454 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3455 distance, dva, d, allocator); 3456 3457 if (offset != -1ULL) { 3458 /* 3459 * If we've just selected this metaslab group, 3460 * figure out whether the corresponding vdev is 3461 * over- or under-used relative to the pool, 3462 * and set an allocation bias to even it out. 3463 */ 3464 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3465 vdev_stat_t *vs = &vd->vdev_stat; 3466 int64_t vu, cu; 3467 3468 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3469 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3470 3471 /* 3472 * Calculate how much more or less we should 3473 * try to allocate from this device during 3474 * this iteration around the rotor. 3475 * For example, if a device is 80% full 3476 * and the pool is 20% full then we should 3477 * reduce allocations by 60% on this device. 3478 * 3479 * mg_bias = (20 - 80) * 512K / 100 = -307K 3480 * 3481 * This reduces allocations by 307K for this 3482 * iteration. 3483 */ 3484 mg->mg_bias = ((cu - vu) * 3485 (int64_t)mg->mg_aliquot) / 100; 3486 } else if (!metaslab_bias_enabled) { 3487 mg->mg_bias = 0; 3488 } 3489 3490 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3491 mg->mg_aliquot + mg->mg_bias) { 3492 mc->mc_rotor = mg->mg_next; 3493 mc->mc_aliquot = 0; 3494 } 3495 3496 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3497 DVA_SET_OFFSET(&dva[d], offset); 3498 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3499 DVA_SET_ASIZE(&dva[d], asize); 3500 3501 return (0); 3502 } 3503 next: 3504 mc->mc_rotor = mg->mg_next; 3505 mc->mc_aliquot = 0; 3506 } while ((mg = mg->mg_next) != rotor); 3507 3508 /* 3509 * If we haven't tried hard, do so now. 
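 * Trying hard means re-checking device allocatability with the config lock
 * held, no longer filtering metaslab groups through
 * metaslab_group_allocatable(), no longer avoiding failing non-redundant
 * vdevs, and allowing any DVA distance (see above).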
3510 */ 3511 if (!try_hard) { 3512 try_hard = B_TRUE; 3513 goto top; 3514 } 3515 3516 bzero(&dva[d], sizeof (dva_t)); 3517 3518 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 3519 return (SET_ERROR(ENOSPC)); 3520 } 3521 3522 void 3523 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3524 boolean_t checkpoint) 3525 { 3526 metaslab_t *msp; 3527 spa_t *spa = vd->vdev_spa; 3528 3529 ASSERT(vdev_is_concrete(vd)); 3530 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3531 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3532 3533 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3534 3535 VERIFY(!msp->ms_condensing); 3536 VERIFY3U(offset, >=, msp->ms_start); 3537 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 3538 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3539 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 3540 3541 metaslab_check_free_impl(vd, offset, asize); 3542 3543 mutex_enter(&msp->ms_lock); 3544 if (range_tree_is_empty(msp->ms_freeing) && 3545 range_tree_is_empty(msp->ms_checkpointing)) { 3546 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 3547 } 3548 3549 if (checkpoint) { 3550 ASSERT(spa_has_checkpoint(spa)); 3551 range_tree_add(msp->ms_checkpointing, offset, asize); 3552 } else { 3553 range_tree_add(msp->ms_freeing, offset, asize); 3554 } 3555 mutex_exit(&msp->ms_lock); 3556 } 3557 3558 /* ARGSUSED */ 3559 void 3560 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3561 uint64_t size, void *arg) 3562 { 3563 boolean_t *checkpoint = arg; 3564 3565 ASSERT3P(checkpoint, !=, NULL); 3566 3567 if (vd->vdev_ops->vdev_op_remap != NULL) 3568 vdev_indirect_mark_obsolete(vd, offset, size); 3569 else 3570 metaslab_free_impl(vd, offset, size, *checkpoint); 3571 } 3572 3573 static void 3574 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 3575 boolean_t checkpoint) 3576 { 3577 spa_t *spa = vd->vdev_spa; 3578 3579 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3580 3581 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 3582 return; 3583 3584 if (spa->spa_vdev_removal != NULL && 3585 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 3586 vdev_is_concrete(vd)) { 3587 /* 3588 * Note: we check if the vdev is concrete because when 3589 * we complete the removal, we first change the vdev to be 3590 * an indirect vdev (in open context), and then (in syncing 3591 * context) clear spa_vdev_removal. 3592 */ 3593 free_from_removing_vdev(vd, offset, size); 3594 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 3595 vdev_indirect_mark_obsolete(vd, offset, size); 3596 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3597 metaslab_free_impl_cb, &checkpoint); 3598 } else { 3599 metaslab_free_concrete(vd, offset, size, checkpoint); 3600 } 3601 } 3602 3603 typedef struct remap_blkptr_cb_arg { 3604 blkptr_t *rbca_bp; 3605 spa_remap_cb_t rbca_cb; 3606 vdev_t *rbca_remap_vd; 3607 uint64_t rbca_remap_offset; 3608 void *rbca_cb_arg; 3609 } remap_blkptr_cb_arg_t; 3610 3611 void 3612 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3613 uint64_t size, void *arg) 3614 { 3615 remap_blkptr_cb_arg_t *rbca = arg; 3616 blkptr_t *bp = rbca->rbca_bp; 3617 3618 /* We can not remap split blocks. 
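 * (A "split block" here is an indirect DVA whose mapping spans more than
 * one segment; see the spa_remap_blkptr() comment below.)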
*/ 3619 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 3620 return; 3621 ASSERT0(inner_offset); 3622 3623 if (rbca->rbca_cb != NULL) { 3624 /* 3625 * At this point we know that we are not handling split 3626 * blocks and we invoke the callback on the previous 3627 * vdev which must be indirect. 3628 */ 3629 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 3630 3631 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 3632 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 3633 3634 /* set up remap_blkptr_cb_arg for the next call */ 3635 rbca->rbca_remap_vd = vd; 3636 rbca->rbca_remap_offset = offset; 3637 } 3638 3639 /* 3640 * The phys birth time is that of dva[0]. This ensures that we know 3641 * when each dva was written, so that resilver can determine which 3642 * blocks need to be scrubbed (i.e. those written during the time 3643 * the vdev was offline). It also ensures that the key used in 3644 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 3645 * we didn't change the phys_birth, a lookup in the ARC for a 3646 * remapped BP could find the data that was previously stored at 3647 * this vdev + offset. 3648 */ 3649 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 3650 DVA_GET_VDEV(&bp->blk_dva[0])); 3651 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 3652 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 3653 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 3654 3655 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 3656 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 3657 } 3658 3659 /* 3660 * If the block pointer contains any indirect DVAs, modify them to refer to 3661 * concrete DVAs. Note that this will sometimes not be possible, leaving 3662 * the indirect DVA in place. This happens if the indirect DVA spans multiple 3663 * segments in the mapping (i.e. it is a "split block"). 3664 * 3665 * If the BP was remapped, the callback is invoked on the original dva (note 3666 * that the callback can be called multiple times if the original indirect DVA 3667 * refers to another indirect DVA, etc.). 3668 * 3669 * Returns TRUE if the BP was remapped. 3670 */ 3671 boolean_t 3672 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 3673 { 3674 remap_blkptr_cb_arg_t rbca; 3675 3676 if (!zfs_remap_blkptr_enable) 3677 return (B_FALSE); 3678 3679 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 3680 return (B_FALSE); 3681 3682 /* 3683 * Dedup BPs cannot be remapped, because ddt_phys_select() depends 3684 * on DVA[0] being the same in the BP as in the DDT (dedup table). 3685 */ 3686 if (BP_GET_DEDUP(bp)) 3687 return (B_FALSE); 3688 3689 /* 3690 * Gang blocks cannot be remapped, because 3691 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 3692 * the BP used to read the gang block header (GBH) being the same 3693 * as the DVA[0] that we allocated for the GBH. 3694 */ 3695 if (BP_IS_GANG(bp)) 3696 return (B_FALSE); 3697 3698 /* 3699 * Embedded BPs have no DVA to remap. 3700 */ 3701 if (BP_GET_NDVAS(bp) < 1) 3702 return (B_FALSE); 3703 3704 /* 3705 * Note: we only remap dva[0]. If we remapped other dvas, we 3706 * would no longer know what their phys birth txg is.
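 * Any additional copies (dva[1], dva[2]) are left untouched; if they sit
 * on an indirect vdev they are presumably still read through its mapping
 * rather than a concrete vdev.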
3707 */ 3708 dva_t *dva = &bp->blk_dva[0]; 3709 3710 uint64_t offset = DVA_GET_OFFSET(dva); 3711 uint64_t size = DVA_GET_ASIZE(dva); 3712 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 3713 3714 if (vd->vdev_ops->vdev_op_remap == NULL) 3715 return (B_FALSE); 3716 3717 rbca.rbca_bp = bp; 3718 rbca.rbca_cb = callback; 3719 rbca.rbca_remap_vd = vd; 3720 rbca.rbca_remap_offset = offset; 3721 rbca.rbca_cb_arg = arg; 3722 3723 /* 3724 * remap_blkptr_cb() will be called in order for each level of 3725 * indirection, until a concrete vdev is reached or a split block is 3726 * encountered. old_vd and old_offset are updated within the callback 3727 * as we go from the one indirect vdev to the next one (either concrete 3728 * or indirect again) in that order. 3729 */ 3730 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 3731 3732 /* Check if the DVA wasn't remapped because it is a split block */ 3733 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 3734 return (B_FALSE); 3735 3736 return (B_TRUE); 3737 } 3738 3739 /* 3740 * Undo the allocation of a DVA which happened in the given transaction group. 3741 */ 3742 void 3743 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 3744 { 3745 metaslab_t *msp; 3746 vdev_t *vd; 3747 uint64_t vdev = DVA_GET_VDEV(dva); 3748 uint64_t offset = DVA_GET_OFFSET(dva); 3749 uint64_t size = DVA_GET_ASIZE(dva); 3750 3751 ASSERT(DVA_IS_VALID(dva)); 3752 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3753 3754 if (txg > spa_freeze_txg(spa)) 3755 return; 3756 3757 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 3758 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 3759 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 3760 (u_longlong_t)vdev, (u_longlong_t)offset); 3761 ASSERT(0); 3762 return; 3763 } 3764 3765 ASSERT(!vd->vdev_removing); 3766 ASSERT(vdev_is_concrete(vd)); 3767 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 3768 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 3769 3770 if (DVA_GET_GANG(dva)) 3771 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3772 3773 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3774 3775 mutex_enter(&msp->ms_lock); 3776 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 3777 offset, size); 3778 3779 VERIFY(!msp->ms_condensing); 3780 VERIFY3U(offset, >=, msp->ms_start); 3781 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 3782 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 3783 msp->ms_size); 3784 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3785 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3786 range_tree_add(msp->ms_allocatable, offset, size); 3787 mutex_exit(&msp->ms_lock); 3788 } 3789 3790 /* 3791 * Free the block represented by the given DVA. 3792 */ 3793 void 3794 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 3795 { 3796 uint64_t vdev = DVA_GET_VDEV(dva); 3797 uint64_t offset = DVA_GET_OFFSET(dva); 3798 uint64_t size = DVA_GET_ASIZE(dva); 3799 vdev_t *vd = vdev_lookup_top(spa, vdev); 3800 3801 ASSERT(DVA_IS_VALID(dva)); 3802 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3803 3804 if (DVA_GET_GANG(dva)) { 3805 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3806 } 3807 3808 metaslab_free_impl(vd, offset, size, checkpoint); 3809 } 3810 3811 /* 3812 * Reserve some allocation slots. The reservation system must be called 3813 * before we call into the allocator. 
If there aren't any available slots 3814 * then the I/O will be throttled until an I/O completes and its slots are 3815 * freed up. The function returns true if it was successful in placing 3816 * the reservation. 3817 */ 3818 boolean_t 3819 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 3820 zio_t *zio, int flags) 3821 { 3822 uint64_t available_slots = 0; 3823 boolean_t slot_reserved = B_FALSE; 3824 uint64_t max = mc->mc_alloc_max_slots[allocator]; 3825 3826 ASSERT(mc->mc_alloc_throttle_enabled); 3827 mutex_enter(&mc->mc_lock); 3828 3829 uint64_t reserved_slots = 3830 refcount_count(&mc->mc_alloc_slots[allocator]); 3831 if (reserved_slots < max) 3832 available_slots = max - reserved_slots; 3833 3834 if (slots <= available_slots || GANG_ALLOCATION(flags)) { 3835 /* 3836 * We reserve the slots individually so that we can unreserve 3837 * them individually when an I/O completes. 3838 */ 3839 for (int d = 0; d < slots; d++) { 3840 reserved_slots = 3841 refcount_add(&mc->mc_alloc_slots[allocator], 3842 zio); 3843 } 3844 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 3845 slot_reserved = B_TRUE; 3846 } 3847 3848 mutex_exit(&mc->mc_lock); 3849 return (slot_reserved); 3850 } 3851 3852 void 3853 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 3854 int allocator, zio_t *zio) 3855 { 3856 ASSERT(mc->mc_alloc_throttle_enabled); 3857 mutex_enter(&mc->mc_lock); 3858 for (int d = 0; d < slots; d++) { 3859 (void) refcount_remove(&mc->mc_alloc_slots[allocator], 3860 zio); 3861 } 3862 mutex_exit(&mc->mc_lock); 3863 } 3864 3865 static int 3866 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 3867 uint64_t txg) 3868 { 3869 metaslab_t *msp; 3870 spa_t *spa = vd->vdev_spa; 3871 int error = 0; 3872 3873 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 3874 return (ENXIO); 3875 3876 ASSERT3P(vd->vdev_ms, !=, NULL); 3877 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3878 3879 mutex_enter(&msp->ms_lock); 3880 3881 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 3882 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 3883 /* 3884 * No need to fail in that case; someone else has activated the 3885 * metaslab, but that doesn't preclude us from using it. 
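 * (metaslab_activate() reports that case as EBUSY, which is converted to
 * success immediately below.)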
3886 */ 3887 if (error == EBUSY) 3888 error = 0; 3889 3890 if (error == 0 && 3891 !range_tree_contains(msp->ms_allocatable, offset, size)) 3892 error = SET_ERROR(ENOENT); 3893 3894 if (error || txg == 0) { /* txg == 0 indicates dry run */ 3895 mutex_exit(&msp->ms_lock); 3896 return (error); 3897 } 3898 3899 VERIFY(!msp->ms_condensing); 3900 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3901 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3902 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 3903 msp->ms_size); 3904 range_tree_remove(msp->ms_allocatable, offset, size); 3905 3906 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 3907 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3908 vdev_dirty(vd, VDD_METASLAB, msp, txg); 3909 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 3910 offset, size); 3911 } 3912 3913 mutex_exit(&msp->ms_lock); 3914 3915 return (0); 3916 } 3917 3918 typedef struct metaslab_claim_cb_arg_t { 3919 uint64_t mcca_txg; 3920 int mcca_error; 3921 } metaslab_claim_cb_arg_t; 3922 3923 /* ARGSUSED */ 3924 static void 3925 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3926 uint64_t size, void *arg) 3927 { 3928 metaslab_claim_cb_arg_t *mcca_arg = arg; 3929 3930 if (mcca_arg->mcca_error == 0) { 3931 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 3932 size, mcca_arg->mcca_txg); 3933 } 3934 } 3935 3936 int 3937 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 3938 { 3939 if (vd->vdev_ops->vdev_op_remap != NULL) { 3940 metaslab_claim_cb_arg_t arg; 3941 3942 /* 3943 * Only zdb(1M) can claim on indirect vdevs. This is used 3944 * to detect leaks of mapped space (that are not accounted 3945 * for in the obsolete counts, spacemap, or bpobj). 3946 */ 3947 ASSERT(!spa_writeable(vd->vdev_spa)); 3948 arg.mcca_error = 0; 3949 arg.mcca_txg = txg; 3950 3951 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3952 metaslab_claim_impl_cb, &arg); 3953 3954 if (arg.mcca_error == 0) { 3955 arg.mcca_error = metaslab_claim_concrete(vd, 3956 offset, size, txg); 3957 } 3958 return (arg.mcca_error); 3959 } else { 3960 return (metaslab_claim_concrete(vd, offset, size, txg)); 3961 } 3962 } 3963 3964 /* 3965 * Intent log support: upon opening the pool after a crash, notify the SPA 3966 * of blocks that the intent log has allocated for immediate write, but 3967 * which are still considered free by the SPA because the last transaction 3968 * group didn't commit yet. 
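 *
 * Sketch of the assumed call path (the ZIL/zio side is not in this file):
 * the ZIL claim pass at pool open issues a ZIO_TYPE_CLAIM zio for each
 * intent-log block, which reaches this code via metaslab_claim() so the
 * block is re-reserved before new allocations can reuse its space.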
3969 */ 3970 static int 3971 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 3972 { 3973 uint64_t vdev = DVA_GET_VDEV(dva); 3974 uint64_t offset = DVA_GET_OFFSET(dva); 3975 uint64_t size = DVA_GET_ASIZE(dva); 3976 vdev_t *vd; 3977 3978 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 3979 return (SET_ERROR(ENXIO)); 3980 } 3981 3982 ASSERT(DVA_IS_VALID(dva)); 3983 3984 if (DVA_GET_GANG(dva)) 3985 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3986 3987 return (metaslab_claim_impl(vd, offset, size, txg)); 3988 } 3989 3990 int 3991 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 3992 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 3993 zio_alloc_list_t *zal, zio_t *zio, int allocator) 3994 { 3995 dva_t *dva = bp->blk_dva; 3996 dva_t *hintdva = hintbp->blk_dva; 3997 int error = 0; 3998 3999 ASSERT(bp->blk_birth == 0); 4000 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 4001 4002 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4003 4004 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 4005 spa_config_exit(spa, SCL_ALLOC, FTAG); 4006 return (SET_ERROR(ENOSPC)); 4007 } 4008 4009 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 4010 ASSERT(BP_GET_NDVAS(bp) == 0); 4011 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 4012 ASSERT3P(zal, !=, NULL); 4013 4014 for (int d = 0; d < ndvas; d++) { 4015 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 4016 txg, flags, zal, allocator); 4017 if (error != 0) { 4018 for (d--; d >= 0; d--) { 4019 metaslab_unalloc_dva(spa, &dva[d], txg); 4020 metaslab_group_alloc_decrement(spa, 4021 DVA_GET_VDEV(&dva[d]), zio, flags, 4022 allocator, B_FALSE); 4023 bzero(&dva[d], sizeof (dva_t)); 4024 } 4025 spa_config_exit(spa, SCL_ALLOC, FTAG); 4026 return (error); 4027 } else { 4028 /* 4029 * Update the metaslab group's queue depth 4030 * based on the newly allocated dva. 4031 */ 4032 metaslab_group_alloc_increment(spa, 4033 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 4034 } 4035 4036 } 4037 ASSERT(error == 0); 4038 ASSERT(BP_GET_NDVAS(bp) == ndvas); 4039 4040 spa_config_exit(spa, SCL_ALLOC, FTAG); 4041 4042 BP_SET_BIRTH(bp, txg, txg); 4043 4044 return (0); 4045 } 4046 4047 void 4048 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 4049 { 4050 const dva_t *dva = bp->blk_dva; 4051 int ndvas = BP_GET_NDVAS(bp); 4052 4053 ASSERT(!BP_IS_HOLE(bp)); 4054 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 4055 4056 /* 4057 * If we have a checkpoint for the pool we need to make sure that 4058 * the blocks that we free that are part of the checkpoint won't be 4059 * reused until the checkpoint is discarded or we revert to it. 4060 * 4061 * The checkpoint flag is passed down the metaslab_free code path 4062 * and is set whenever we want to add a block to the checkpoint's 4063 * accounting. That is, we "checkpoint" blocks that existed at the 4064 * time the checkpoint was created and are therefore referenced by 4065 * the checkpointed uberblock. 4066 * 4067 * Note that we don't checkpoint any blocks if the current 4068 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 4069 * normally as they will be referenced by the checkpointed uberblock. 4070 */ 4071 boolean_t checkpoint = B_FALSE; 4072 if (bp->blk_birth <= spa->spa_checkpoint_txg && 4073 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 4074 /* 4075 * At this point, if the block is part of the checkpoint 4076 * there is no way it was created in the current txg.
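 *
 * Worked example (hypothetical txgs): with spa_checkpoint_txg == 100, a
 * block born in txg 80 and freed while syncing txg 120 is added to the
 * checkpoint accounting (ms_checkpointing), whereas a block born in
 * txg 110 takes the normal free path because the checkpointed uberblock
 * cannot reference it.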
4077 */ 4078 ASSERT(!now); 4079 ASSERT3U(spa_syncing_txg(spa), ==, txg); 4080 checkpoint = B_TRUE; 4081 } 4082 4083 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 4084 4085 for (int d = 0; d < ndvas; d++) { 4086 if (now) { 4087 metaslab_unalloc_dva(spa, &dva[d], txg); 4088 } else { 4089 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 4090 metaslab_free_dva(spa, &dva[d], checkpoint); 4091 } 4092 } 4093 4094 spa_config_exit(spa, SCL_FREE, FTAG); 4095 } 4096 4097 int 4098 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 4099 { 4100 const dva_t *dva = bp->blk_dva; 4101 int ndvas = BP_GET_NDVAS(bp); 4102 int error = 0; 4103 4104 ASSERT(!BP_IS_HOLE(bp)); 4105 4106 if (txg != 0) { 4107 /* 4108 * First do a dry run to make sure all DVAs are claimable, 4109 * so we don't have to unwind from partial failures below. 4110 */ 4111 if ((error = metaslab_claim(spa, bp, 0)) != 0) 4112 return (error); 4113 } 4114 4115 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4116 4117 for (int d = 0; d < ndvas; d++) 4118 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 4119 break; 4120 4121 spa_config_exit(spa, SCL_ALLOC, FTAG); 4122 4123 ASSERT(error == 0 || txg == 0); 4124 4125 return (error); 4126 } 4127 4128 /* ARGSUSED */ 4129 static void 4130 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 4131 uint64_t size, void *arg) 4132 { 4133 if (vd->vdev_ops == &vdev_indirect_ops) 4134 return; 4135 4136 metaslab_check_free_impl(vd, offset, size); 4137 } 4138 4139 static void 4140 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 4141 { 4142 metaslab_t *msp; 4143 spa_t *spa = vd->vdev_spa; 4144 4145 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4146 return; 4147 4148 if (vd->vdev_ops->vdev_op_remap != NULL) { 4149 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4150 metaslab_check_free_impl_cb, NULL); 4151 return; 4152 } 4153 4154 ASSERT(vdev_is_concrete(vd)); 4155 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 4156 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4157 4158 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4159 4160 mutex_enter(&msp->ms_lock); 4161 if (msp->ms_loaded) 4162 range_tree_verify(msp->ms_allocatable, offset, size); 4163 4164 range_tree_verify(msp->ms_freeing, offset, size); 4165 range_tree_verify(msp->ms_checkpointing, offset, size); 4166 range_tree_verify(msp->ms_freed, offset, size); 4167 for (int j = 0; j < TXG_DEFER_SIZE; j++) 4168 range_tree_verify(msp->ms_defer[j], offset, size); 4169 mutex_exit(&msp->ms_lock); 4170 } 4171 4172 void 4173 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 4174 { 4175 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4176 return; 4177 4178 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4179 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 4180 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 4181 vdev_t *vd = vdev_lookup_top(spa, vdev); 4182 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 4183 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 4184 4185 if (DVA_GET_GANG(&bp->blk_dva[i])) 4186 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4187 4188 ASSERT3P(vd, !=, NULL); 4189 4190 metaslab_check_free_impl(vd, offset, size); 4191 } 4192 spa_config_exit(spa, SCL_VDEV, FTAG); 4193 } 4194
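/*
 * Note: metaslab_check_free() and metaslab_check_free_impl() above are
 * debugging aids; they verify nothing unless ZFS_DEBUG_ZIO_FREE is set in
 * zfs_flags, in which case each freed range is cross-checked against the
 * metaslab's allocatable, freeing, checkpointing, freed and defer trees
 * (to catch, e.g., frees of space that is already free or already queued).
 */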