/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2017, Intel Corporation.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Since we can touch multiple metaslabs (and their respective space maps)
 * with each transaction group, we benefit from having a smaller space map
 * block size since it allows us to issue more I/O operations scattered
 * around the disk.
 */
int zfs_metaslab_sm_blksz = (1 << 12);

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;
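
/*
 * For example, with the default threshold of 4 and the 4K space map block
 * size above (assuming ashift <= 12), condensing is only considered once
 * the uncondensed space map occupies more than 16K on disk; below that we
 * cannot be sure that rewriting it would reduce the number of blocks (and
 * hence IOPs) needed to read it.
 */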

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status, allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set, all metaslabs are loaded when the pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set, metaslabs are prevented from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
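
/*
 * (TXG_SIZE being 4, the default above works out to 8 txgs of inactivity
 * before a loaded metaslab becomes a candidate for unloading.)
 */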

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached, allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);

kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < spa->spa_alloc_count; i++)
		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (zfs_refcount_t));
	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (uint64_t));
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	spa_t *spa = mc->mc_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
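/*
 * For example (hypothetical numbers): a class with two equally sized
 * groups whose fragmentation metrics are 10% and 30% reports
 * (10 + 30) / 2 = 20%; if the first group were twice the size of the
 * second, the space-weighted result would be (2 * 10 + 30) / 3 ~= 17%.
 */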
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by the EFI system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = (const metaslab_t *)x1;
	const metaslab_t *m2 = (const metaslab_t *)x2;

	int sort1 = 0;
	int sort2 = 0;
	if (m1->ms_allocator != -1 && m1->ms_primary)
		sort1 = 1;
	else if (m1->ms_allocator != -1 && !m1->ms_primary)
		sort1 = 2;
	if (m2->ms_allocator != -1 && m2->ms_primary)
		sort2 = 1;
	else if (m2->ms_allocator != -1 && !m2->ms_primary)
		sort2 = 2;

	/*
	 * Sort inactive metaslabs first, then primaries, then secondaries.
	 * When selecting a metaslab to allocate from, an allocator first
	 * tries its primary, then secondary active metaslab. If it doesn't
	 * have active metaslabs, or can't allocate from them, it searches
	 * for an inactive metaslab to activate. If it can't find a suitable
	 * one, it will steal a primary or secondary metaslab from another
	 * allocator.
	 */
	if (sort1 < sort2)
		return (-1);
	if (sort1 > sort2)
		return (1);

	int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
	if (likely(cmp))
		return (cmp);

	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);

	return (AVL_CMP(m1->ms_start, m2->ms_start));
}

uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
	return (msp->ms_allocated_space);
}

/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocating = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_condensing);

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an
	 * allocated space map. Calling this in non-syncing context
	 * does not provide a consistent view of the metaslab since
	 * we're performing allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	/*
	 * Even though the smp_alloc field can get negative (e.g.
	 * see vdev_checkpoint_sm), that should never be the case
	 * when it comes to a metaslab's space map.
	 */
	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);

	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);

	/*
	 * Account for future allocations since we would have
	 * already deducted that space from the ms_allocatable.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocating +=
		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
	}

	ASSERT3U(msp->ms_deferspace, ==,
	    range_tree_space(msp->ms_defer[0]) +
	    range_tree_space(msp->ms_defer[1]));

	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
	    msp->ms_deferspace + range_tree_space(msp->ms_freed);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the capacity is greater than
 * the zfs_mg_noalloc_threshold and the fragmentation metric is either
 * invalid or less than or equal to zfs_mg_fragmentation_threshold. If a
 * metaslab group transitions from allocatable to non-allocatable or vice
 * versa then the metaslab group's class is updated to reflect the
 * transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	mg->mg_allocators = allocators;

	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < allocators; i++) {
		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
	kmem_free(mg->mg_secondaries, mg->mg_allocators *
	    sizeof (metaslab_t *));
	mutex_destroy(&mg->mg_lock);
	mutex_destroy(&mg->mg_ms_initialize_lock);
	cv_destroy(&mg->mg_ms_initialize_cv);

	for (int i = 0; i < mg->mg_allocators; i++) {
		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}
	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
	    sizeof (zfs_refcount_t));
	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
	    sizeof (uint64_t));

	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
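	/*
	 * Note: the SCL_* locks are bit flags whose values increase from
	 * SCL_CONFIG down to SCL_VDEV, i.e. broader locks have smaller
	 * values, so the mask (locks & ~(SCL_ZIO - 1)) below selects
	 * SCL_ZIO and every lock below it; those are the only locks
	 * dropped (and later re-acquired) around the taskq_wait().
	 */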
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);
	for (int i = 0; i < mg->mg_allocators; i++) {
		metaslab_t *msp = mg->mg_primaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
		msp = mg->mg_secondaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];
		ASSERT(msp != NULL);

		/* skip if not active or not a member */
		if (msp->ms_sm == NULL || msp->ms_group != mg)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}
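
/*
 * In the three functions above, space map histogram bucket i counts
 * segments whose size falls in [2^(i + ashift), 2^(i + ashift + 1))
 * (metaslab space maps are opened with sm_shift == vdev_ashift), while
 * the group and class histograms are indexed by the absolute power of
 * two of the segment size; hence the i + ashift offset when folding a
 * metaslab's on-disk histogram into mg_histogram and mc_histogram.
 */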

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	ASSERT(MUTEX_HELD(&mg->mg_lock));
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	metaslab_group_sort_impl(mg, msp, weight);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
 * this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;
		if (msp->ms_group != mg)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize, int allocator)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from.
	 * Otherwise, we always consider it eligible for allocations.
	 */
	if ((mc != spa_normal_class(spa) &&
	    mc != spa_special_class(spa) &&
	    mc != spa_dedup_class(spa)) ||
	    mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = zfs_refcount_count(
		    &mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];

			qdepth = zfs_refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	int cmp = AVL_CMP(rs_size1, rs_size2);
	if (likely(cmp))
		return (cmp);

	return (AVL_CMP(r1->rs_start, r2->rs_start));
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT(msp->ms_allocatable == NULL);

	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));

	avl_destroy(&msp->ms_allocatable_by_size);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_allocatable_by_size, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_allocatable_by_size, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = start;
	rsearch.rs_end = start + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL) {
		rs = avl_nearest(t, where, AVL_AFTER);
	}

	return (rs);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs = metaslab_block_find(t, *cursor, size);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket), but it does not guarantee that other allocation sizes
	 * may not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_allocatable->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};
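
/*
 * Illustration of the cursor scheme used above (hypothetical request):
 * for a 24K (0x6000) allocation, size & -size yields 0x2000, i.e. 8K is
 * the largest power of two that evenly divides the size, so the request
 * is aligned to 8K and uses the cursor ms_lbas[highbit64(0x2000) - 1],
 * which is shared with other requests whose sizes are odd multiples of 8K.
 */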

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket), but it does not guarantee that other allocation sizes
	 * may not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_allocatable_by_size;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made, advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_allocatable;
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;

		rs = avl_last(&msp->ms_allocatable_by_size);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
			return (-1ULL);

		*cursor = rs->rs_start;
		*cursor_end = rs->rs_end;
	}

	offset = *cursor;
	*cursor += size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *t = &msp->ms_allocatable->rt_root;
	avl_index_t where;
	range_seg_t *rs, rsearch;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_size = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		t = &msp->ms_allocatable_by_size;

		rsearch.rs_start = 0;
		rsearch.rs_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = avl_nearest(t, where, AVL_AFTER);
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) >= size) {
		*cursor = rs->rs_start + size;
		return (rs->rs_start);
	}
	return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

static void
metaslab_aux_histograms_clear(metaslab_t *msp)
{
	/*
	 * Auxiliary histograms are only cleared when resetting them,
	 * which can only happen while the metaslab is loaded.
	 */
	ASSERT(msp->ms_loaded);

	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
}

static void
metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
    range_tree_t *rt)
{
	/*
	 * This is modeled after space_map_histogram_add(), so refer to that
	 * function for implementation details. We want this to work like
	 * the space map histogram, and not the range tree histogram, as we
	 * are essentially constructing a delta that will be later subtracted
	 * from the space map histogram.
	 */
	int idx = 0;
	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		ASSERT3U(i, >=, idx + shift);
		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);

		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
			ASSERT3U(idx + shift, ==, i);
			idx++;
			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
		}
	}
}
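
/*
 * For example (shift being the space map's sm_shift): range tree bucket
 * "shift" maps to histogram[0], bucket "shift + 1" to histogram[1], and
 * so on. Once idx reaches SPACE_MAP_HISTOGRAM_SIZE - 1 it stops
 * advancing, so every larger range tree bucket is folded into that last
 * slot, scaled by 1 << (i - idx - shift) so the totals stay expressed in
 * units of the last slot's minimum segment size.
 */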

/*
 * Called at every sync pass that the metaslab gets synced.
 *
 * The reason is that we want our auxiliary histograms to be updated
 * wherever the metaslab's space map histogram is updated. This way
 * we stay consistent on which parts of the metaslab space map's
 * histogram are currently not available for allocations (e.g. because
 * they are in the defer, freed, and freeing trees).
 */
static void
metaslab_aux_histograms_update(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	ASSERT(sm != NULL);

	/*
	 * This is similar to the metaslab's space map histogram updates
	 * that take place in metaslab_sync(). The only difference is that
	 * we only care about segments that haven't made it into the
	 * ms_allocatable tree yet.
	 */
	if (msp->ms_loaded) {
		metaslab_aux_histograms_clear(msp);

		metaslab_aux_histogram_add(msp->ms_synchist,
		    sm->sm_shift, msp->ms_freed);

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			metaslab_aux_histogram_add(msp->ms_deferhist[t],
			    sm->sm_shift, msp->ms_defer[t]);
		}
	}

	metaslab_aux_histogram_add(msp->ms_synchist,
	    sm->sm_shift, msp->ms_freeing);
}

/*
 * Called every time we are done syncing (writing to) the metaslab,
 * i.e. at the end of each sync pass.
 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
 */
static void
metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;

	if (sm == NULL) {
		/*
		 * We came here from metaslab_init() when creating/opening a
		 * pool, looking at a metaslab that hasn't had any allocations
		 * yet.
		 */
		return;
	}

	/*
	 * This is similar to the actions that we take for the ms_freed
	 * and ms_defer trees in metaslab_sync_done().
	 */
	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
	if (defer_allowed) {
		bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
		    sizeof (msp->ms_synchist));
	} else {
		bzero(msp->ms_deferhist[hist_index],
		    sizeof (msp->ms_deferhist[hist_index]));
	}
	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
}

/*
 * Ensure that the metaslab's weight and fragmentation are consistent
 * with the contents of the histogram (either the range tree's histogram
 * or the space map's, depending on whether the metaslab is loaded).
 */
static void
metaslab_verify_weight_and_frag(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/* see comment in metaslab_verify_unflushed_changes() */
	if (msp->ms_group == NULL)
		return;

	/*
	 * Devices being removed always return a weight of 0 and leave
	 * fragmentation and ms_max_size as is - there is nothing for
	 * us to verify here.
	 */
	vdev_t *vd = msp->ms_group->mg_vd;
	if (vd->vdev_removing)
		return;

	/*
	 * If the metaslab is dirty it probably means that we've done
	 * some allocations or frees that have changed our histograms
	 * and thus the weight.
	 */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&vd->vdev_ms_list, msp, t))
			return;
	}

	/*
	 * This verification checks that our in-memory state is consistent
	 * with what's on disk. If the pool is read-only then there aren't
	 * any changes and we just have the initially-loaded state.
	 */
	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
		return;

	/* some extra verification for in-core tree if you can */
	if (msp->ms_loaded) {
		range_tree_stat_verify(msp->ms_allocatable);
		VERIFY(space_map_histogram_verify(msp->ms_sm,
		    msp->ms_allocatable));
	}

	uint64_t weight = msp->ms_weight;
	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
	uint64_t frag = msp->ms_fragmentation;
	uint64_t max_segsize = msp->ms_max_size;

	msp->ms_weight = 0;
	msp->ms_fragmentation = 0;
	msp->ms_max_size = 0;

	/*
	 * This function is used for verification purposes. Regardless of
	 * whether metaslab_weight() thinks this metaslab should be active or
	 * not, we want to ensure that the actual weight (and therefore the
	 * value of ms_weight) would be the same if it was to be recalculated
	 * at this point.
	 */
	msp->ms_weight = metaslab_weight(msp) | was_active;

	VERIFY3U(max_segsize, ==, msp->ms_max_size);

	/*
	 * If the weight type changed then there is no point in doing
	 * verification. Revert fields to their original values.
	 */
	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
		msp->ms_fragmentation = frag;
		msp->ms_weight = weight;
		return;
	}

	VERIFY3U(msp->ms_fragmentation, ==, frag);
	VERIFY3U(msp->ms_weight, ==, weight);
}

/*
 * Wait for any in-progress metaslab loads to complete.
 */
static void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

static int
metaslab_load_impl(metaslab_t *msp)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loading);
	ASSERT(!msp->ms_condensing);

	/*
	 * We temporarily drop the lock to unblock other operations while we
	 * are reading the space map. Therefore, metaslab_sync() and
	 * metaslab_sync_done() can run at the same time as we do.
	 *
	 * metaslab_sync() can append to the space map while we are loading.
	 * Therefore we load only entries that existed when we started the
	 * load. Additionally, metaslab_sync_done() has to wait for the load
	 * to complete because there are potential races like metaslab_load()
	 * loading parts of the space map that are currently being appended
	 * by metaslab_sync(). If we didn't, the ms_allocatable would have
	 * entries that metaslab_sync_done() would try to re-add later.
	 *
	 * That's why before dropping the lock we remember the synced length
	 * of the metaslab and read up to that point of the space map,
	 * ignoring entries appended by metaslab_sync() that happen after we
	 * drop the lock.
	 */
	uint64_t length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	if (msp->ms_sm != NULL) {
		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
		    SM_FREE, length);
	} else {
		/*
		 * The space map has not been allocated yet, so treat
		 * all the space in the metaslab as free and add it to the
		 * ms_allocatable tree.
		 */
		range_tree_add(msp->ms_allocatable,
		    msp->ms_start, msp->ms_size);
	}

	/*
	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
	 * changing the ms_sm and the metaslab's range trees while we are
	 * about to use them and populate the ms_allocatable. The ms_lock
	 * is insufficient for this because metaslab_sync() doesn't hold
	 * the ms_lock while writing the ms_checkpointing tree to disk.
	 */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);
	ASSERT(!msp->ms_condensing);

	if (error != 0) {
		mutex_exit(&msp->ms_sync_lock);
		return (error);
	}

	ASSERT3P(msp->ms_group, !=, NULL);
	msp->ms_loaded = B_TRUE;

	/*
	 * The ms_allocatable contains the segments that exist in the
	 * ms_defer trees [see ms_synced_length]. Thus we need to remove
	 * them from ms_allocatable as they will be added again in
	 * metaslab_sync_done().
	 */
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defer[t],
		    range_tree_remove, msp->ms_allocatable);
	}

	/*
	 * Call metaslab_recalculate_weight_and_sort() now that the
	 * metaslab is loaded so we get the metaslab's real weight.
	 *
	 * Unless this metaslab was created with older software and
	 * has not yet been converted to use segment-based weight, we
	 * expect the new weight to be better than or equal to the weight
	 * that the metaslab had while it was not loaded. This is
	 * because the old weight does not take into account the
	 * consolidation of adjacent segments between TXGs. [see
	 * comment for ms_synchist and ms_deferhist[] for more info]
	 */
	uint64_t weight = msp->ms_weight;
	metaslab_recalculate_weight_and_sort(msp);
	if (!WEIGHT_IS_SPACEBASED(weight))
		ASSERT3U(weight, <=, msp->ms_weight);
	msp->ms_max_size = metaslab_block_maxsize(msp);

	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	metaslab_verify_space(msp, spa_syncing_txg(spa));
	mutex_exit(&msp->ms_sync_lock);

	return (0);
}

int
metaslab_load(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * There may be another thread loading the same metaslab, if that's
	 * the case just wait until the other thread is done and return.
	 */
	metaslab_load_wait(msp);
	if (msp->ms_loaded)
		return (0);
	VERIFY(!msp->ms_loading);
	ASSERT(!msp->ms_condensing);

	msp->ms_loading = B_TRUE;
	int error = metaslab_load_impl(msp);
	msp->ms_loading = B_FALSE;
	cv_broadcast(&msp->ms_load_cv);

	return (error);
}

void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	metaslab_verify_weight_and_frag(msp);

	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
	msp->ms_loaded = B_FALSE;

	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
	msp->ms_max_size = 0;

	/*
	 * We explicitly recalculate the metaslab's weight based on its space
	 * map (as it is now not loaded). We want unloaded metaslabs to always
	 * have their weights calculated from the space map histograms, while
	 * loaded ones have it calculated from their in-core range tree
	 * [see metaslab_load()].
This way, the weight reflects the information 1817 * available in-core, whether it is loaded or not 1818 * 1819 * If ms_group == NULL means that we came here from metaslab_fini(), 1820 * at which point it doesn't make sense for us to do the recalculation 1821 * and the sorting. 1822 */ 1823 if (msp->ms_group != NULL) 1824 metaslab_recalculate_weight_and_sort(msp); 1825 } 1826 1827 static void 1828 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 1829 int64_t defer_delta, int64_t space_delta) 1830 { 1831 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 1832 1833 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 1834 ASSERT(vd->vdev_ms_count != 0); 1835 1836 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 1837 vdev_deflated_space(vd, space_delta)); 1838 } 1839 1840 int 1841 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1842 metaslab_t **msp) 1843 { 1844 vdev_t *vd = mg->mg_vd; 1845 spa_t *spa = vd->vdev_spa; 1846 objset_t *mos = spa->spa_meta_objset; 1847 metaslab_t *ms; 1848 int error; 1849 1850 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1851 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1852 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1853 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1854 1855 ms->ms_id = id; 1856 ms->ms_start = id << vd->vdev_ms_shift; 1857 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1858 ms->ms_allocator = -1; 1859 ms->ms_new = B_TRUE; 1860 1861 /* 1862 * We only open space map objects that already exist. All others 1863 * will be opened when we finally allocate an object for it. 1864 * 1865 * Note: 1866 * When called from vdev_expand(), we can't call into the DMU as 1867 * we are holding the spa_config_lock as a writer and we would 1868 * deadlock [see relevant comment in vdev_metaslab_init()]. in 1869 * that case, the object parameter is zero though, so we won't 1870 * call into the DMU. 1871 */ 1872 if (object != 0) { 1873 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1874 ms->ms_size, vd->vdev_ashift); 1875 1876 if (error != 0) { 1877 kmem_free(ms, sizeof (metaslab_t)); 1878 return (error); 1879 } 1880 1881 ASSERT(ms->ms_sm != NULL); 1882 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); 1883 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 1884 } 1885 1886 /* 1887 * We create the ms_allocatable here, but we don't create the 1888 * other range trees until metaslab_sync_done(). This serves 1889 * two purposes: it allows metaslab_sync_done() to detect the 1890 * addition of new space; and for debugging, it ensures that 1891 * we'd data fault on any attempt to use this metaslab before 1892 * it's ready. 1893 */ 1894 ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms); 1895 metaslab_group_add(mg, ms); 1896 1897 metaslab_set_fragmentation(ms); 1898 1899 /* 1900 * If we're opening an existing pool (txg == 0) or creating 1901 * a new one (txg == TXG_INITIAL), all space is available now. 1902 * If we're adding space to an existing pool, the new space 1903 * does not become available until after this txg has synced. 1904 * The metaslab's weight will also be initialized when we sync 1905 * out this txg. This ensures that we don't attempt to allocate 1906 * from it before we have initialized it completely. 
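 *
 * Both cases are covered by the (txg <= TXG_INITIAL) check below:
 * txg is 0 when an existing pool is being opened and TXG_INITIAL
 * when a brand-new pool is being created.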
1907 */ 1908 if (txg <= TXG_INITIAL) { 1909 metaslab_sync_done(ms, 0); 1910 metaslab_space_update(vd, mg->mg_class, 1911 metaslab_allocated_space(ms), 0, 0); 1912 } 1913 1914 /* 1915 * If metaslab_debug_load is set and we're initializing a metaslab 1916 * that has an allocated space map object then load the space map 1917 * so that we can verify frees. 1918 */ 1919 if (metaslab_debug_load && ms->ms_sm != NULL) { 1920 mutex_enter(&ms->ms_lock); 1921 VERIFY0(metaslab_load(ms)); 1922 mutex_exit(&ms->ms_lock); 1923 } 1924 1925 if (txg != 0) { 1926 vdev_dirty(vd, 0, NULL, txg); 1927 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1928 } 1929 1930 *msp = ms; 1931 1932 return (0); 1933 } 1934 1935 void 1936 metaslab_fini(metaslab_t *msp) 1937 { 1938 metaslab_group_t *mg = msp->ms_group; 1939 vdev_t *vd = mg->mg_vd; 1940 1941 metaslab_group_remove(mg, msp); 1942 1943 mutex_enter(&msp->ms_lock); 1944 VERIFY(msp->ms_group == NULL); 1945 metaslab_space_update(vd, mg->mg_class, 1946 -metaslab_allocated_space(msp), 0, -msp->ms_size); 1947 1948 space_map_close(msp->ms_sm); 1949 1950 metaslab_unload(msp); 1951 1952 range_tree_destroy(msp->ms_allocatable); 1953 range_tree_destroy(msp->ms_freeing); 1954 range_tree_destroy(msp->ms_freed); 1955 1956 for (int t = 0; t < TXG_SIZE; t++) { 1957 range_tree_destroy(msp->ms_allocating[t]); 1958 } 1959 1960 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1961 range_tree_destroy(msp->ms_defer[t]); 1962 } 1963 ASSERT0(msp->ms_deferspace); 1964 1965 range_tree_destroy(msp->ms_checkpointing); 1966 1967 for (int t = 0; t < TXG_SIZE; t++) 1968 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 1969 1970 mutex_exit(&msp->ms_lock); 1971 cv_destroy(&msp->ms_load_cv); 1972 mutex_destroy(&msp->ms_lock); 1973 mutex_destroy(&msp->ms_sync_lock); 1974 ASSERT3U(msp->ms_allocator, ==, -1); 1975 1976 kmem_free(msp, sizeof (metaslab_t)); 1977 } 1978 1979 #define FRAGMENTATION_TABLE_SIZE 17 1980 1981 /* 1982 * This table defines a segment size based fragmentation metric that will 1983 * allow each metaslab to derive its own fragmentation value. This is done 1984 * by calculating the space in each bucket of the spacemap histogram and 1985 * multiplying that by the fragmentation metric in this table. Doing 1986 * this for all buckets and dividing it by the total amount of free 1987 * space in this metaslab (i.e. the total free space in all buckets) gives 1988 * us the fragmentation metric. This means that a high fragmentation metric 1989 * equates to most of the free space being comprised of small segments. 1990 * Conversely, if the metric is low, then most of the free space is in 1991 * large segments. A 10% change in fragmentation equates to approximately 1992 * double the number of segments. 1993 * 1994 * This table defines 0% fragmented space using 16MB segments. Testing has 1995 * shown that segments that are greater than or equal to 16MB do not suffer 1996 * from drastic performance problems. Using this value, we derive the rest 1997 * of the table. Since the fragmentation value is never stored on disk, it 1998 * is possible to change these calculations in the future. 
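 *
 * As a hypothetical illustration (segment sizes chosen for clarity):
 * if half of a metaslab's free space sits in 8K segments (metric 90
 * in the table below) and the other half in 1M segments (metric 20),
 * its fragmentation works out to (90 + 20) / 2 = 55%.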
1999 */ 2000 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 2001 100, /* 512B */ 2002 100, /* 1K */ 2003 98, /* 2K */ 2004 95, /* 4K */ 2005 90, /* 8K */ 2006 80, /* 16K */ 2007 70, /* 32K */ 2008 60, /* 64K */ 2009 50, /* 128K */ 2010 40, /* 256K */ 2011 30, /* 512K */ 2012 20, /* 1M */ 2013 15, /* 2M */ 2014 10, /* 4M */ 2015 5, /* 8M */ 2016 0 /* 16M */ 2017 }; 2018 2019 /* 2020 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 2021 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 2022 * been upgraded and does not support this metric. Otherwise, the return 2023 * value should be in the range [0, 100]. 2024 */ 2025 static void 2026 metaslab_set_fragmentation(metaslab_t *msp) 2027 { 2028 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2029 uint64_t fragmentation = 0; 2030 uint64_t total = 0; 2031 boolean_t feature_enabled = spa_feature_is_enabled(spa, 2032 SPA_FEATURE_SPACEMAP_HISTOGRAM); 2033 2034 if (!feature_enabled) { 2035 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2036 return; 2037 } 2038 2039 /* 2040 * A null space map means that the entire metaslab is free 2041 * and thus is not fragmented. 2042 */ 2043 if (msp->ms_sm == NULL) { 2044 msp->ms_fragmentation = 0; 2045 return; 2046 } 2047 2048 /* 2049 * If this metaslab's space map has not been upgraded, flag it 2050 * so that we upgrade next time we encounter it. 2051 */ 2052 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 2053 uint64_t txg = spa_syncing_txg(spa); 2054 vdev_t *vd = msp->ms_group->mg_vd; 2055 2056 /* 2057 * If we've reached the final dirty txg, then we must 2058 * be shutting down the pool. We don't want to dirty 2059 * any data past this point so skip setting the condense 2060 * flag. We can retry this action the next time the pool 2061 * is imported. 2062 */ 2063 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2064 msp->ms_condense_wanted = B_TRUE; 2065 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2066 zfs_dbgmsg("txg %llu, requesting force condense: " 2067 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 2068 vd->vdev_id); 2069 } 2070 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2071 return; 2072 } 2073 2074 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2075 uint64_t space = 0; 2076 uint8_t shift = msp->ms_sm->sm_shift; 2077 2078 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2079 FRAGMENTATION_TABLE_SIZE - 1); 2080 2081 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2082 continue; 2083 2084 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2085 total += space; 2086 2087 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2088 fragmentation += space * zfs_frag_table[idx]; 2089 } 2090 2091 if (total > 0) 2092 fragmentation /= total; 2093 ASSERT3U(fragmentation, <=, 100); 2094 2095 msp->ms_fragmentation = fragmentation; 2096 } 2097 2098 /* 2099 * Compute a weight -- a selection preference value -- for the given metaslab. 2100 * This is based on the amount of free space, the level of fragmentation, 2101 * the LBA range, and whether the metaslab is loaded. 2102 */ 2103 static uint64_t 2104 metaslab_space_weight(metaslab_t *msp) 2105 { 2106 metaslab_group_t *mg = msp->ms_group; 2107 vdev_t *vd = mg->mg_vd; 2108 uint64_t weight, space; 2109 2110 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2111 ASSERT(!vd->vdev_removing); 2112 2113 /* 2114 * The baseline weight is the metaslab's free space. 
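 *
 * For example, a 16GB metaslab with 4GB currently allocated starts
 * from a 12GB baseline, which the code below then adjusts for
 * fragmentation and for LBA position (outer metaslabs get up to a
 * 2x boost).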
2115 */
2116 space = msp->ms_size - metaslab_allocated_space(msp);
2117
2118 if (metaslab_fragmentation_factor_enabled &&
2119 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2120 /*
2121 * Use the fragmentation information to inversely scale
2122 * down the baseline weight. We need to ensure that we
2123 * don't exclude this metaslab completely when it's 100%
2124 * fragmented. To avoid this we reduce the fragmentation value
2125 * by 1.
2126 */
2127 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
2128
2129 /*
2130 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2131 * this metaslab again. The fragmentation metric may have
2132 * decreased the space to something smaller than
2133 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2134 * so that we can consume any remaining space.
2135 */
2136 if (space > 0 && space < SPA_MINBLOCKSIZE)
2137 space = SPA_MINBLOCKSIZE;
2138 }
2139 weight = space;
2140
2141 /*
2142 * Modern disks have uniform bit density and constant angular velocity.
2143 * Therefore, the outer recording zones are faster (higher bandwidth)
2144 * than the inner zones by the ratio of outer to inner track diameter,
2145 * which is typically around 2:1. We account for this by assigning
2146 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
2147 * In effect, this means that we'll select the metaslab with the most
2148 * free bandwidth rather than simply the one with the most free space.
2149 */
2150 if (metaslab_lba_weighting_enabled) {
2151 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
2152 ASSERT(weight >= space && weight <= 2 * space);
2153 }
2154
2155 /*
2156 * If this metaslab is one we're actively using, adjust its
2157 * weight to make it preferable to any inactive metaslab so
2158 * we'll polish it off. If the fragmentation on this metaslab
2159 * has exceeded our threshold, then don't mark it active.
2160 */
2161 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
2162 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
2163 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
2164 }
2165
2166 WEIGHT_SET_SPACEBASED(weight);
2167 return (weight);
2168 }
2169
2170 /*
2171 * Return the weight of the specified metaslab, according to the segment-based
2172 * weighting algorithm. The metaslab must be loaded. This function can
2173 * be called within a sync pass since it relies only on the metaslab's
2174 * range tree which is always accurate when the metaslab is loaded.
2175 */
2176 static uint64_t
2177 metaslab_weight_from_range_tree(metaslab_t *msp)
2178 {
2179 uint64_t weight = 0;
2180 uint32_t segments = 0;
2181
2182 ASSERT(msp->ms_loaded);
2183
2184 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
2185 i--) {
2186 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
2187 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2188
2189 segments <<= 1;
2190 segments += msp->ms_allocatable->rt_histogram[i];
2191
2192 /*
2193 * The range tree provides more precision than the space map
2194 * and must be downgraded so that all values fit within the
2195 * space map's histogram. This allows us to compare loaded
2196 * vs. unloaded metaslabs to determine which metaslab is
2197 * considered "best".
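 *
 * For instance, assuming ashift = 9 and a 32-bucket space map
 * histogram, max_idx works out to 40; a single free segment in
 * range tree bucket 42 is folded down by the doubling above and
 * is counted as four segments in bucket 40.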
2198 */ 2199 if (i > max_idx) 2200 continue; 2201 2202 if (segments != 0) { 2203 WEIGHT_SET_COUNT(weight, segments); 2204 WEIGHT_SET_INDEX(weight, i); 2205 WEIGHT_SET_ACTIVE(weight, 0); 2206 break; 2207 } 2208 } 2209 return (weight); 2210 } 2211 2212 /* 2213 * Calculate the weight based on the on-disk histogram. This should only 2214 * be called after a sync pass has completely finished since the on-disk 2215 * information is updated in metaslab_sync(). 2216 */ 2217 static uint64_t 2218 metaslab_weight_from_spacemap(metaslab_t *msp) 2219 { 2220 space_map_t *sm = msp->ms_sm; 2221 ASSERT(!msp->ms_loaded); 2222 ASSERT(sm != NULL); 2223 ASSERT3U(space_map_object(sm), !=, 0); 2224 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2225 2226 /* 2227 * Create a joint histogram from all the segments that have made 2228 * it to the metaslab's space map histogram, that are not yet 2229 * available for allocation because they are still in the freeing 2230 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 2231 * these segments from the space map's histogram to get a more 2232 * accurate weight. 2233 */ 2234 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 2235 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 2236 deferspace_histogram[i] += msp->ms_synchist[i]; 2237 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2238 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2239 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 2240 } 2241 } 2242 2243 uint64_t weight = 0; 2244 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 2245 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 2246 deferspace_histogram[i]); 2247 uint64_t count = 2248 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 2249 if (count != 0) { 2250 WEIGHT_SET_COUNT(weight, count); 2251 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 2252 WEIGHT_SET_ACTIVE(weight, 0); 2253 break; 2254 } 2255 } 2256 return (weight); 2257 } 2258 2259 /* 2260 * Compute a segment-based weight for the specified metaslab. The weight 2261 * is determined by highest bucket in the histogram. The information 2262 * for the highest bucket is encoded into the weight value. 2263 */ 2264 static uint64_t 2265 metaslab_segment_weight(metaslab_t *msp) 2266 { 2267 metaslab_group_t *mg = msp->ms_group; 2268 uint64_t weight = 0; 2269 uint8_t shift = mg->mg_vd->vdev_ashift; 2270 2271 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2272 2273 /* 2274 * The metaslab is completely free. 2275 */ 2276 if (metaslab_allocated_space(msp) == 0) { 2277 int idx = highbit64(msp->ms_size) - 1; 2278 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2279 2280 if (idx < max_idx) { 2281 WEIGHT_SET_COUNT(weight, 1ULL); 2282 WEIGHT_SET_INDEX(weight, idx); 2283 } else { 2284 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 2285 WEIGHT_SET_INDEX(weight, max_idx); 2286 } 2287 WEIGHT_SET_ACTIVE(weight, 0); 2288 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 2289 2290 return (weight); 2291 } 2292 2293 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2294 2295 /* 2296 * If the metaslab is fully allocated then just make the weight 0. 2297 */ 2298 if (metaslab_allocated_space(msp) == msp->ms_size) 2299 return (0); 2300 /* 2301 * If the metaslab is already loaded, then use the range tree to 2302 * determine the weight. Otherwise, we rely on the space map information 2303 * to generate the weight. 
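 *
 * As an illustration, if the largest remaining free segments fall in
 * the 512K-1M bucket (index 19) and there are seven of them, the
 * weight encodes count = 7 and index = 19, so metaslabs whose largest
 * free segments are bigger are preferred regardless of segment count.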
2304 */ 2305 if (msp->ms_loaded) { 2306 weight = metaslab_weight_from_range_tree(msp); 2307 } else { 2308 weight = metaslab_weight_from_spacemap(msp); 2309 } 2310 2311 /* 2312 * If the metaslab was active the last time we calculated its weight 2313 * then keep it active. We want to consume the entire region that 2314 * is associated with this weight. 2315 */ 2316 if (msp->ms_activation_weight != 0 && weight != 0) 2317 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 2318 return (weight); 2319 } 2320 2321 /* 2322 * Determine if we should attempt to allocate from this metaslab. If the 2323 * metaslab has a maximum size then we can quickly determine if the desired 2324 * allocation size can be satisfied. Otherwise, if we're using segment-based 2325 * weighting then we can determine the maximum allocation that this metaslab 2326 * can accommodate based on the index encoded in the weight. If we're using 2327 * space-based weights then rely on the entire weight (excluding the weight 2328 * type bit). 2329 */ 2330 boolean_t 2331 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 2332 { 2333 boolean_t should_allocate; 2334 2335 if (msp->ms_max_size != 0) 2336 return (msp->ms_max_size >= asize); 2337 2338 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2339 /* 2340 * The metaslab segment weight indicates segments in the 2341 * range [2^i, 2^(i+1)), where i is the index in the weight. 2342 * Since the asize might be in the middle of the range, we 2343 * should attempt the allocation if asize < 2^(i+1). 2344 */ 2345 should_allocate = (asize < 2346 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2347 } else { 2348 should_allocate = (asize <= 2349 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2350 } 2351 return (should_allocate); 2352 } 2353 2354 static uint64_t 2355 metaslab_weight(metaslab_t *msp) 2356 { 2357 vdev_t *vd = msp->ms_group->mg_vd; 2358 spa_t *spa = vd->vdev_spa; 2359 uint64_t weight; 2360 2361 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2362 2363 /* 2364 * If this vdev is in the process of being removed, there is nothing 2365 * for us to do here. 2366 */ 2367 if (vd->vdev_removing) 2368 return (0); 2369 2370 metaslab_set_fragmentation(msp); 2371 2372 /* 2373 * Update the maximum size if the metaslab is loaded. This will 2374 * ensure that we get an accurate maximum size if newly freed space 2375 * has been added back into the free tree. 2376 */ 2377 if (msp->ms_loaded) 2378 msp->ms_max_size = metaslab_block_maxsize(msp); 2379 else 2380 ASSERT0(msp->ms_max_size); 2381 2382 /* 2383 * Segment-based weighting requires space map histogram support. 2384 */ 2385 if (zfs_metaslab_segment_weight_enabled && 2386 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2387 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2388 sizeof (space_map_phys_t))) { 2389 weight = metaslab_segment_weight(msp); 2390 } else { 2391 weight = metaslab_space_weight(msp); 2392 } 2393 return (weight); 2394 } 2395 2396 void 2397 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 2398 { 2399 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 2400 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2401 metaslab_group_sort(msp->ms_group, msp, 2402 metaslab_weight(msp) | was_active); 2403 } 2404 2405 static int 2406 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2407 int allocator, uint64_t activation_weight) 2408 { 2409 /* 2410 * If we're activating for the claim code, we don't want to actually 2411 * set the metaslab up for a specific allocator. 
2412 */ 2413 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2414 return (0); 2415 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2416 mg->mg_primaries : mg->mg_secondaries); 2417 2418 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2419 mutex_enter(&mg->mg_lock); 2420 if (arr[allocator] != NULL) { 2421 mutex_exit(&mg->mg_lock); 2422 return (EEXIST); 2423 } 2424 2425 arr[allocator] = msp; 2426 ASSERT3S(msp->ms_allocator, ==, -1); 2427 msp->ms_allocator = allocator; 2428 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2429 mutex_exit(&mg->mg_lock); 2430 2431 return (0); 2432 } 2433 2434 static int 2435 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2436 { 2437 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2438 2439 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2440 int error = metaslab_load(msp); 2441 if (error != 0) { 2442 metaslab_group_sort(msp->ms_group, msp, 0); 2443 return (error); 2444 } 2445 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2446 /* 2447 * The metaslab was activated for another allocator 2448 * while we were waiting, we should reselect. 2449 */ 2450 return (EBUSY); 2451 } 2452 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2453 allocator, activation_weight)) != 0) { 2454 return (error); 2455 } 2456 2457 msp->ms_activation_weight = msp->ms_weight; 2458 metaslab_group_sort(msp->ms_group, msp, 2459 msp->ms_weight | activation_weight); 2460 } 2461 ASSERT(msp->ms_loaded); 2462 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2463 2464 return (0); 2465 } 2466 2467 static void 2468 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2469 uint64_t weight) 2470 { 2471 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2472 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2473 metaslab_group_sort(mg, msp, weight); 2474 return; 2475 } 2476 2477 mutex_enter(&mg->mg_lock); 2478 ASSERT3P(msp->ms_group, ==, mg); 2479 if (msp->ms_primary) { 2480 ASSERT3U(0, <=, msp->ms_allocator); 2481 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2482 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2483 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2484 mg->mg_primaries[msp->ms_allocator] = NULL; 2485 } else { 2486 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2487 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2488 mg->mg_secondaries[msp->ms_allocator] = NULL; 2489 } 2490 msp->ms_allocator = -1; 2491 metaslab_group_sort_impl(mg, msp, weight); 2492 mutex_exit(&mg->mg_lock); 2493 } 2494 2495 static void 2496 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2497 { 2498 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2499 2500 /* 2501 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2502 * this metaslab again. In that case, it had better be empty, 2503 * or we would be leaving space on the table. 2504 */ 2505 ASSERT(size >= SPA_MINBLOCKSIZE || 2506 range_tree_is_empty(msp->ms_allocatable)); 2507 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2508 2509 msp->ms_activation_weight = 0; 2510 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2511 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2512 } 2513 2514 /* 2515 * Segment-based metaslabs are activated once and remain active until 2516 * we either fail an allocation attempt (similar to space-based metaslabs) 2517 * or have exhausted the free space in zfs_metaslab_switch_threshold 2518 * buckets since the metaslab was activated. 
This function checks to see
2519 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
2520 * metaslab and passivates it proactively. This will allow us to select a
2521 * metaslab with a larger contiguous region, if any remain within this
2522 * metaslab group. If we're in sync pass > 1, then we continue using this
2523 * metaslab so that we don't dirty more blocks and cause more sync passes.
2524 */
2525 void
2526 metaslab_segment_may_passivate(metaslab_t *msp)
2527 {
2528 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2529
2530 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
2531 return;
2532
2533 /*
2534 * Since we are in the middle of a sync pass, the most accurate
2535 * information that is accessible to us is the in-core range tree
2536 * histogram; calculate the new weight based on that information.
2537 */
2538 uint64_t weight = metaslab_weight_from_range_tree(msp);
2539 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
2540 int current_idx = WEIGHT_GET_INDEX(weight);
2541
2542 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
2543 metaslab_passivate(msp, weight);
2544 }
2545
2546 static void
2547 metaslab_preload(void *arg)
2548 {
2549 metaslab_t *msp = arg;
2550 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2551
2552 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
2553
2554 mutex_enter(&msp->ms_lock);
2555 (void) metaslab_load(msp);
2556 msp->ms_selected_txg = spa_syncing_txg(spa);
2557 mutex_exit(&msp->ms_lock);
2558 }
2559
2560 static void
2561 metaslab_group_preload(metaslab_group_t *mg)
2562 {
2563 spa_t *spa = mg->mg_vd->vdev_spa;
2564 metaslab_t *msp;
2565 avl_tree_t *t = &mg->mg_metaslab_tree;
2566 int m = 0;
2567
2568 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
2569 taskq_wait(mg->mg_taskq);
2570 return;
2571 }
2572
2573 mutex_enter(&mg->mg_lock);
2574
2575 /*
2576 * Load the next potential metaslabs.
2577 */
2578 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
2579 ASSERT3P(msp->ms_group, ==, mg);
2580
2581 /*
2582 * We preload only the maximum number of metaslabs specified
2583 * by metaslab_preload_limit. If a metaslab is being forced
2584 * to condense then we preload it too. This will ensure
2585 * that force condensing happens in the next txg.
2586 */
2587 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2588 continue;
2589 }
2590
2591 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2592 msp, TQ_SLEEP) != TASKQID_INVALID);
2593 }
2594 mutex_exit(&mg->mg_lock);
2595 }
2596
2597 /*
2598 * Determine if the space map's on-disk footprint is past our tolerance
2599 * for inefficiency. We would like to use the following criteria to make
2600 * our decision:
2601 *
2602 * 1. The size of the space map object should not dramatically increase as a
2603 * result of writing out the free space range tree.
2604 *
2605 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2606 * times the size of the free space range tree representation
2607 * (e.g. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2608 *
2609 * 3. The on-disk size of the space map should actually decrease.
2610 *
2611 * Unfortunately, we cannot compute the on-disk size of the space map in this
2612 * context because we cannot accurately compute the effects of compression, etc.
2613 * Instead, we apply the heuristic described in the block comment for 2614 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2615 * is greater than a threshold number of blocks. 2616 */ 2617 static boolean_t 2618 metaslab_should_condense(metaslab_t *msp) 2619 { 2620 space_map_t *sm = msp->ms_sm; 2621 vdev_t *vd = msp->ms_group->mg_vd; 2622 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2623 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2624 2625 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2626 ASSERT(msp->ms_loaded); 2627 2628 /* 2629 * Allocations and frees in early passes are generally more space 2630 * efficient (in terms of blocks described in space map entries) 2631 * than the ones in later passes (e.g. we don't compress after 2632 * sync pass 5) and condensing a metaslab multiple times in a txg 2633 * could degrade performance. 2634 * 2635 * Thus we prefer condensing each metaslab at most once every txg at 2636 * the earliest sync pass possible. If a metaslab is eligible for 2637 * condensing again after being considered for condensing within the 2638 * same txg, it will hopefully be dirty in the next txg where it will 2639 * be condensed at an earlier pass. 2640 */ 2641 if (msp->ms_condense_checked_txg == current_txg) 2642 return (B_FALSE); 2643 msp->ms_condense_checked_txg = current_txg; 2644 2645 /* 2646 * We always condense metaslabs that are empty and metaslabs for 2647 * which a condense request has been made. 2648 */ 2649 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2650 msp->ms_condense_wanted) 2651 return (B_TRUE); 2652 2653 uint64_t object_size = space_map_length(msp->ms_sm); 2654 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2655 msp->ms_allocatable, SM_NO_VDEVID); 2656 2657 dmu_object_info_t doi; 2658 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2659 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2660 2661 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2662 object_size > zfs_metaslab_condense_block_threshold * record_size); 2663 } 2664 2665 /* 2666 * Condense the on-disk space map representation to its minimized form. 2667 * The minimized form consists of a small number of allocations followed by 2668 * the entries of the free range tree. 2669 */ 2670 static void 2671 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2672 { 2673 range_tree_t *condense_tree; 2674 space_map_t *sm = msp->ms_sm; 2675 2676 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2677 ASSERT(msp->ms_loaded); 2678 2679 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2680 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2681 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2682 msp->ms_group->mg_vd->vdev_spa->spa_name, 2683 space_map_length(msp->ms_sm), 2684 avl_numnodes(&msp->ms_allocatable->rt_root), 2685 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2686 2687 msp->ms_condense_wanted = B_FALSE; 2688 2689 /* 2690 * Create an range tree that is 100% allocated. We remove segments 2691 * that have been freed in this txg, any deferred frees that exist, 2692 * and any allocation in the future. Removing segments should be 2693 * a relatively inexpensive operation since we expect these trees to 2694 * have a small number of nodes. 
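 *
 * Note that ms_allocatable itself is not subtracted from the
 * condense tree; the current free space is written out separately
 * as the SM_FREE pass further below, after the allocation records
 * for condense_tree.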
2695 */ 2696 condense_tree = range_tree_create(NULL, NULL); 2697 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2698 2699 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2700 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2701 2702 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2703 range_tree_walk(msp->ms_defer[t], 2704 range_tree_remove, condense_tree); 2705 } 2706 2707 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2708 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2709 range_tree_remove, condense_tree); 2710 } 2711 2712 /* 2713 * We're about to drop the metaslab's lock thus allowing 2714 * other consumers to change it's content. Set the 2715 * metaslab's ms_condensing flag to ensure that 2716 * allocations on this metaslab do not occur while we're 2717 * in the middle of committing it to disk. This is only critical 2718 * for ms_allocatable as all other range trees use per txg 2719 * views of their content. 2720 */ 2721 msp->ms_condensing = B_TRUE; 2722 2723 mutex_exit(&msp->ms_lock); 2724 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2725 2726 /* 2727 * While we would ideally like to create a space map representation 2728 * that consists only of allocation records, doing so can be 2729 * prohibitively expensive because the in-core free tree can be 2730 * large, and therefore computationally expensive to subtract 2731 * from the condense_tree. Instead we sync out two trees, a cheap 2732 * allocation only tree followed by the in-core free tree. While not 2733 * optimal, this is typically close to optimal, and much cheaper to 2734 * compute. 2735 */ 2736 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2737 range_tree_vacate(condense_tree, NULL, NULL); 2738 range_tree_destroy(condense_tree); 2739 2740 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2741 mutex_enter(&msp->ms_lock); 2742 msp->ms_condensing = B_FALSE; 2743 } 2744 2745 /* 2746 * Write a metaslab to disk in the context of the specified transaction group. 2747 */ 2748 void 2749 metaslab_sync(metaslab_t *msp, uint64_t txg) 2750 { 2751 metaslab_group_t *mg = msp->ms_group; 2752 vdev_t *vd = mg->mg_vd; 2753 spa_t *spa = vd->vdev_spa; 2754 objset_t *mos = spa_meta_objset(spa); 2755 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2756 dmu_tx_t *tx; 2757 uint64_t object = space_map_object(msp->ms_sm); 2758 2759 ASSERT(!vd->vdev_ishole); 2760 2761 /* 2762 * This metaslab has just been added so there's no work to do now. 2763 */ 2764 if (msp->ms_freeing == NULL) { 2765 ASSERT3P(alloctree, ==, NULL); 2766 return; 2767 } 2768 2769 ASSERT3P(alloctree, !=, NULL); 2770 ASSERT3P(msp->ms_freeing, !=, NULL); 2771 ASSERT3P(msp->ms_freed, !=, NULL); 2772 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2773 2774 /* 2775 * Normally, we don't want to process a metaslab if there are no 2776 * allocations or frees to perform. However, if the metaslab is being 2777 * forced to condense and it's loaded, we need to let it through. 2778 */ 2779 if (range_tree_is_empty(alloctree) && 2780 range_tree_is_empty(msp->ms_freeing) && 2781 range_tree_is_empty(msp->ms_checkpointing) && 2782 !(msp->ms_loaded && msp->ms_condense_wanted)) 2783 return; 2784 2785 2786 VERIFY(txg <= spa_final_dirty_txg(spa)); 2787 2788 /* 2789 * The only state that can actually be changing concurrently 2790 * with metaslab_sync() is the metaslab's ms_allocatable. No 2791 * other thread can be modifying this txg's alloc, freeing, 2792 * freed, or space_map_phys_t. 
We drop ms_lock whenever we 2793 * could call into the DMU, because the DMU can call down to 2794 * us (e.g. via zio_free()) at any time. 2795 * 2796 * The spa_vdev_remove_thread() can be reading metaslab state 2797 * concurrently, and it is locked out by the ms_sync_lock. 2798 * Note that the ms_lock is insufficient for this, because it 2799 * is dropped by space_map_write(). 2800 */ 2801 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2802 2803 if (msp->ms_sm == NULL) { 2804 uint64_t new_object; 2805 2806 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2807 VERIFY3U(new_object, !=, 0); 2808 2809 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2810 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2811 2812 ASSERT(msp->ms_sm != NULL); 2813 ASSERT0(metaslab_allocated_space(msp)); 2814 } 2815 2816 if (!range_tree_is_empty(msp->ms_checkpointing) && 2817 vd->vdev_checkpoint_sm == NULL) { 2818 ASSERT(spa_has_checkpoint(spa)); 2819 2820 uint64_t new_object = space_map_alloc(mos, 2821 vdev_standard_sm_blksz, tx); 2822 VERIFY3U(new_object, !=, 0); 2823 2824 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2825 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2826 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2827 2828 /* 2829 * We save the space map object as an entry in vdev_top_zap 2830 * so it can be retrieved when the pool is reopened after an 2831 * export or through zdb. 2832 */ 2833 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2834 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2835 sizeof (new_object), 1, &new_object, tx)); 2836 } 2837 2838 mutex_enter(&msp->ms_sync_lock); 2839 mutex_enter(&msp->ms_lock); 2840 2841 /* 2842 * Note: metaslab_condense() clears the space map's histogram. 2843 * Therefore we must verify and remove this histogram before 2844 * condensing. 2845 */ 2846 metaslab_group_histogram_verify(mg); 2847 metaslab_class_histogram_verify(mg->mg_class); 2848 metaslab_group_histogram_remove(mg, msp); 2849 2850 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2851 metaslab_condense(msp, txg, tx); 2852 } else { 2853 mutex_exit(&msp->ms_lock); 2854 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2855 SM_NO_VDEVID, tx); 2856 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2857 SM_NO_VDEVID, tx); 2858 mutex_enter(&msp->ms_lock); 2859 } 2860 2861 msp->ms_allocated_space += range_tree_space(alloctree); 2862 ASSERT3U(msp->ms_allocated_space, >=, 2863 range_tree_space(msp->ms_freeing)); 2864 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 2865 2866 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2867 ASSERT(spa_has_checkpoint(spa)); 2868 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2869 2870 /* 2871 * Since we are doing writes to disk and the ms_checkpointing 2872 * tree won't be changing during that time, we drop the 2873 * ms_lock while writing to the checkpoint space map. 2874 */ 2875 mutex_exit(&msp->ms_lock); 2876 space_map_write(vd->vdev_checkpoint_sm, 2877 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2878 mutex_enter(&msp->ms_lock); 2879 2880 spa->spa_checkpoint_info.sci_dspace += 2881 range_tree_space(msp->ms_checkpointing); 2882 vd->vdev_stat.vs_checkpoint_space += 2883 range_tree_space(msp->ms_checkpointing); 2884 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2885 -space_map_allocated(vd->vdev_checkpoint_sm)); 2886 2887 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2888 } 2889 2890 if (msp->ms_loaded) { 2891 /* 2892 * When the space map is loaded, we have an accurate 2893 * histogram in the range tree. 
This gives us an opportunity 2894 * to bring the space map's histogram up-to-date so we clear 2895 * it first before updating it. 2896 */ 2897 space_map_histogram_clear(msp->ms_sm); 2898 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2899 2900 /* 2901 * Since we've cleared the histogram we need to add back 2902 * any free space that has already been processed, plus 2903 * any deferred space. This allows the on-disk histogram 2904 * to accurately reflect all free space even if some space 2905 * is not yet available for allocation (i.e. deferred). 2906 */ 2907 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2908 2909 /* 2910 * Add back any deferred free space that has not been 2911 * added back into the in-core free tree yet. This will 2912 * ensure that we don't end up with a space map histogram 2913 * that is completely empty unless the metaslab is fully 2914 * allocated. 2915 */ 2916 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2917 space_map_histogram_add(msp->ms_sm, 2918 msp->ms_defer[t], tx); 2919 } 2920 } 2921 2922 /* 2923 * Always add the free space from this sync pass to the space 2924 * map histogram. We want to make sure that the on-disk histogram 2925 * accounts for all free space. If the space map is not loaded, 2926 * then we will lose some accuracy but will correct it the next 2927 * time we load the space map. 2928 */ 2929 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2930 metaslab_aux_histograms_update(msp); 2931 2932 metaslab_group_histogram_add(mg, msp); 2933 metaslab_group_histogram_verify(mg); 2934 metaslab_class_histogram_verify(mg->mg_class); 2935 2936 /* 2937 * For sync pass 1, we avoid traversing this txg's free range tree 2938 * and instead will just swap the pointers for freeing and freed. 2939 * We can safely do this since the freed_tree is guaranteed to be 2940 * empty on the initial pass. 2941 */ 2942 if (spa_sync_pass(spa) == 1) { 2943 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2944 ASSERT0(msp->ms_allocated_this_txg); 2945 } else { 2946 range_tree_vacate(msp->ms_freeing, 2947 range_tree_add, msp->ms_freed); 2948 } 2949 msp->ms_allocated_this_txg += range_tree_space(alloctree); 2950 range_tree_vacate(alloctree, NULL, NULL); 2951 2952 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2953 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2954 & TXG_MASK])); 2955 ASSERT0(range_tree_space(msp->ms_freeing)); 2956 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2957 2958 mutex_exit(&msp->ms_lock); 2959 2960 if (object != space_map_object(msp->ms_sm)) { 2961 object = space_map_object(msp->ms_sm); 2962 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2963 msp->ms_id, sizeof (uint64_t), &object, tx); 2964 } 2965 mutex_exit(&msp->ms_sync_lock); 2966 dmu_tx_commit(tx); 2967 } 2968 2969 /* 2970 * Called after a transaction group has completely synced to mark 2971 * all of the metaslab's free space as usable. 2972 */ 2973 void 2974 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2975 { 2976 metaslab_group_t *mg = msp->ms_group; 2977 vdev_t *vd = mg->mg_vd; 2978 spa_t *spa = vd->vdev_spa; 2979 range_tree_t **defer_tree; 2980 int64_t alloc_delta, defer_delta; 2981 boolean_t defer_allowed = B_TRUE; 2982 2983 ASSERT(!vd->vdev_ishole); 2984 2985 mutex_enter(&msp->ms_lock); 2986 2987 /* 2988 * If this metaslab is just becoming available, initialize its 2989 * range trees and add its capacity to the vdev. 
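 *
 * A brand-new metaslab is recognized by ms_freed still being NULL:
 * metaslab_init() only creates ms_allocatable and leaves the
 * remaining trees for this first call to set up.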
2990 */ 2991 if (msp->ms_freed == NULL) { 2992 for (int t = 0; t < TXG_SIZE; t++) { 2993 ASSERT(msp->ms_allocating[t] == NULL); 2994 2995 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2996 } 2997 2998 ASSERT3P(msp->ms_freeing, ==, NULL); 2999 msp->ms_freeing = range_tree_create(NULL, NULL); 3000 3001 ASSERT3P(msp->ms_freed, ==, NULL); 3002 msp->ms_freed = range_tree_create(NULL, NULL); 3003 3004 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3005 ASSERT(msp->ms_defer[t] == NULL); 3006 3007 msp->ms_defer[t] = range_tree_create(NULL, NULL); 3008 } 3009 3010 ASSERT3P(msp->ms_checkpointing, ==, NULL); 3011 msp->ms_checkpointing = range_tree_create(NULL, NULL); 3012 3013 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 3014 } 3015 ASSERT0(range_tree_space(msp->ms_freeing)); 3016 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3017 3018 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 3019 3020 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 3021 metaslab_class_get_alloc(spa_normal_class(spa)); 3022 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 3023 defer_allowed = B_FALSE; 3024 } 3025 3026 defer_delta = 0; 3027 alloc_delta = msp->ms_allocated_this_txg - 3028 range_tree_space(msp->ms_freed); 3029 if (defer_allowed) { 3030 defer_delta = range_tree_space(msp->ms_freed) - 3031 range_tree_space(*defer_tree); 3032 } else { 3033 defer_delta -= range_tree_space(*defer_tree); 3034 } 3035 3036 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 3037 defer_delta, 0); 3038 3039 /* 3040 * If there's a metaslab_load() in progress, wait for it to complete 3041 * so that we have a consistent view of the in-core space map. 3042 */ 3043 metaslab_load_wait(msp); 3044 3045 /* 3046 * Move the frees from the defer_tree back to the free 3047 * range tree (if it's loaded). Swap the freed_tree and 3048 * the defer_tree -- this is safe to do because we've 3049 * just emptied out the defer_tree. 3050 */ 3051 range_tree_vacate(*defer_tree, 3052 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 3053 if (defer_allowed) { 3054 range_tree_swap(&msp->ms_freed, defer_tree); 3055 } else { 3056 range_tree_vacate(msp->ms_freed, 3057 msp->ms_loaded ? range_tree_add : NULL, 3058 msp->ms_allocatable); 3059 } 3060 3061 msp->ms_synced_length = space_map_length(msp->ms_sm); 3062 3063 msp->ms_deferspace += defer_delta; 3064 ASSERT3S(msp->ms_deferspace, >=, 0); 3065 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 3066 if (msp->ms_deferspace != 0) { 3067 /* 3068 * Keep syncing this metaslab until all deferred frees 3069 * are back in circulation. 3070 */ 3071 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 3072 } 3073 metaslab_aux_histograms_update_done(msp, defer_allowed); 3074 3075 if (msp->ms_new) { 3076 msp->ms_new = B_FALSE; 3077 mutex_enter(&mg->mg_lock); 3078 mg->mg_ms_ready++; 3079 mutex_exit(&mg->mg_lock); 3080 } 3081 3082 /* 3083 * Re-sort metaslab within its group now that we've adjusted 3084 * its allocatable space. 3085 */ 3086 metaslab_recalculate_weight_and_sort(msp); 3087 3088 /* 3089 * If the metaslab is loaded and we've not tried to load or allocate 3090 * from it in 'metaslab_unload_delay' txgs, then unload it. 
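 *
 * For example, with metaslab_unload_delay set to 8, a metaslab last
 * selected in txg 100 becomes eligible for unloading once the
 * syncing txg reaches 109 (provided it is not being initialized).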
3091 */ 3092 if (msp->ms_loaded && 3093 msp->ms_initializing == 0 && 3094 msp->ms_selected_txg + metaslab_unload_delay < txg) { 3095 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 3096 VERIFY0(range_tree_space( 3097 msp->ms_allocating[(txg + t) & TXG_MASK])); 3098 } 3099 if (msp->ms_allocator != -1) { 3100 metaslab_passivate(msp, msp->ms_weight & 3101 ~METASLAB_ACTIVE_MASK); 3102 } 3103 3104 if (!metaslab_debug_unload) 3105 metaslab_unload(msp); 3106 } 3107 3108 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3109 ASSERT0(range_tree_space(msp->ms_freeing)); 3110 ASSERT0(range_tree_space(msp->ms_freed)); 3111 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3112 3113 msp->ms_allocated_this_txg = 0; 3114 mutex_exit(&msp->ms_lock); 3115 } 3116 3117 void 3118 metaslab_sync_reassess(metaslab_group_t *mg) 3119 { 3120 spa_t *spa = mg->mg_class->mc_spa; 3121 3122 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3123 metaslab_group_alloc_update(mg); 3124 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 3125 3126 /* 3127 * Preload the next potential metaslabs but only on active 3128 * metaslab groups. We can get into a state where the metaslab 3129 * is no longer active since we dirty metaslabs as we remove a 3130 * a device, thus potentially making the metaslab group eligible 3131 * for preloading. 3132 */ 3133 if (mg->mg_activation_count > 0) { 3134 metaslab_group_preload(mg); 3135 } 3136 spa_config_exit(spa, SCL_ALLOC, FTAG); 3137 } 3138 3139 /* 3140 * When writing a ditto block (i.e. more than one DVA for a given BP) on 3141 * the same vdev as an existing DVA of this BP, then try to allocate it 3142 * on a different metaslab than existing DVAs (i.e. a unique metaslab). 3143 */ 3144 static boolean_t 3145 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 3146 { 3147 uint64_t dva_ms_id; 3148 3149 if (DVA_GET_ASIZE(dva) == 0) 3150 return (B_TRUE); 3151 3152 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 3153 return (B_TRUE); 3154 3155 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 3156 3157 return (msp->ms_id != dva_ms_id); 3158 } 3159 3160 /* 3161 * ========================================================================== 3162 * Metaslab allocation tracing facility 3163 * ========================================================================== 3164 */ 3165 kstat_t *metaslab_trace_ksp; 3166 kstat_named_t metaslab_trace_over_limit; 3167 3168 void 3169 metaslab_alloc_trace_init(void) 3170 { 3171 ASSERT(metaslab_alloc_trace_cache == NULL); 3172 metaslab_alloc_trace_cache = kmem_cache_create( 3173 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 3174 0, NULL, NULL, NULL, NULL, NULL, 0); 3175 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 3176 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 3177 if (metaslab_trace_ksp != NULL) { 3178 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 3179 kstat_named_init(&metaslab_trace_over_limit, 3180 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 3181 kstat_install(metaslab_trace_ksp); 3182 } 3183 } 3184 3185 void 3186 metaslab_alloc_trace_fini(void) 3187 { 3188 if (metaslab_trace_ksp != NULL) { 3189 kstat_delete(metaslab_trace_ksp); 3190 metaslab_trace_ksp = NULL; 3191 } 3192 kmem_cache_destroy(metaslab_alloc_trace_cache); 3193 metaslab_alloc_trace_cache = NULL; 3194 } 3195 3196 /* 3197 * Add an allocation trace element to the allocation tracing list. 
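 *
 * The offset argument doubles as the result code: callers pass
 * either the allocated offset or, on failure, one of the TRACE_*
 * values such as TRACE_TOO_SMALL or TRACE_GROUP_FAILURE.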
3198 */ 3199 static void 3200 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 3201 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 3202 int allocator) 3203 { 3204 if (!metaslab_trace_enabled) 3205 return; 3206 3207 /* 3208 * When the tracing list reaches its maximum we remove 3209 * the second element in the list before adding a new one. 3210 * By removing the second element we preserve the original 3211 * entry as a clue to what allocations steps have already been 3212 * performed. 3213 */ 3214 if (zal->zal_size == metaslab_trace_max_entries) { 3215 metaslab_alloc_trace_t *mat_next; 3216 #ifdef DEBUG 3217 panic("too many entries in allocation list"); 3218 #endif 3219 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 3220 zal->zal_size--; 3221 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 3222 list_remove(&zal->zal_list, mat_next); 3223 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 3224 } 3225 3226 metaslab_alloc_trace_t *mat = 3227 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 3228 list_link_init(&mat->mat_list_node); 3229 mat->mat_mg = mg; 3230 mat->mat_msp = msp; 3231 mat->mat_size = psize; 3232 mat->mat_dva_id = dva_id; 3233 mat->mat_offset = offset; 3234 mat->mat_weight = 0; 3235 mat->mat_allocator = allocator; 3236 3237 if (msp != NULL) 3238 mat->mat_weight = msp->ms_weight; 3239 3240 /* 3241 * The list is part of the zio so locking is not required. Only 3242 * a single thread will perform allocations for a given zio. 3243 */ 3244 list_insert_tail(&zal->zal_list, mat); 3245 zal->zal_size++; 3246 3247 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 3248 } 3249 3250 void 3251 metaslab_trace_init(zio_alloc_list_t *zal) 3252 { 3253 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 3254 offsetof(metaslab_alloc_trace_t, mat_list_node)); 3255 zal->zal_size = 0; 3256 } 3257 3258 void 3259 metaslab_trace_fini(zio_alloc_list_t *zal) 3260 { 3261 metaslab_alloc_trace_t *mat; 3262 3263 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 3264 kmem_cache_free(metaslab_alloc_trace_cache, mat); 3265 list_destroy(&zal->zal_list); 3266 zal->zal_size = 0; 3267 } 3268 3269 /* 3270 * ========================================================================== 3271 * Metaslab block operations 3272 * ========================================================================== 3273 */ 3274 3275 static void 3276 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 3277 int allocator) 3278 { 3279 if (!(flags & METASLAB_ASYNC_ALLOC) || 3280 (flags & METASLAB_DONT_THROTTLE)) 3281 return; 3282 3283 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3284 if (!mg->mg_class->mc_alloc_throttle_enabled) 3285 return; 3286 3287 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 3288 } 3289 3290 static void 3291 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 3292 { 3293 uint64_t max = mg->mg_max_alloc_queue_depth; 3294 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3295 while (cur < max) { 3296 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 3297 cur, cur + 1) == cur) { 3298 atomic_inc_64( 3299 &mg->mg_class->mc_alloc_max_slots[allocator]); 3300 return; 3301 } 3302 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3303 } 3304 } 3305 3306 void 3307 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 3308 int allocator, boolean_t io_complete) 3309 { 3310 if (!(flags & METASLAB_ASYNC_ALLOC) || 
3311 (flags & METASLAB_DONT_THROTTLE))
3312 return;
3313
3314 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3315 if (!mg->mg_class->mc_alloc_throttle_enabled)
3316 return;
3317
3318 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
3319 if (io_complete)
3320 metaslab_group_increment_qdepth(mg, allocator);
3321 }
3322
3323 void
3324 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
3325 int allocator)
3326 {
3327 #ifdef ZFS_DEBUG
3328 const dva_t *dva = bp->blk_dva;
3329 int ndvas = BP_GET_NDVAS(bp);
3330
3331 for (int d = 0; d < ndvas; d++) {
3332 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
3333 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3334 VERIFY(zfs_refcount_not_held(
3335 &mg->mg_alloc_queue_depth[allocator], tag));
3336 }
3337 #endif
3338 }
3339
3340 static uint64_t
3341 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
3342 {
3343 uint64_t start;
3344 range_tree_t *rt = msp->ms_allocatable;
3345 metaslab_class_t *mc = msp->ms_group->mg_class;
3346
3347 VERIFY(!msp->ms_condensing);
3348 VERIFY0(msp->ms_initializing);
3349
3350 start = mc->mc_ops->msop_alloc(msp, size);
3351 if (start != -1ULL) {
3352 metaslab_group_t *mg = msp->ms_group;
3353 vdev_t *vd = mg->mg_vd;
3354
3355 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
3356 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3357 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
3358 range_tree_remove(rt, start, size);
3359
3360 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
3361 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
3362
3363 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
3364
3365 /* Track the last successful allocation */
3366 msp->ms_alloc_txg = txg;
3367 metaslab_verify_space(msp, txg);
3368 }
3369
3370 /*
3371 * Now that we've attempted the allocation we need to update the
3372 * metaslab's maximum block size since it may have changed.
3373 */
3374 msp->ms_max_size = metaslab_block_maxsize(msp);
3375 return (start);
3376 }
3377
3378 /*
3379 * Find the metaslab with the highest weight that is less than what we've
3380 * already tried. In the common case, this means that we will examine each
3381 * metaslab at most once. Note that concurrent callers could reorder metaslabs
3382 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
3383 * activated by another thread, and we fail to allocate from the metaslab we
3384 * have selected, we may not try the newly-activated metaslab, and instead
3385 * activate another metaslab. This is not optimal, but generally does not cause
3386 * any problems (a possible exception being if every metaslab is completely full
3387 * except for the newly-activated metaslab which we fail to examine).
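 *
 * The dva array holds the DVAs already chosen for this block pointer;
 * when want_unique is set we skip any metaslab that already contains
 * one of them (see metaslab_is_unique() above).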
3388 */ 3389 static metaslab_t * 3390 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3391 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 3392 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3393 { 3394 avl_index_t idx; 3395 avl_tree_t *t = &mg->mg_metaslab_tree; 3396 metaslab_t *msp = avl_find(t, search, &idx); 3397 if (msp == NULL) 3398 msp = avl_nearest(t, idx, AVL_AFTER); 3399 3400 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3401 int i; 3402 if (!metaslab_should_allocate(msp, asize)) { 3403 metaslab_trace_add(zal, mg, msp, asize, d, 3404 TRACE_TOO_SMALL, allocator); 3405 continue; 3406 } 3407 3408 /* 3409 * If the selected metaslab is condensing or being 3410 * initialized, skip it. 3411 */ 3412 if (msp->ms_condensing || msp->ms_initializing > 0) 3413 continue; 3414 3415 *was_active = msp->ms_allocator != -1; 3416 /* 3417 * If we're activating as primary, this is our first allocation 3418 * from this disk, so we don't need to check how close we are. 3419 * If the metaslab under consideration was already active, 3420 * we're getting desperate enough to steal another allocator's 3421 * metaslab, so we still don't care about distances. 3422 */ 3423 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3424 break; 3425 3426 for (i = 0; i < d; i++) { 3427 if (want_unique && 3428 !metaslab_is_unique(msp, &dva[i])) 3429 break; /* try another metaslab */ 3430 } 3431 if (i == d) 3432 break; 3433 } 3434 3435 if (msp != NULL) { 3436 search->ms_weight = msp->ms_weight; 3437 search->ms_start = msp->ms_start + 1; 3438 search->ms_allocator = msp->ms_allocator; 3439 search->ms_primary = msp->ms_primary; 3440 } 3441 return (msp); 3442 } 3443 3444 /* ARGSUSED */ 3445 static uint64_t 3446 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3447 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3448 int d, int allocator) 3449 { 3450 metaslab_t *msp = NULL; 3451 uint64_t offset = -1ULL; 3452 uint64_t activation_weight; 3453 3454 activation_weight = METASLAB_WEIGHT_PRIMARY; 3455 for (int i = 0; i < d; i++) { 3456 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3457 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3458 activation_weight = METASLAB_WEIGHT_SECONDARY; 3459 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3460 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3461 activation_weight = METASLAB_WEIGHT_CLAIM; 3462 break; 3463 } 3464 } 3465 3466 /* 3467 * If we don't have enough metaslabs active to fill the entire array, we 3468 * just use the 0th slot. 3469 */ 3470 if (mg->mg_ms_ready < mg->mg_allocators * 3) 3471 allocator = 0; 3472 3473 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3474 3475 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3476 search->ms_weight = UINT64_MAX; 3477 search->ms_start = 0; 3478 /* 3479 * At the end of the metaslab tree are the already-active metaslabs, 3480 * first the primaries, then the secondaries. When we resume searching 3481 * through the tree, we need to consider ms_allocator and ms_primary so 3482 * we start in the location right after where we left off, and don't 3483 * accidentally loop forever considering the same metaslabs. 
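 *
 * Concretely, the sentinel starts out with weight UINT64_MAX and
 * start 0 so the search begins at the head of the tree, and after
 * each candidate it is updated to that metaslab's weight, start + 1,
 * allocator and primary flag so that the next avl_find()/
 * avl_nearest() resumes strictly after it.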
3484 */ 3485 search->ms_allocator = -1; 3486 search->ms_primary = B_TRUE; 3487 for (;;) { 3488 boolean_t was_active = B_FALSE; 3489 3490 mutex_enter(&mg->mg_lock); 3491 3492 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3493 mg->mg_primaries[allocator] != NULL) { 3494 msp = mg->mg_primaries[allocator]; 3495 was_active = B_TRUE; 3496 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3497 mg->mg_secondaries[allocator] != NULL) { 3498 msp = mg->mg_secondaries[allocator]; 3499 was_active = B_TRUE; 3500 } else { 3501 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3502 want_unique, asize, allocator, zal, search, 3503 &was_active); 3504 } 3505 3506 mutex_exit(&mg->mg_lock); 3507 if (msp == NULL) { 3508 kmem_free(search, sizeof (*search)); 3509 return (-1ULL); 3510 } 3511 3512 mutex_enter(&msp->ms_lock);
3513 /* 3514 * Ensure that the metaslab we have selected is still 3515 * capable of handling our request. It's possible that 3516 * another thread may have changed the weight while we 3517 * were blocked on the metaslab lock. We check the 3518 * active status first to see if we need to reselect 3519 * a new metaslab. 3520 */ 3521 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3522 mutex_exit(&msp->ms_lock); 3523 continue; 3524 } 3525 3526 /* 3527 * If the metaslab is freshly activated for an allocator that 3528 * isn't the one we're allocating from, or if it's a primary and 3529 * we're seeking a secondary (or vice versa), we go back and 3530 * select a new metaslab. 3531 */ 3532 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 3533 (msp->ms_allocator != -1) && 3534 (msp->ms_allocator != allocator || ((activation_weight == 3535 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 3536 mutex_exit(&msp->ms_lock); 3537 continue; 3538 } 3539 3540 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 3541 activation_weight != METASLAB_WEIGHT_CLAIM) { 3542 metaslab_passivate(msp, msp->ms_weight & 3543 ~METASLAB_WEIGHT_CLAIM); 3544 mutex_exit(&msp->ms_lock); 3545 continue; 3546 } 3547 3548 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 3549 mutex_exit(&msp->ms_lock); 3550 continue; 3551 } 3552 3553 msp->ms_selected_txg = txg; 3554
3555 /* 3556 * Now that we have the lock, recheck to see if we should 3557 * continue to use this metaslab for this allocation. The 3558 * metaslab is now loaded so metaslab_should_allocate() can 3559 * accurately determine if the allocation attempt should 3560 * proceed. 3561 */ 3562 if (!metaslab_should_allocate(msp, asize)) { 3563 /* Passivate this metaslab and select a new one. */ 3564 metaslab_trace_add(zal, mg, msp, asize, d, 3565 TRACE_TOO_SMALL, allocator); 3566 goto next; 3567 } 3568 3569 /* 3570 * If this metaslab is currently condensing then pick again as 3571 * we can't manipulate this metaslab until it's committed 3572 * to disk. If this metaslab is being initialized, we shouldn't 3573 * allocate from it since the allocated region might be 3574 * overwritten after allocation.
3575 */ 3576 if (msp->ms_condensing) { 3577 metaslab_trace_add(zal, mg, msp, asize, d, 3578 TRACE_CONDENSING, allocator); 3579 metaslab_passivate(msp, msp->ms_weight & 3580 ~METASLAB_ACTIVE_MASK); 3581 mutex_exit(&msp->ms_lock); 3582 continue; 3583 } else if (msp->ms_initializing > 0) { 3584 metaslab_trace_add(zal, mg, msp, asize, d, 3585 TRACE_INITIALIZING, allocator); 3586 metaslab_passivate(msp, msp->ms_weight & 3587 ~METASLAB_ACTIVE_MASK); 3588 mutex_exit(&msp->ms_lock); 3589 continue; 3590 } 3591 3592 offset = metaslab_block_alloc(msp, asize, txg); 3593 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 3594 3595 if (offset != -1ULL) { 3596 /* Proactively passivate the metaslab, if needed */ 3597 metaslab_segment_may_passivate(msp); 3598 break; 3599 } 3600 next: 3601 ASSERT(msp->ms_loaded); 3602 3603 /* 3604 * We were unable to allocate from this metaslab so determine 3605 * a new weight for this metaslab. Now that we have loaded 3606 * the metaslab we can provide a better hint to the metaslab 3607 * selector. 3608 * 3609 * For space-based metaslabs, we use the maximum block size. 3610 * This information is only available when the metaslab 3611 * is loaded and is more accurate than the generic free 3612 * space weight that was calculated by metaslab_weight(). 3613 * This information allows us to quickly compare the maximum 3614 * available allocation in the metaslab to the allocation 3615 * size being requested. 3616 * 3617 * For segment-based metaslabs, determine the new weight 3618 * based on the highest bucket in the range tree. We 3619 * explicitly use the loaded segment weight (i.e. the range 3620 * tree histogram) since it contains the space that is 3621 * currently available for allocation and is accurate 3622 * even within a sync pass. 3623 */ 3624 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3625 uint64_t weight = metaslab_block_maxsize(msp); 3626 WEIGHT_SET_SPACEBASED(weight); 3627 metaslab_passivate(msp, weight); 3628 } else { 3629 metaslab_passivate(msp, 3630 metaslab_weight_from_range_tree(msp)); 3631 } 3632 3633 /* 3634 * We have just failed an allocation attempt, check 3635 * that metaslab_should_allocate() agrees. Otherwise, 3636 * we may end up in an infinite loop retrying the same 3637 * metaslab. 3638 */ 3639 ASSERT(!metaslab_should_allocate(msp, asize)); 3640 3641 mutex_exit(&msp->ms_lock); 3642 } 3643 mutex_exit(&msp->ms_lock); 3644 kmem_free(search, sizeof (*search)); 3645 return (offset); 3646 } 3647 3648 static uint64_t 3649 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3650 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3651 int d, int allocator) 3652 { 3653 uint64_t offset; 3654 ASSERT(mg->mg_initialized); 3655 3656 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 3657 dva, d, allocator); 3658 3659 mutex_enter(&mg->mg_lock); 3660 if (offset == -1ULL) { 3661 mg->mg_failed_allocations++; 3662 metaslab_trace_add(zal, mg, NULL, asize, d, 3663 TRACE_GROUP_FAILURE, allocator); 3664 if (asize == SPA_GANGBLOCKSIZE) { 3665 /* 3666 * This metaslab group was unable to allocate 3667 * the minimum gang block size so it must be out of 3668 * space. We must notify the allocation throttle 3669 * to start skipping allocation attempts to this 3670 * metaslab group until more space becomes available. 3671 * Note: this failure cannot be caused by the 3672 * allocation throttle since the allocation throttle 3673 * is only responsible for skipping devices and 3674 * not failing block allocations. 
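 * The mg_no_free_space flag set below is what implements that skipping:
 * metaslab_group_allocatable() consults it for normal allocations, while
 * gang and try-hard allocations bypass that check (see metaslab_alloc_dva())
 * so a nearly-full pool can still gang rather than return ENOSPC early.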
3675 */ 3676 mg->mg_no_free_space = B_TRUE; 3677 } 3678 } 3679 mg->mg_allocations++; 3680 mutex_exit(&mg->mg_lock); 3681 return (offset); 3682 } 3683 3684 /* 3685 * Allocate a block for the specified i/o. 3686 */ 3687 int 3688 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3689 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3690 zio_alloc_list_t *zal, int allocator) 3691 { 3692 metaslab_group_t *mg, *rotor; 3693 vdev_t *vd; 3694 boolean_t try_hard = B_FALSE; 3695 3696 ASSERT(!DVA_IS_VALID(&dva[d])); 3697 3698 /* 3699 * For testing, make some blocks above a certain size be gang blocks. 3700 * This will also test spilling from special to normal. 3701 */ 3702 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3703 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3704 allocator); 3705 return (SET_ERROR(ENOSPC)); 3706 } 3707 3708 /* 3709 * Start at the rotor and loop through all mgs until we find something. 3710 * Note that there's no locking on mc_rotor or mc_aliquot because 3711 * nothing actually breaks if we miss a few updates -- we just won't 3712 * allocate quite as evenly. It all balances out over time. 3713 * 3714 * If we are doing ditto or log blocks, try to spread them across 3715 * consecutive vdevs. If we're forced to reuse a vdev before we've 3716 * allocated all of our ditto blocks, then try and spread them out on 3717 * that vdev as much as possible. If it turns out to not be possible, 3718 * gradually lower our standards until anything becomes acceptable. 3719 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3720 * gives us hope of containing our fault domains to something we're 3721 * able to reason about. Otherwise, any two top-level vdev failures 3722 * will guarantee the loss of data. With consecutive allocation, 3723 * only two adjacent top-level vdev failures will result in data loss. 3724 * 3725 * If we are doing gang blocks (hintdva is non-NULL), try to keep 3726 * ourselves on the same vdev as our gang block header. That 3727 * way, we can hope for locality in vdev_cache, plus it makes our 3728 * fault domains something tractable. 3729 */ 3730 if (hintdva) { 3731 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3732 3733 /* 3734 * It's possible the vdev we're using as the hint no 3735 * longer exists or its mg has been closed (e.g. by 3736 * device removal). Consult the rotor when 3737 * all else fails. 3738 */ 3739 if (vd != NULL && vd->vdev_mg != NULL) { 3740 mg = vd->vdev_mg; 3741 3742 if (flags & METASLAB_HINTBP_AVOID && 3743 mg->mg_next != NULL) 3744 mg = mg->mg_next; 3745 } else { 3746 mg = mc->mc_rotor; 3747 } 3748 } else if (d != 0) { 3749 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3750 mg = vd->vdev_mg->mg_next; 3751 } else { 3752 ASSERT(mc->mc_rotor != NULL); 3753 mg = mc->mc_rotor; 3754 } 3755 3756 /* 3757 * If the hint put us into the wrong metaslab class, or into a 3758 * metaslab group that has been passivated, just follow the rotor. 3759 */ 3760 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3761 mg = mc->mc_rotor; 3762 3763 rotor = mg; 3764 top: 3765 do { 3766 boolean_t allocatable; 3767 3768 ASSERT(mg->mg_activation_count == 1); 3769 vd = mg->mg_vd; 3770 3771 /* 3772 * Don't allocate from faulted devices. 
3773 */ 3774 if (try_hard) { 3775 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3776 allocatable = vdev_allocatable(vd); 3777 spa_config_exit(spa, SCL_ZIO, FTAG); 3778 } else { 3779 allocatable = vdev_allocatable(vd); 3780 } 3781 3782 /* 3783 * Determine if the selected metaslab group is eligible 3784 * for allocations. If we're ganging then don't allow 3785 * this metaslab group to skip allocations since that would 3786 * inadvertently return ENOSPC and suspend the pool 3787 * even though space is still available. 3788 */ 3789 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3790 allocatable = metaslab_group_allocatable(mg, rotor, 3791 psize, allocator); 3792 } 3793 3794 if (!allocatable) { 3795 metaslab_trace_add(zal, mg, NULL, psize, d, 3796 TRACE_NOT_ALLOCATABLE, allocator); 3797 goto next; 3798 } 3799 3800 ASSERT(mg->mg_initialized); 3801
3802 /* 3803 * Avoid writing single-copy data to a failing, 3804 * non-redundant vdev, unless we've already tried all 3805 * other vdevs. 3806 */ 3807 if ((vd->vdev_stat.vs_write_errors > 0 || 3808 vd->vdev_state < VDEV_STATE_HEALTHY) && 3809 d == 0 && !try_hard && vd->vdev_children == 0) { 3810 metaslab_trace_add(zal, mg, NULL, psize, d, 3811 TRACE_VDEV_ERROR, allocator); 3812 goto next; 3813 } 3814 3815 ASSERT(mg->mg_class == mc); 3816 3817 uint64_t asize = vdev_psize_to_asize(vd, psize); 3818 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3819
3820 /* 3821 * If we don't need to try hard, then require that the 3822 * block be on a different metaslab from any other DVAs 3823 * in this BP (unique=true). If we are trying hard, then 3824 * allow any metaslab to be used (unique=false). 3825 */ 3826 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3827 !try_hard, dva, d, allocator); 3828 3829 if (offset != -1ULL) { 3830 /* 3831 * If we've just selected this metaslab group, 3832 * figure out whether the corresponding vdev is 3833 * over- or under-used relative to the pool, 3834 * and set an allocation bias to even it out. 3835 */ 3836 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3837 vdev_stat_t *vs = &vd->vdev_stat; 3838 int64_t vu, cu; 3839 3840 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3841 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3842 3843 /* 3844 * Calculate how much more or less we should 3845 * try to allocate from this device during 3846 * this iteration around the rotor. 3847 * For example, if a device is 80% full 3848 * and the pool is 20% full then we should 3849 * reduce allocations by 60% on this device. 3850 * 3851 * mg_bias = (20 - 80) * 512K / 100 = -307K 3852 * 3853 * This reduces allocations by 307K for this 3854 * iteration. 3855 */ 3856 mg->mg_bias = ((cu - vu) * 3857 (int64_t)mg->mg_aliquot) / 100; 3858 } else if (!metaslab_bias_enabled) { 3859 mg->mg_bias = 0; 3860 } 3861
3862 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3863 mg->mg_aliquot + mg->mg_bias) { 3864 mc->mc_rotor = mg->mg_next; 3865 mc->mc_aliquot = 0; 3866 } 3867 3868 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3869 DVA_SET_OFFSET(&dva[d], offset); 3870 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3871 DVA_SET_ASIZE(&dva[d], asize); 3872 3873 return (0); 3874 } 3875 next: 3876 mc->mc_rotor = mg->mg_next; 3877 mc->mc_aliquot = 0; 3878 } while ((mg = mg->mg_next) != rotor); 3879 3880 /* 3881 * If we haven't tried hard, do so now.
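 * Trying hard relaxes the constraints applied above: vdev_allocatable() is
 * rechecked under SCL_ZIO, metaslab_group_allocatable() no longer gets to
 * skip a group, single-copy data may land on a vdev with write errors, and
 * want_unique becomes B_FALSE so a DVA may share a metaslab with the BP's
 * other DVAs.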
3882 */ 3883 if (!try_hard) { 3884 try_hard = B_TRUE; 3885 goto top; 3886 } 3887 3888 bzero(&dva[d], sizeof (dva_t)); 3889 3890 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 3891 return (SET_ERROR(ENOSPC)); 3892 } 3893 3894 void 3895 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3896 boolean_t checkpoint) 3897 { 3898 metaslab_t *msp; 3899 spa_t *spa = vd->vdev_spa; 3900 3901 ASSERT(vdev_is_concrete(vd)); 3902 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3903 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3904 3905 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3906 3907 VERIFY(!msp->ms_condensing); 3908 VERIFY3U(offset, >=, msp->ms_start); 3909 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 3910 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3911 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 3912 3913 metaslab_check_free_impl(vd, offset, asize); 3914 3915 mutex_enter(&msp->ms_lock); 3916 if (range_tree_is_empty(msp->ms_freeing) && 3917 range_tree_is_empty(msp->ms_checkpointing)) { 3918 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 3919 } 3920 3921 if (checkpoint) { 3922 ASSERT(spa_has_checkpoint(spa)); 3923 range_tree_add(msp->ms_checkpointing, offset, asize); 3924 } else { 3925 range_tree_add(msp->ms_freeing, offset, asize); 3926 } 3927 mutex_exit(&msp->ms_lock); 3928 } 3929 3930 /* ARGSUSED */ 3931 void 3932 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3933 uint64_t size, void *arg) 3934 { 3935 boolean_t *checkpoint = arg; 3936 3937 ASSERT3P(checkpoint, !=, NULL); 3938 3939 if (vd->vdev_ops->vdev_op_remap != NULL) 3940 vdev_indirect_mark_obsolete(vd, offset, size); 3941 else 3942 metaslab_free_impl(vd, offset, size, *checkpoint); 3943 } 3944 3945 static void 3946 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 3947 boolean_t checkpoint) 3948 { 3949 spa_t *spa = vd->vdev_spa; 3950 3951 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3952 3953 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 3954 return; 3955 3956 if (spa->spa_vdev_removal != NULL && 3957 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 3958 vdev_is_concrete(vd)) { 3959 /* 3960 * Note: we check if the vdev is concrete because when 3961 * we complete the removal, we first change the vdev to be 3962 * an indirect vdev (in open context), and then (in syncing 3963 * context) clear spa_vdev_removal. 3964 */ 3965 free_from_removing_vdev(vd, offset, size); 3966 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 3967 vdev_indirect_mark_obsolete(vd, offset, size); 3968 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3969 metaslab_free_impl_cb, &checkpoint); 3970 } else { 3971 metaslab_free_concrete(vd, offset, size, checkpoint); 3972 } 3973 } 3974 3975 typedef struct remap_blkptr_cb_arg { 3976 blkptr_t *rbca_bp; 3977 spa_remap_cb_t rbca_cb; 3978 vdev_t *rbca_remap_vd; 3979 uint64_t rbca_remap_offset; 3980 void *rbca_cb_arg; 3981 } remap_blkptr_cb_arg_t; 3982 3983 void 3984 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3985 uint64_t size, void *arg) 3986 { 3987 remap_blkptr_cb_arg_t *rbca = arg; 3988 blkptr_t *bp = rbca->rbca_bp; 3989 3990 /* We can not remap split blocks. 
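 * A split block is one whose DVA was mapped onto more than one segment of
 * the indirect vdev's mapping, so a single invocation of this callback
 * covers only part of the block; the size check below detects this and
 * leaves the BP unchanged.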
*/ 3991 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 3992 return; 3993 ASSERT0(inner_offset); 3994 3995 if (rbca->rbca_cb != NULL) { 3996 /* 3997 * At this point we know that we are not handling split 3998 * blocks and we invoke the callback on the previous 3999 * vdev which must be indirect. 4000 */ 4001 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 4002 4003 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 4004 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 4005 4006 /* set up remap_blkptr_cb_arg for the next call */ 4007 rbca->rbca_remap_vd = vd; 4008 rbca->rbca_remap_offset = offset; 4009 } 4010 4011 /* 4012 * The phys birth time is that of dva[0]. This ensures that we know 4013 * when each dva was written, so that resilver can determine which 4014 * blocks need to be scrubbed (i.e. those written during the time 4015 * the vdev was offline). It also ensures that the key used in 4016 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 4017 * we didn't change the phys_birth, a lookup in the ARC for a 4018 * remapped BP could find the data that was previously stored at 4019 * this vdev + offset. 4020 */ 4021 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 4022 DVA_GET_VDEV(&bp->blk_dva[0])); 4023 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 4024 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 4025 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 4026 4027 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 4028 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 4029 } 4030 4031 /* 4032 * If the block pointer contains any indirect DVAs, modify them to refer to 4033 * concrete DVAs. Note that this will sometimes not be possible, leaving 4034 * the indirect DVA in place. This happens if the indirect DVA spans multiple 4035 * segments in the mapping (i.e. it is a "split block"). 4036 * 4037 * If the BP was remapped, calls the callback on the original dva (note the 4038 * callback can be called multiple times if the original indirect DVA refers 4039 * to another indirect DVA, etc). 4040 * 4041 * Returns TRUE if the BP was remapped. 4042 */ 4043 boolean_t 4044 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 4045 { 4046 remap_blkptr_cb_arg_t rbca; 4047 4048 if (!zfs_remap_blkptr_enable) 4049 return (B_FALSE); 4050 4051 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 4052 return (B_FALSE); 4053 4054 /* 4055 * Dedup BP's can not be remapped, because ddt_phys_select() depends 4056 * on DVA[0] being the same in the BP as in the DDT (dedup table). 4057 */ 4058 if (BP_GET_DEDUP(bp)) 4059 return (B_FALSE); 4060 4061 /* 4062 * Gang blocks can not be remapped, because 4063 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 4064 * the BP used to read the gang block header (GBH) being the same 4065 * as the DVA[0] that we allocated for the GBH. 4066 */ 4067 if (BP_IS_GANG(bp)) 4068 return (B_FALSE); 4069 4070 /* 4071 * Embedded BP's have no DVA to remap. 4072 */ 4073 if (BP_GET_NDVAS(bp) < 1) 4074 return (B_FALSE); 4075 4076 /* 4077 * Note: we only remap dva[0]. If we remapped other dvas, we 4078 * would no longer know what their phys birth txg is. 
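 * (A block pointer carries only a single phys_birth field, which
 * remap_blkptr_cb() updates for dva[0]; there is no per-DVA field in which
 * the equivalent information for the other DVAs could be kept.)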
4079 */ 4080 dva_t *dva = &bp->blk_dva[0]; 4081 4082 uint64_t offset = DVA_GET_OFFSET(dva); 4083 uint64_t size = DVA_GET_ASIZE(dva); 4084 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 4085 4086 if (vd->vdev_ops->vdev_op_remap == NULL) 4087 return (B_FALSE); 4088 4089 rbca.rbca_bp = bp; 4090 rbca.rbca_cb = callback; 4091 rbca.rbca_remap_vd = vd; 4092 rbca.rbca_remap_offset = offset; 4093 rbca.rbca_cb_arg = arg; 4094 4095 /* 4096 * remap_blkptr_cb() will be called in order for each level of 4097 * indirection, until a concrete vdev is reached or a split block is 4098 * encountered. old_vd and old_offset are updated within the callback 4099 * as we go from the one indirect vdev to the next one (either concrete 4100 * or indirect again) in that order. 4101 */ 4102 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 4103 4104 /* Check if the DVA wasn't remapped because it is a split block */ 4105 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 4106 return (B_FALSE); 4107 4108 return (B_TRUE); 4109 } 4110 4111 /* 4112 * Undo the allocation of a DVA which happened in the given transaction group. 4113 */ 4114 void 4115 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4116 { 4117 metaslab_t *msp; 4118 vdev_t *vd; 4119 uint64_t vdev = DVA_GET_VDEV(dva); 4120 uint64_t offset = DVA_GET_OFFSET(dva); 4121 uint64_t size = DVA_GET_ASIZE(dva); 4122 4123 ASSERT(DVA_IS_VALID(dva)); 4124 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4125 4126 if (txg > spa_freeze_txg(spa)) 4127 return; 4128 4129 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 4130 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 4131 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 4132 (u_longlong_t)vdev, (u_longlong_t)offset); 4133 ASSERT(0); 4134 return; 4135 } 4136 4137 ASSERT(!vd->vdev_removing); 4138 ASSERT(vdev_is_concrete(vd)); 4139 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 4140 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 4141 4142 if (DVA_GET_GANG(dva)) 4143 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4144 4145 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4146 4147 mutex_enter(&msp->ms_lock); 4148 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 4149 offset, size); 4150 4151 VERIFY(!msp->ms_condensing); 4152 VERIFY3U(offset, >=, msp->ms_start); 4153 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 4154 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 4155 msp->ms_size); 4156 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4157 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4158 range_tree_add(msp->ms_allocatable, offset, size); 4159 mutex_exit(&msp->ms_lock); 4160 } 4161 4162 /* 4163 * Free the block represented by the given DVA. 4164 */ 4165 void 4166 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 4167 { 4168 uint64_t vdev = DVA_GET_VDEV(dva); 4169 uint64_t offset = DVA_GET_OFFSET(dva); 4170 uint64_t size = DVA_GET_ASIZE(dva); 4171 vdev_t *vd = vdev_lookup_top(spa, vdev); 4172 4173 ASSERT(DVA_IS_VALID(dva)); 4174 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4175 4176 if (DVA_GET_GANG(dva)) { 4177 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4178 } 4179 4180 metaslab_free_impl(vd, offset, size, checkpoint); 4181 } 4182 4183 /* 4184 * Reserve some allocation slots. The reservation system must be called 4185 * before we call into the allocator. 
If there aren't any available slots 4186 * then the I/O will be throttled until an I/O completes and its slots are 4187 * freed up. The function returns true if it was successful in placing 4188 * the reservation. 4189 */ 4190 boolean_t 4191 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 4192 zio_t *zio, int flags) 4193 { 4194 uint64_t available_slots = 0; 4195 boolean_t slot_reserved = B_FALSE; 4196 uint64_t max = mc->mc_alloc_max_slots[allocator]; 4197 4198 ASSERT(mc->mc_alloc_throttle_enabled); 4199 mutex_enter(&mc->mc_lock); 4200 4201 uint64_t reserved_slots = 4202 zfs_refcount_count(&mc->mc_alloc_slots[allocator]); 4203 if (reserved_slots < max) 4204 available_slots = max - reserved_slots; 4205 4206 if (slots <= available_slots || GANG_ALLOCATION(flags) || 4207 flags & METASLAB_MUST_RESERVE) { 4208 /* 4209 * We reserve the slots individually so that we can unreserve 4210 * them individually when an I/O completes. 4211 */ 4212 for (int d = 0; d < slots; d++) { 4213 reserved_slots = 4214 zfs_refcount_add(&mc->mc_alloc_slots[allocator], 4215 zio); 4216 } 4217 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 4218 slot_reserved = B_TRUE; 4219 } 4220 4221 mutex_exit(&mc->mc_lock); 4222 return (slot_reserved); 4223 } 4224 4225 void 4226 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 4227 int allocator, zio_t *zio) 4228 { 4229 ASSERT(mc->mc_alloc_throttle_enabled); 4230 mutex_enter(&mc->mc_lock); 4231 for (int d = 0; d < slots; d++) { 4232 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], 4233 zio); 4234 } 4235 mutex_exit(&mc->mc_lock); 4236 } 4237 4238 static int 4239 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 4240 uint64_t txg) 4241 { 4242 metaslab_t *msp; 4243 spa_t *spa = vd->vdev_spa; 4244 int error = 0; 4245 4246 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 4247 return (ENXIO); 4248 4249 ASSERT3P(vd->vdev_ms, !=, NULL); 4250 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4251 4252 mutex_enter(&msp->ms_lock); 4253 4254 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 4255 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 4256 /* 4257 * No need to fail in that case; someone else has activated the 4258 * metaslab, but that doesn't preclude us from using it. 
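 * (metaslab_activate() reports that case as EBUSY, so EBUSY is forgiven just
 * below and the claim proceeds to check ms_allocatable as usual.)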
4259 */ 4260 if (error == EBUSY) 4261 error = 0; 4262 4263 if (error == 0 && 4264 !range_tree_contains(msp->ms_allocatable, offset, size)) 4265 error = SET_ERROR(ENOENT); 4266 4267 if (error || txg == 0) { /* txg == 0 indicates dry run */ 4268 mutex_exit(&msp->ms_lock); 4269 return (error); 4270 } 4271 4272 VERIFY(!msp->ms_condensing); 4273 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4274 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4275 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 4276 msp->ms_size); 4277 range_tree_remove(msp->ms_allocatable, offset, size); 4278 4279 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 4280 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4281 vdev_dirty(vd, VDD_METASLAB, msp, txg); 4282 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 4283 offset, size); 4284 } 4285 4286 mutex_exit(&msp->ms_lock); 4287 4288 return (0); 4289 } 4290 4291 typedef struct metaslab_claim_cb_arg_t { 4292 uint64_t mcca_txg; 4293 int mcca_error; 4294 } metaslab_claim_cb_arg_t; 4295 4296 /* ARGSUSED */ 4297 static void 4298 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4299 uint64_t size, void *arg) 4300 { 4301 metaslab_claim_cb_arg_t *mcca_arg = arg; 4302 4303 if (mcca_arg->mcca_error == 0) { 4304 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 4305 size, mcca_arg->mcca_txg); 4306 } 4307 } 4308 4309 int 4310 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 4311 { 4312 if (vd->vdev_ops->vdev_op_remap != NULL) { 4313 metaslab_claim_cb_arg_t arg; 4314 4315 /* 4316 * Only zdb(1M) can claim on indirect vdevs. This is used 4317 * to detect leaks of mapped space (that are not accounted 4318 * for in the obsolete counts, spacemap, or bpobj). 4319 */ 4320 ASSERT(!spa_writeable(vd->vdev_spa)); 4321 arg.mcca_error = 0; 4322 arg.mcca_txg = txg; 4323 4324 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4325 metaslab_claim_impl_cb, &arg); 4326 4327 if (arg.mcca_error == 0) { 4328 arg.mcca_error = metaslab_claim_concrete(vd, 4329 offset, size, txg); 4330 } 4331 return (arg.mcca_error); 4332 } else { 4333 return (metaslab_claim_concrete(vd, offset, size, txg)); 4334 } 4335 } 4336 4337 /* 4338 * Intent log support: upon opening the pool after a crash, notify the SPA 4339 * of blocks that the intent log has allocated for immediate write, but 4340 * which are still considered free by the SPA because the last transaction 4341 * group didn't commit yet. 4342 */ 4343 static int 4344 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4345 { 4346 uint64_t vdev = DVA_GET_VDEV(dva); 4347 uint64_t offset = DVA_GET_OFFSET(dva); 4348 uint64_t size = DVA_GET_ASIZE(dva); 4349 vdev_t *vd; 4350 4351 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 4352 return (SET_ERROR(ENXIO)); 4353 } 4354 4355 ASSERT(DVA_IS_VALID(dva)); 4356 4357 if (DVA_GET_GANG(dva)) 4358 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4359 4360 return (metaslab_claim_impl(vd, offset, size, txg)); 4361 } 4362 4363 int 4364 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 4365 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 4366 zio_alloc_list_t *zal, zio_t *zio, int allocator) 4367 { 4368 dva_t *dva = bp->blk_dva; 4369 dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; 4370 int error = 0; 4371 4372 ASSERT(bp->blk_birth == 0); 4373 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 4374 4375 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4376 4377 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 4378 spa_config_exit(spa, SCL_ALLOC, FTAG); 4379 return (SET_ERROR(ENOSPC)); 4380 } 4381 4382 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 4383 ASSERT(BP_GET_NDVAS(bp) == 0); 4384 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 4385 ASSERT3P(zal, !=, NULL); 4386 4387 for (int d = 0; d < ndvas; d++) { 4388 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 4389 txg, flags, zal, allocator); 4390 if (error != 0) { 4391 for (d--; d >= 0; d--) { 4392 metaslab_unalloc_dva(spa, &dva[d], txg); 4393 metaslab_group_alloc_decrement(spa, 4394 DVA_GET_VDEV(&dva[d]), zio, flags, 4395 allocator, B_FALSE); 4396 bzero(&dva[d], sizeof (dva_t)); 4397 } 4398 spa_config_exit(spa, SCL_ALLOC, FTAG); 4399 return (error); 4400 } else { 4401 /* 4402 * Update the metaslab group's queue depth 4403 * based on the newly allocated dva. 4404 */ 4405 metaslab_group_alloc_increment(spa, 4406 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 4407 } 4408 4409 } 4410 ASSERT(error == 0); 4411 ASSERT(BP_GET_NDVAS(bp) == ndvas); 4412 4413 spa_config_exit(spa, SCL_ALLOC, FTAG); 4414 4415 BP_SET_BIRTH(bp, txg, txg); 4416 4417 return (0); 4418 } 4419 4420 void 4421 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 4422 { 4423 const dva_t *dva = bp->blk_dva; 4424 int ndvas = BP_GET_NDVAS(bp); 4425 4426 ASSERT(!BP_IS_HOLE(bp)); 4427 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 4428 4429 /* 4430 * If we have a checkpoint for the pool we need to make sure that 4431 * the blocks that we free that are part of the checkpoint won't be 4432 * reused until the checkpoint is discarded or we revert to it. 4433 * 4434 * The checkpoint flag is passed down the metaslab_free code path 4435 * and is set whenever we want to add a block to the checkpoint's 4436 * accounting. That is, we "checkpoint" blocks that existed at the 4437 * time the checkpoint was created and are therefore referenced by 4438 * the checkpointed uberblock. 4439 * 4440 * Note that, we don't checkpoint any blocks if the current 4441 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 4442 * normally as they will be referenced by the checkpointed uberblock. 4443 */ 4444 boolean_t checkpoint = B_FALSE; 4445 if (bp->blk_birth <= spa->spa_checkpoint_txg && 4446 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 4447 /* 4448 * At this point, if the block is part of the checkpoint 4449 * there is no way it was created in the current txg. 4450 */ 4451 ASSERT(!now); 4452 ASSERT3U(spa_syncing_txg(spa), ==, txg); 4453 checkpoint = B_TRUE; 4454 } 4455 4456 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 4457 4458 for (int d = 0; d < ndvas; d++) { 4459 if (now) { 4460 metaslab_unalloc_dva(spa, &dva[d], txg); 4461 } else { 4462 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 4463 metaslab_free_dva(spa, &dva[d], checkpoint); 4464 } 4465 } 4466 4467 spa_config_exit(spa, SCL_FREE, FTAG); 4468 } 4469 4470 int 4471 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 4472 { 4473 const dva_t *dva = bp->blk_dva; 4474 int ndvas = BP_GET_NDVAS(bp); 4475 int error = 0; 4476 4477 ASSERT(!BP_IS_HOLE(bp)); 4478 4479 if (txg != 0) { 4480 /* 4481 * First do a dry run to make sure all DVAs are claimable, 4482 * so we don't have to unwind from partial failures below. 
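 * (The call below passes txg == 0, the dry-run convention understood by
 * metaslab_claim_concrete(): the ranges are verified but nothing is removed
 * from ms_allocatable and nothing is dirtied.)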
4483 */ 4484 if ((error = metaslab_claim(spa, bp, 0)) != 0) 4485 return (error); 4486 } 4487 4488 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4489 4490 for (int d = 0; d < ndvas; d++) { 4491 error = metaslab_claim_dva(spa, &dva[d], txg); 4492 if (error != 0) 4493 break; 4494 } 4495 4496 spa_config_exit(spa, SCL_ALLOC, FTAG); 4497 4498 ASSERT(error == 0 || txg == 0); 4499 4500 return (error); 4501 } 4502 4503 /* ARGSUSED */ 4504 static void 4505 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 4506 uint64_t size, void *arg) 4507 { 4508 if (vd->vdev_ops == &vdev_indirect_ops) 4509 return; 4510 4511 metaslab_check_free_impl(vd, offset, size); 4512 } 4513 4514 static void 4515 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 4516 { 4517 metaslab_t *msp; 4518 spa_t *spa = vd->vdev_spa; 4519 4520 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4521 return; 4522 4523 if (vd->vdev_ops->vdev_op_remap != NULL) { 4524 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4525 metaslab_check_free_impl_cb, NULL); 4526 return; 4527 } 4528 4529 ASSERT(vdev_is_concrete(vd)); 4530 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 4531 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4532 4533 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4534 4535 mutex_enter(&msp->ms_lock); 4536 if (msp->ms_loaded) { 4537 range_tree_verify_not_present(msp->ms_allocatable, 4538 offset, size); 4539 } 4540 4541 range_tree_verify_not_present(msp->ms_freeing, offset, size); 4542 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 4543 range_tree_verify_not_present(msp->ms_freed, offset, size); 4544 for (int j = 0; j < TXG_DEFER_SIZE; j++) 4545 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 4546 mutex_exit(&msp->ms_lock); 4547 } 4548 4549 void 4550 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 4551 { 4552 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4553 return; 4554 4555 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4556 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 4557 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 4558 vdev_t *vd = vdev_lookup_top(spa, vdev); 4559 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 4560 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 4561 4562 if (DVA_GET_GANG(&bp->blk_dva[i])) 4563 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4564 4565 ASSERT3P(vd, !=, NULL); 4566 4567 metaslab_check_free_impl(vd, offset, size); 4568 } 4569 spa_config_exit(spa, SCL_VDEV, FTAG); 4570 } 4571