1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright (c) 2017, Intel Corporation. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/space_map.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zfeature.h> 38 #include <sys/vdev_indirect_mapping.h> 39 #include <sys/zap.h> 40 41 #define GANG_ALLOCATION(flags) \ 42 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 43 44 uint64_t metaslab_aliquot = 512ULL << 10; 45 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 46 47 /* 48 * Since we can touch multiple metaslabs (and their respective space maps) 49 * with each transaction group, we benefit from having a smaller space map 50 * block size since it allows us to issue more I/O operations scattered 51 * around the disk. 52 */ 53 int zfs_metaslab_sm_blksz = (1 << 12); 54 55 /* 56 * The in-core space map representation is more compact than its on-disk form. 57 * The zfs_condense_pct determines how much more compact the in-core 58 * space map representation must be before we compact it on-disk. 59 * Values should be greater than or equal to 100. 60 */ 61 int zfs_condense_pct = 200; 62 63 /* 64 * Condensing a metaslab is not guaranteed to actually reduce the amount of 65 * space used on disk. In particular, a space map uses data in increments of 66 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 67 * same number of blocks after condensing. Since the goal of condensing is to 68 * reduce the number of IOPs required to read the space map, we only want to 69 * condense when we can be sure we will reduce the number of blocks used by the 70 * space map. Unfortunately, we cannot precisely compute whether or not this is 71 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 72 * we apply the following heuristic: do not condense a spacemap unless the 73 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 74 * blocks. 75 */ 76 int zfs_metaslab_condense_block_threshold = 4; 77 78 /* 79 * The zfs_mg_noalloc_threshold defines which metaslab groups should 80 * be eligible for allocation. The value is defined as a percentage of 81 * free space. Metaslab groups that have more free space than 82 * zfs_mg_noalloc_threshold are always eligible for allocations. 
 * Once a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status, allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set, load all metaslabs when the pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set, prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached, allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);

kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
        metaslab_class_t *mc;

        mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

        mc->mc_spa = spa;
        mc->mc_rotor = NULL;
        mc->mc_ops = ops;
        mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
        mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
            sizeof (zfs_refcount_t), KM_SLEEP);
        mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
            sizeof (uint64_t), KM_SLEEP);
        for (int i = 0; i < spa->spa_alloc_count; i++)
                zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);

        return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
        ASSERT(mc->mc_rotor == NULL);
        ASSERT(mc->mc_alloc == 0);
        ASSERT(mc->mc_deferred == 0);
        ASSERT(mc->mc_space == 0);
        ASSERT(mc->mc_dspace == 0);

        for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
                zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
        kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
            sizeof (zfs_refcount_t));
        kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
            sizeof (uint64_t));
        mutex_destroy(&mc->mc_lock);
        kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
        metaslab_group_t *mg;
        vdev_t *vd;

        /*
         * Must hold one of the spa_config locks.
274 */ 275 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 276 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 277 278 if ((mg = mc->mc_rotor) == NULL) 279 return (0); 280 281 do { 282 vd = mg->mg_vd; 283 ASSERT(vd->vdev_mg != NULL); 284 ASSERT3P(vd->vdev_top, ==, vd); 285 ASSERT3P(mg->mg_class, ==, mc); 286 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 287 } while ((mg = mg->mg_next) != mc->mc_rotor); 288 289 return (0); 290 } 291 292 static void 293 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 294 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 295 { 296 atomic_add_64(&mc->mc_alloc, alloc_delta); 297 atomic_add_64(&mc->mc_deferred, defer_delta); 298 atomic_add_64(&mc->mc_space, space_delta); 299 atomic_add_64(&mc->mc_dspace, dspace_delta); 300 } 301 302 uint64_t 303 metaslab_class_get_alloc(metaslab_class_t *mc) 304 { 305 return (mc->mc_alloc); 306 } 307 308 uint64_t 309 metaslab_class_get_deferred(metaslab_class_t *mc) 310 { 311 return (mc->mc_deferred); 312 } 313 314 uint64_t 315 metaslab_class_get_space(metaslab_class_t *mc) 316 { 317 return (mc->mc_space); 318 } 319 320 uint64_t 321 metaslab_class_get_dspace(metaslab_class_t *mc) 322 { 323 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 324 } 325 326 void 327 metaslab_class_histogram_verify(metaslab_class_t *mc) 328 { 329 spa_t *spa = mc->mc_spa; 330 vdev_t *rvd = spa->spa_root_vdev; 331 uint64_t *mc_hist; 332 int i; 333 334 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 335 return; 336 337 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 338 KM_SLEEP); 339 340 for (int c = 0; c < rvd->vdev_children; c++) { 341 vdev_t *tvd = rvd->vdev_child[c]; 342 metaslab_group_t *mg = tvd->vdev_mg; 343 344 /* 345 * Skip any holes, uninitialized top-levels, or 346 * vdevs that are not in this metalab class. 347 */ 348 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 349 mg->mg_class != mc) { 350 continue; 351 } 352 353 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 354 mc_hist[i] += mg->mg_histogram[i]; 355 } 356 357 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 358 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 359 360 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 361 } 362 363 /* 364 * Calculate the metaslab class's fragmentation metric. The metric 365 * is weighted based on the space contribution of each metaslab group. 366 * The return value will be a number between 0 and 100 (inclusive), or 367 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 368 * zfs_frag_table for more information about the metric. 369 */ 370 uint64_t 371 metaslab_class_fragmentation(metaslab_class_t *mc) 372 { 373 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 374 uint64_t fragmentation = 0; 375 376 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 377 378 for (int c = 0; c < rvd->vdev_children; c++) { 379 vdev_t *tvd = rvd->vdev_child[c]; 380 metaslab_group_t *mg = tvd->vdev_mg; 381 382 /* 383 * Skip any holes, uninitialized top-levels, 384 * or vdevs that are not in this metalab class. 385 */ 386 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 387 mg->mg_class != mc) { 388 continue; 389 } 390 391 /* 392 * If a metaslab group does not contain a fragmentation 393 * metric then just bail out. 
394 */ 395 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 396 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 397 return (ZFS_FRAG_INVALID); 398 } 399 400 /* 401 * Determine how much this metaslab_group is contributing 402 * to the overall pool fragmentation metric. 403 */ 404 fragmentation += mg->mg_fragmentation * 405 metaslab_group_get_space(mg); 406 } 407 fragmentation /= metaslab_class_get_space(mc); 408 409 ASSERT3U(fragmentation, <=, 100); 410 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 411 return (fragmentation); 412 } 413 414 /* 415 * Calculate the amount of expandable space that is available in 416 * this metaslab class. If a device is expanded then its expandable 417 * space will be the amount of allocatable space that is currently not 418 * part of this metaslab class. 419 */ 420 uint64_t 421 metaslab_class_expandable_space(metaslab_class_t *mc) 422 { 423 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 424 uint64_t space = 0; 425 426 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 427 for (int c = 0; c < rvd->vdev_children; c++) { 428 uint64_t tspace; 429 vdev_t *tvd = rvd->vdev_child[c]; 430 metaslab_group_t *mg = tvd->vdev_mg; 431 432 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 433 mg->mg_class != mc) { 434 continue; 435 } 436 437 /* 438 * Calculate if we have enough space to add additional 439 * metaslabs. We report the expandable space in terms 440 * of the metaslab size since that's the unit of expansion. 441 * Adjust by efi system partition size. 442 */ 443 tspace = tvd->vdev_max_asize - tvd->vdev_asize; 444 if (tspace > mc->mc_spa->spa_bootsize) { 445 tspace -= mc->mc_spa->spa_bootsize; 446 } 447 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); 448 } 449 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 450 return (space); 451 } 452 453 static int 454 metaslab_compare(const void *x1, const void *x2) 455 { 456 const metaslab_t *m1 = (const metaslab_t *)x1; 457 const metaslab_t *m2 = (const metaslab_t *)x2; 458 459 int sort1 = 0; 460 int sort2 = 0; 461 if (m1->ms_allocator != -1 && m1->ms_primary) 462 sort1 = 1; 463 else if (m1->ms_allocator != -1 && !m1->ms_primary) 464 sort1 = 2; 465 if (m2->ms_allocator != -1 && m2->ms_primary) 466 sort2 = 1; 467 else if (m2->ms_allocator != -1 && !m2->ms_primary) 468 sort2 = 2; 469 470 /* 471 * Sort inactive metaslabs first, then primaries, then secondaries. When 472 * selecting a metaslab to allocate from, an allocator first tries its 473 * primary, then secondary active metaslab. If it doesn't have active 474 * metaslabs, or can't allocate from them, it searches for an inactive 475 * metaslab to activate. If it can't find a suitable one, it will steal 476 * a primary or secondary metaslab from another allocator. 477 */ 478 if (sort1 < sort2) 479 return (-1); 480 if (sort1 > sort2) 481 return (1); 482 483 int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); 484 if (likely(cmp)) 485 return (cmp); 486 487 IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); 488 489 return (AVL_CMP(m1->ms_start, m2->ms_start)); 490 } 491 492 uint64_t 493 metaslab_allocated_space(metaslab_t *msp) 494 { 495 return (msp->ms_allocated_space); 496 } 497 498 /* 499 * Verify that the space accounting on disk matches the in-core range_trees. 
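 *
 * Concretely, for a loaded metaslab in syncing context the check below
 * boils down to verifying that
 *
 *      ms_size - metaslab_allocated_space(msp) ==
 *          range_tree_space(ms_allocatable) +
 *          space allocated in the TXG_CONCURRENT_STATES in-flight txgs +
 *          ms_deferspace + range_tree_space(ms_freed)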
 */
static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
        uint64_t allocating = 0;
        uint64_t sm_free_space, msp_free_space;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT(!msp->ms_condensing);

        if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
                return;

        /*
         * We can only verify the metaslab space when we're called
         * from syncing context with a loaded metaslab that has an
         * allocated space map. Calling this in non-syncing context
         * does not provide a consistent view of the metaslab since
         * we're performing allocations in the future.
         */
        if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
            !msp->ms_loaded)
                return;

        /*
         * Even though the smp_alloc field can get negative (e.g.
         * see vdev_checkpoint_sm), that should never be the case
         * when it comes to a metaslab's space map.
         */
        ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);

        sm_free_space = msp->ms_size - metaslab_allocated_space(msp);

        /*
         * Account for future allocations since we would have
         * already deducted that space from the ms_allocatable.
         */
        for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
                allocating +=
                    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
        }

        ASSERT3U(msp->ms_deferspace, ==,
            range_tree_space(msp->ms_defer[0]) +
            range_tree_space(msp->ms_defer[1]));

        msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
            msp->ms_deferspace + range_tree_space(msp->ms_freed);

        VERIFY3U(sm_free_space, ==, msp_free_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the group's free capacity
 * is greater than zfs_mg_noalloc_threshold and its fragmentation
 * metric (when valid) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
        vdev_t *vd = mg->mg_vd;
        metaslab_class_t *mc = mg->mg_class;
        vdev_stat_t *vs = &vd->vdev_stat;
        boolean_t was_allocatable;
        boolean_t was_initialized;

        ASSERT(vd == vd->vdev_top);
        ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
            SCL_ALLOC);

        mutex_enter(&mg->mg_lock);
        was_allocatable = mg->mg_allocatable;
        was_initialized = mg->mg_initialized;

        mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
            (vs->vs_space + 1);

        mutex_enter(&mc->mc_lock);

        /*
         * If the metaslab group was just added then it won't
         * have any space until we finish syncing out this txg.
         * At that point we will consider it initialized and available
         * for allocations. We also don't consider non-activated
         * metaslab groups (e.g. vdevs that are in the middle of being removed)
         * to be initialized, because they can't be used for allocation.
595 */ 596 mg->mg_initialized = metaslab_group_initialized(mg); 597 if (!was_initialized && mg->mg_initialized) { 598 mc->mc_groups++; 599 } else if (was_initialized && !mg->mg_initialized) { 600 ASSERT3U(mc->mc_groups, >, 0); 601 mc->mc_groups--; 602 } 603 if (mg->mg_initialized) 604 mg->mg_no_free_space = B_FALSE; 605 606 /* 607 * A metaslab group is considered allocatable if it has plenty 608 * of free space or is not heavily fragmented. We only take 609 * fragmentation into account if the metaslab group has a valid 610 * fragmentation metric (i.e. a value between 0 and 100). 611 */ 612 mg->mg_allocatable = (mg->mg_activation_count > 0 && 613 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 614 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 615 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 616 617 /* 618 * The mc_alloc_groups maintains a count of the number of 619 * groups in this metaslab class that are still above the 620 * zfs_mg_noalloc_threshold. This is used by the allocating 621 * threads to determine if they should avoid allocations to 622 * a given group. The allocator will avoid allocations to a group 623 * if that group has reached or is below the zfs_mg_noalloc_threshold 624 * and there are still other groups that are above the threshold. 625 * When a group transitions from allocatable to non-allocatable or 626 * vice versa we update the metaslab class to reflect that change. 627 * When the mc_alloc_groups value drops to 0 that means that all 628 * groups have reached the zfs_mg_noalloc_threshold making all groups 629 * eligible for allocations. This effectively means that all devices 630 * are balanced again. 631 */ 632 if (was_allocatable && !mg->mg_allocatable) 633 mc->mc_alloc_groups--; 634 else if (!was_allocatable && mg->mg_allocatable) 635 mc->mc_alloc_groups++; 636 mutex_exit(&mc->mc_lock); 637 638 mutex_exit(&mg->mg_lock); 639 } 640 641 metaslab_group_t * 642 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 643 { 644 metaslab_group_t *mg; 645 646 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 647 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 648 mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); 649 cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); 650 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 651 KM_SLEEP); 652 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 653 KM_SLEEP); 654 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 655 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 656 mg->mg_vd = vd; 657 mg->mg_class = mc; 658 mg->mg_activation_count = 0; 659 mg->mg_initialized = B_FALSE; 660 mg->mg_no_free_space = B_TRUE; 661 mg->mg_allocators = allocators; 662 663 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * 664 sizeof (zfs_refcount_t), KM_SLEEP); 665 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * 666 sizeof (uint64_t), KM_SLEEP); 667 for (int i = 0; i < allocators; i++) { 668 zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); 669 mg->mg_cur_max_alloc_queue_depth[i] = 0; 670 } 671 672 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 673 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 674 675 return (mg); 676 } 677 678 void 679 metaslab_group_destroy(metaslab_group_t *mg) 680 { 681 ASSERT(mg->mg_prev == NULL); 682 ASSERT(mg->mg_next == NULL); 683 /* 684 * We may have gone below zero with the activation count 685 * either because we never activated in the first place or 686 * 
because we're done, and possibly removing the vdev. 687 */ 688 ASSERT(mg->mg_activation_count <= 0); 689 690 taskq_destroy(mg->mg_taskq); 691 avl_destroy(&mg->mg_metaslab_tree); 692 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); 693 kmem_free(mg->mg_secondaries, mg->mg_allocators * 694 sizeof (metaslab_t *)); 695 mutex_destroy(&mg->mg_lock); 696 mutex_destroy(&mg->mg_ms_initialize_lock); 697 cv_destroy(&mg->mg_ms_initialize_cv); 698 699 for (int i = 0; i < mg->mg_allocators; i++) { 700 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); 701 mg->mg_cur_max_alloc_queue_depth[i] = 0; 702 } 703 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * 704 sizeof (zfs_refcount_t)); 705 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * 706 sizeof (uint64_t)); 707 708 kmem_free(mg, sizeof (metaslab_group_t)); 709 } 710 711 void 712 metaslab_group_activate(metaslab_group_t *mg) 713 { 714 metaslab_class_t *mc = mg->mg_class; 715 metaslab_group_t *mgprev, *mgnext; 716 717 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 718 719 ASSERT(mc->mc_rotor != mg); 720 ASSERT(mg->mg_prev == NULL); 721 ASSERT(mg->mg_next == NULL); 722 ASSERT(mg->mg_activation_count <= 0); 723 724 if (++mg->mg_activation_count <= 0) 725 return; 726 727 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 728 metaslab_group_alloc_update(mg); 729 730 if ((mgprev = mc->mc_rotor) == NULL) { 731 mg->mg_prev = mg; 732 mg->mg_next = mg; 733 } else { 734 mgnext = mgprev->mg_next; 735 mg->mg_prev = mgprev; 736 mg->mg_next = mgnext; 737 mgprev->mg_next = mg; 738 mgnext->mg_prev = mg; 739 } 740 mc->mc_rotor = mg; 741 } 742 743 /* 744 * Passivate a metaslab group and remove it from the allocation rotor. 745 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 746 * a metaslab group. This function will momentarily drop spa_config_locks 747 * that are lower than the SCL_ALLOC lock (see comment below). 748 */ 749 void 750 metaslab_group_passivate(metaslab_group_t *mg) 751 { 752 metaslab_class_t *mc = mg->mg_class; 753 spa_t *spa = mc->mc_spa; 754 metaslab_group_t *mgprev, *mgnext; 755 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 756 757 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 758 (SCL_ALLOC | SCL_ZIO)); 759 760 if (--mg->mg_activation_count != 0) { 761 ASSERT(mc->mc_rotor != mg); 762 ASSERT(mg->mg_prev == NULL); 763 ASSERT(mg->mg_next == NULL); 764 ASSERT(mg->mg_activation_count < 0); 765 return; 766 } 767 768 /* 769 * The spa_config_lock is an array of rwlocks, ordered as 770 * follows (from highest to lowest): 771 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 772 * SCL_ZIO > SCL_FREE > SCL_VDEV 773 * (For more information about the spa_config_lock see spa_misc.c) 774 * The higher the lock, the broader its coverage. When we passivate 775 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 776 * config locks. However, the metaslab group's taskq might be trying 777 * to preload metaslabs so we must drop the SCL_ZIO lock and any 778 * lower locks to allow the I/O to complete. At a minimum, 779 * we continue to hold the SCL_ALLOC lock, which prevents any future 780 * allocations from taking place and any changes to the vdev tree. 
781 */ 782 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 783 taskq_wait(mg->mg_taskq); 784 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 785 metaslab_group_alloc_update(mg); 786 for (int i = 0; i < mg->mg_allocators; i++) { 787 metaslab_t *msp = mg->mg_primaries[i]; 788 if (msp != NULL) { 789 mutex_enter(&msp->ms_lock); 790 metaslab_passivate(msp, 791 metaslab_weight_from_range_tree(msp)); 792 mutex_exit(&msp->ms_lock); 793 } 794 msp = mg->mg_secondaries[i]; 795 if (msp != NULL) { 796 mutex_enter(&msp->ms_lock); 797 metaslab_passivate(msp, 798 metaslab_weight_from_range_tree(msp)); 799 mutex_exit(&msp->ms_lock); 800 } 801 } 802 803 mgprev = mg->mg_prev; 804 mgnext = mg->mg_next; 805 806 if (mg == mgnext) { 807 mc->mc_rotor = NULL; 808 } else { 809 mc->mc_rotor = mgnext; 810 mgprev->mg_next = mgnext; 811 mgnext->mg_prev = mgprev; 812 } 813 814 mg->mg_prev = NULL; 815 mg->mg_next = NULL; 816 } 817 818 boolean_t 819 metaslab_group_initialized(metaslab_group_t *mg) 820 { 821 vdev_t *vd = mg->mg_vd; 822 vdev_stat_t *vs = &vd->vdev_stat; 823 824 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 825 } 826 827 uint64_t 828 metaslab_group_get_space(metaslab_group_t *mg) 829 { 830 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 831 } 832 833 void 834 metaslab_group_histogram_verify(metaslab_group_t *mg) 835 { 836 uint64_t *mg_hist; 837 vdev_t *vd = mg->mg_vd; 838 uint64_t ashift = vd->vdev_ashift; 839 int i; 840 841 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 842 return; 843 844 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 845 KM_SLEEP); 846 847 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 848 SPACE_MAP_HISTOGRAM_SIZE + ashift); 849 850 for (int m = 0; m < vd->vdev_ms_count; m++) { 851 metaslab_t *msp = vd->vdev_ms[m]; 852 ASSERT(msp != NULL); 853 854 /* skip if not active or not a member */ 855 if (msp->ms_sm == NULL || msp->ms_group != mg) 856 continue; 857 858 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 859 mg_hist[i + ashift] += 860 msp->ms_sm->sm_phys->smp_histogram[i]; 861 } 862 863 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 864 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 865 866 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 867 } 868 869 static void 870 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 871 { 872 metaslab_class_t *mc = mg->mg_class; 873 uint64_t ashift = mg->mg_vd->vdev_ashift; 874 875 ASSERT(MUTEX_HELD(&msp->ms_lock)); 876 if (msp->ms_sm == NULL) 877 return; 878 879 mutex_enter(&mg->mg_lock); 880 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 881 mg->mg_histogram[i + ashift] += 882 msp->ms_sm->sm_phys->smp_histogram[i]; 883 mc->mc_histogram[i + ashift] += 884 msp->ms_sm->sm_phys->smp_histogram[i]; 885 } 886 mutex_exit(&mg->mg_lock); 887 } 888 889 void 890 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 891 { 892 metaslab_class_t *mc = mg->mg_class; 893 uint64_t ashift = mg->mg_vd->vdev_ashift; 894 895 ASSERT(MUTEX_HELD(&msp->ms_lock)); 896 if (msp->ms_sm == NULL) 897 return; 898 899 mutex_enter(&mg->mg_lock); 900 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 901 ASSERT3U(mg->mg_histogram[i + ashift], >=, 902 msp->ms_sm->sm_phys->smp_histogram[i]); 903 ASSERT3U(mc->mc_histogram[i + ashift], >=, 904 msp->ms_sm->sm_phys->smp_histogram[i]); 905 906 mg->mg_histogram[i + ashift] -= 907 msp->ms_sm->sm_phys->smp_histogram[i]; 908 mc->mc_histogram[i + ashift] -= 909 msp->ms_sm->sm_phys->smp_histogram[i]; 910 } 911 
mutex_exit(&mg->mg_lock); 912 } 913 914 static void 915 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 916 { 917 ASSERT(msp->ms_group == NULL); 918 mutex_enter(&mg->mg_lock); 919 msp->ms_group = mg; 920 msp->ms_weight = 0; 921 avl_add(&mg->mg_metaslab_tree, msp); 922 mutex_exit(&mg->mg_lock); 923 924 mutex_enter(&msp->ms_lock); 925 metaslab_group_histogram_add(mg, msp); 926 mutex_exit(&msp->ms_lock); 927 } 928 929 static void 930 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 931 { 932 mutex_enter(&msp->ms_lock); 933 metaslab_group_histogram_remove(mg, msp); 934 mutex_exit(&msp->ms_lock); 935 936 mutex_enter(&mg->mg_lock); 937 ASSERT(msp->ms_group == mg); 938 avl_remove(&mg->mg_metaslab_tree, msp); 939 msp->ms_group = NULL; 940 mutex_exit(&mg->mg_lock); 941 } 942 943 static void 944 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 945 { 946 ASSERT(MUTEX_HELD(&mg->mg_lock)); 947 ASSERT(msp->ms_group == mg); 948 avl_remove(&mg->mg_metaslab_tree, msp); 949 msp->ms_weight = weight; 950 avl_add(&mg->mg_metaslab_tree, msp); 951 952 } 953 954 static void 955 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 956 { 957 /* 958 * Although in principle the weight can be any value, in 959 * practice we do not use values in the range [1, 511]. 960 */ 961 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 962 ASSERT(MUTEX_HELD(&msp->ms_lock)); 963 964 mutex_enter(&mg->mg_lock); 965 metaslab_group_sort_impl(mg, msp, weight); 966 mutex_exit(&mg->mg_lock); 967 } 968 969 /* 970 * Calculate the fragmentation for a given metaslab group. We can use 971 * a simple average here since all metaslabs within the group must have 972 * the same size. The return value will be a value between 0 and 100 973 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 974 * group have a fragmentation metric. 975 */ 976 uint64_t 977 metaslab_group_fragmentation(metaslab_group_t *mg) 978 { 979 vdev_t *vd = mg->mg_vd; 980 uint64_t fragmentation = 0; 981 uint64_t valid_ms = 0; 982 983 for (int m = 0; m < vd->vdev_ms_count; m++) { 984 metaslab_t *msp = vd->vdev_ms[m]; 985 986 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 987 continue; 988 if (msp->ms_group != mg) 989 continue; 990 991 valid_ms++; 992 fragmentation += msp->ms_fragmentation; 993 } 994 995 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) 996 return (ZFS_FRAG_INVALID); 997 998 fragmentation /= valid_ms; 999 ASSERT3U(fragmentation, <=, 100); 1000 return (fragmentation); 1001 } 1002 1003 /* 1004 * Determine if a given metaslab group should skip allocations. A metaslab 1005 * group should avoid allocations if its free capacity is less than the 1006 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 1007 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1008 * that can still handle allocations. If the allocation throttle is enabled 1009 * then we skip allocations to devices that have reached their maximum 1010 * allocation queue depth unless the selected metaslab group is the only 1011 * eligible group remaining. 1012 */ 1013 static boolean_t 1014 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 1015 uint64_t psize, int allocator) 1016 { 1017 spa_t *spa = mg->mg_vd->vdev_spa; 1018 metaslab_class_t *mc = mg->mg_class; 1019 1020 /* 1021 * We can only consider skipping this metaslab group if it's 1022 * in the normal metaslab class and there are other metaslab 1023 * groups to select from. 
         * Otherwise, we always consider it eligible
         * for allocations.
         */
        if ((mc != spa_normal_class(spa) &&
            mc != spa_special_class(spa) &&
            mc != spa_dedup_class(spa)) ||
            mc->mc_groups <= 1)
                return (B_TRUE);

        /*
         * If the metaslab group's mg_allocatable flag is set (see comments
         * in metaslab_group_alloc_update() for more information) and
         * the allocation throttle is disabled then allow allocations to this
         * device. However, if the allocation throttle is enabled then
         * check if we have reached our allocation limit (mg_alloc_queue_depth)
         * to determine if we should allow allocations to this metaslab group.
         * If all metaslab groups are no longer considered allocatable
         * (mc_alloc_groups == 0) or we're trying to allocate the smallest
         * gang block size then we allow allocations on this metaslab group
         * regardless of the mg_allocatable or throttle settings.
         */
        if (mg->mg_allocatable) {
                metaslab_group_t *mgp;
                int64_t qdepth;
                uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

                if (!mc->mc_alloc_throttle_enabled)
                        return (B_TRUE);

                /*
                 * If this metaslab group does not have any free space, then
                 * there is no point in looking further.
                 */
                if (mg->mg_no_free_space)
                        return (B_FALSE);

                qdepth = zfs_refcount_count(
                    &mg->mg_alloc_queue_depth[allocator]);

                /*
                 * If this metaslab group is below its qmax or it's
                 * the only allocatable metaslab group, then attempt
                 * to allocate from it.
                 */
                if (qdepth < qmax || mc->mc_alloc_groups == 1)
                        return (B_TRUE);
                ASSERT3U(mc->mc_alloc_groups, >, 1);

                /*
                 * Since this metaslab group is at or over its qmax, we
                 * need to determine if there are metaslab groups after this
                 * one that might be able to handle this allocation. This is
                 * racy since we can't hold the locks for all metaslab
                 * groups at the same time when we make this check.
                 */
                for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
                        qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];

                        qdepth = zfs_refcount_count(
                            &mgp->mg_alloc_queue_depth[allocator]);

                        /*
                         * If there is another metaslab group that
                         * might be able to handle the allocation, then
                         * we return false so that we skip this group.
                         */
                        if (qdepth < qmax && !mgp->mg_no_free_space)
                                return (B_FALSE);
                }

                /*
                 * We didn't find another group to handle the allocation
                 * so we can't skip this metaslab group even though
                 * we are at or over our qmax.
                 */
                return (B_TRUE);

        } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
                return (B_TRUE);
        }
        return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
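 *
 * For example, an 8K segment at offset 5GB sorts before a 128K segment
 * at offset 1GB, because size is compared first; segments of equal size
 * are ordered by rs_start, which keeps the ordering total and lets
 * avl_last() return the largest free segment.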
1115 */ 1116 static int 1117 metaslab_rangesize_compare(const void *x1, const void *x2) 1118 { 1119 const range_seg_t *r1 = x1; 1120 const range_seg_t *r2 = x2; 1121 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1122 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1123 1124 int cmp = AVL_CMP(rs_size1, rs_size2); 1125 if (likely(cmp)) 1126 return (cmp); 1127 1128 return (AVL_CMP(r1->rs_start, r2->rs_start)); 1129 } 1130 1131 /* 1132 * ========================================================================== 1133 * Common allocator routines 1134 * ========================================================================== 1135 */ 1136 1137 /* 1138 * Return the maximum contiguous segment within the metaslab. 1139 */ 1140 uint64_t 1141 metaslab_block_maxsize(metaslab_t *msp) 1142 { 1143 avl_tree_t *t = &msp->ms_allocatable_by_size; 1144 range_seg_t *rs; 1145 1146 if (t == NULL || (rs = avl_last(t)) == NULL) 1147 return (0ULL); 1148 1149 return (rs->rs_end - rs->rs_start); 1150 } 1151 1152 static range_seg_t * 1153 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) 1154 { 1155 range_seg_t *rs, rsearch; 1156 avl_index_t where; 1157 1158 rsearch.rs_start = start; 1159 rsearch.rs_end = start + size; 1160 1161 rs = avl_find(t, &rsearch, &where); 1162 if (rs == NULL) { 1163 rs = avl_nearest(t, where, AVL_AFTER); 1164 } 1165 1166 return (rs); 1167 } 1168 1169 /* 1170 * This is a helper function that can be used by the allocator to find 1171 * a suitable block to allocate. This will search the specified AVL 1172 * tree looking for a block that matches the specified criteria. 1173 */ 1174 static uint64_t 1175 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1176 uint64_t align) 1177 { 1178 range_seg_t *rs = metaslab_block_find(t, *cursor, size); 1179 1180 while (rs != NULL) { 1181 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1182 1183 if (offset + size <= rs->rs_end) { 1184 *cursor = offset + size; 1185 return (offset); 1186 } 1187 rs = AVL_NEXT(t, rs); 1188 } 1189 1190 /* 1191 * If we know we've searched the whole map (*cursor == 0), give up. 1192 * Otherwise, reset the cursor to the beginning and try again. 1193 */ 1194 if (*cursor == 0) 1195 return (-1ULL); 1196 1197 *cursor = 0; 1198 return (metaslab_block_picker(t, cursor, size, align)); 1199 } 1200 1201 /* 1202 * ========================================================================== 1203 * The first-fit block allocator 1204 * ========================================================================== 1205 */ 1206 static uint64_t 1207 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1208 { 1209 /* 1210 * Find the largest power of 2 block size that evenly divides the 1211 * requested size. This is used to try to allocate blocks with similar 1212 * alignment from the same area of the metaslab (i.e. same cursor 1213 * bucket) but it does not guarantee that other allocations sizes 1214 * may exist in the same region. 1215 */ 1216 uint64_t align = size & -size; 1217 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1218 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1219 1220 return (metaslab_block_picker(t, cursor, size, align)); 1221 } 1222 1223 static metaslab_ops_t metaslab_ff_ops = { 1224 metaslab_ff_alloc 1225 }; 1226 1227 /* 1228 * ========================================================================== 1229 * Dynamic block allocator - 1230 * Uses the first fit allocation scheme until space get low and then 1231 * adjusts to a best fit allocation method. 
Uses metaslab_df_alloc_threshold 1232 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1233 * ========================================================================== 1234 */ 1235 static uint64_t 1236 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1237 { 1238 /* 1239 * Find the largest power of 2 block size that evenly divides the 1240 * requested size. This is used to try to allocate blocks with similar 1241 * alignment from the same area of the metaslab (i.e. same cursor 1242 * bucket) but it does not guarantee that other allocations sizes 1243 * may exist in the same region. 1244 */ 1245 uint64_t align = size & -size; 1246 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1247 range_tree_t *rt = msp->ms_allocatable; 1248 avl_tree_t *t = &rt->rt_root; 1249 uint64_t max_size = metaslab_block_maxsize(msp); 1250 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1251 1252 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1253 ASSERT3U(avl_numnodes(t), ==, 1254 avl_numnodes(&msp->ms_allocatable_by_size)); 1255 1256 if (max_size < size) 1257 return (-1ULL); 1258 1259 /* 1260 * If we're running low on space switch to using the size 1261 * sorted AVL tree (best-fit). 1262 */ 1263 if (max_size < metaslab_df_alloc_threshold || 1264 free_pct < metaslab_df_free_pct) { 1265 t = &msp->ms_allocatable_by_size; 1266 *cursor = 0; 1267 } 1268 1269 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1270 } 1271 1272 static metaslab_ops_t metaslab_df_ops = { 1273 metaslab_df_alloc 1274 }; 1275 1276 /* 1277 * ========================================================================== 1278 * Cursor fit block allocator - 1279 * Select the largest region in the metaslab, set the cursor to the beginning 1280 * of the range and the cursor_end to the end of the range. As allocations 1281 * are made advance the cursor. Continue allocating from the cursor until 1282 * the range is exhausted and then find a new range. 1283 * ========================================================================== 1284 */ 1285 static uint64_t 1286 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1287 { 1288 range_tree_t *rt = msp->ms_allocatable; 1289 avl_tree_t *t = &msp->ms_allocatable_by_size; 1290 uint64_t *cursor = &msp->ms_lbas[0]; 1291 uint64_t *cursor_end = &msp->ms_lbas[1]; 1292 uint64_t offset = 0; 1293 1294 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1295 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1296 1297 ASSERT3U(*cursor_end, >=, *cursor); 1298 1299 if ((*cursor + size) > *cursor_end) { 1300 range_seg_t *rs; 1301 1302 rs = avl_last(&msp->ms_allocatable_by_size); 1303 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1304 return (-1ULL); 1305 1306 *cursor = rs->rs_start; 1307 *cursor_end = rs->rs_end; 1308 } 1309 1310 offset = *cursor; 1311 *cursor += size; 1312 1313 return (offset); 1314 } 1315 1316 static metaslab_ops_t metaslab_cf_ops = { 1317 metaslab_cf_alloc 1318 }; 1319 1320 /* 1321 * ========================================================================== 1322 * New dynamic fit allocator - 1323 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1324 * contiguous blocks. If no region is found then just use the largest segment 1325 * that remains. 1326 * ========================================================================== 1327 */ 1328 1329 /* 1330 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1331 * to request from the allocator. 
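 *
 * For example, with the default metaslab_ndf_clump_shift of 4, an 8K
 * allocation (highbit64(8192) == 14) makes the fallback search in
 * metaslab_ndf_alloc() look for a free segment of roughly
 * MIN(max_size, 1ULL << (14 + 4)) = 256K.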
1332 */ 1333 uint64_t metaslab_ndf_clump_shift = 4; 1334 1335 static uint64_t 1336 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1337 { 1338 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1339 avl_index_t where; 1340 range_seg_t *rs, rsearch; 1341 uint64_t hbit = highbit64(size); 1342 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1343 uint64_t max_size = metaslab_block_maxsize(msp); 1344 1345 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1346 ASSERT3U(avl_numnodes(t), ==, 1347 avl_numnodes(&msp->ms_allocatable_by_size)); 1348 1349 if (max_size < size) 1350 return (-1ULL); 1351 1352 rsearch.rs_start = *cursor; 1353 rsearch.rs_end = *cursor + size; 1354 1355 rs = avl_find(t, &rsearch, &where); 1356 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1357 t = &msp->ms_allocatable_by_size; 1358 1359 rsearch.rs_start = 0; 1360 rsearch.rs_end = MIN(max_size, 1361 1ULL << (hbit + metaslab_ndf_clump_shift)); 1362 rs = avl_find(t, &rsearch, &where); 1363 if (rs == NULL) 1364 rs = avl_nearest(t, where, AVL_AFTER); 1365 ASSERT(rs != NULL); 1366 } 1367 1368 if ((rs->rs_end - rs->rs_start) >= size) { 1369 *cursor = rs->rs_start + size; 1370 return (rs->rs_start); 1371 } 1372 return (-1ULL); 1373 } 1374 1375 static metaslab_ops_t metaslab_ndf_ops = { 1376 metaslab_ndf_alloc 1377 }; 1378 1379 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1380 1381 /* 1382 * ========================================================================== 1383 * Metaslabs 1384 * ========================================================================== 1385 */ 1386 1387 static void 1388 metaslab_aux_histograms_clear(metaslab_t *msp) 1389 { 1390 /* 1391 * Auxiliary histograms are only cleared when resetting them, 1392 * which can only happen while the metaslab is loaded. 1393 */ 1394 ASSERT(msp->ms_loaded); 1395 1396 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1397 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1398 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); 1399 } 1400 1401 static void 1402 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, 1403 range_tree_t *rt) 1404 { 1405 /* 1406 * This is modeled after space_map_histogram_add(), so refer to that 1407 * function for implementation details. We want this to work like 1408 * the space map histogram, and not the range tree histogram, as we 1409 * are essentially constructing a delta that will be later subtracted 1410 * from the space map histogram. 1411 */ 1412 int idx = 0; 1413 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 1414 ASSERT3U(i, >=, idx + shift); 1415 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); 1416 1417 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { 1418 ASSERT3U(idx + shift, ==, i); 1419 idx++; 1420 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); 1421 } 1422 } 1423 } 1424 1425 /* 1426 * Called at every sync pass that the metaslab gets synced. 1427 * 1428 * The reason is that we want our auxiliary histograms to be updated 1429 * wherever the metaslab's space map histogram is updated. This way 1430 * we stay consistent on which parts of the metaslab space map's 1431 * histogram are currently not available for allocations (e.g because 1432 * they are in the defer, freed, and freeing trees). 1433 */ 1434 static void 1435 metaslab_aux_histograms_update(metaslab_t *msp) 1436 { 1437 space_map_t *sm = msp->ms_sm; 1438 ASSERT(sm != NULL); 1439 1440 /* 1441 * This is similar to the metaslab's space map histogram updates 1442 * that take place in metaslab_sync(). 
The only difference is that 1443 * we only care about segments that haven't made it into the 1444 * ms_allocatable tree yet. 1445 */ 1446 if (msp->ms_loaded) { 1447 metaslab_aux_histograms_clear(msp); 1448 1449 metaslab_aux_histogram_add(msp->ms_synchist, 1450 sm->sm_shift, msp->ms_freed); 1451 1452 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1453 metaslab_aux_histogram_add(msp->ms_deferhist[t], 1454 sm->sm_shift, msp->ms_defer[t]); 1455 } 1456 } 1457 1458 metaslab_aux_histogram_add(msp->ms_synchist, 1459 sm->sm_shift, msp->ms_freeing); 1460 } 1461 1462 /* 1463 * Called every time we are done syncing (writing to) the metaslab, 1464 * i.e. at the end of each sync pass. 1465 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] 1466 */ 1467 static void 1468 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) 1469 { 1470 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1471 space_map_t *sm = msp->ms_sm; 1472 1473 if (sm == NULL) { 1474 /* 1475 * We came here from metaslab_init() when creating/opening a 1476 * pool, looking at a metaslab that hasn't had any allocations 1477 * yet. 1478 */ 1479 return; 1480 } 1481 1482 /* 1483 * This is similar to the actions that we take for the ms_freed 1484 * and ms_defer trees in metaslab_sync_done(). 1485 */ 1486 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; 1487 if (defer_allowed) { 1488 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], 1489 sizeof (msp->ms_synchist)); 1490 } else { 1491 bzero(msp->ms_deferhist[hist_index], 1492 sizeof (msp->ms_deferhist[hist_index])); 1493 } 1494 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1495 } 1496 1497 /* 1498 * Ensure that the metaslab's weight and fragmentation are consistent 1499 * with the contents of the histogram (either the range tree's histogram 1500 * or the space map's depending whether the metaslab is loaded). 1501 */ 1502 static void 1503 metaslab_verify_weight_and_frag(metaslab_t *msp) 1504 { 1505 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1506 1507 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1508 return; 1509 1510 /* see comment in metaslab_verify_unflushed_changes() */ 1511 if (msp->ms_group == NULL) 1512 return; 1513 1514 /* 1515 * Devices being removed always return a weight of 0 and leave 1516 * fragmentation and ms_max_size as is - there is nothing for 1517 * us to verify here. 1518 */ 1519 vdev_t *vd = msp->ms_group->mg_vd; 1520 if (vd->vdev_removing) 1521 return; 1522 1523 /* 1524 * If the metaslab is dirty it probably means that we've done 1525 * some allocations or frees that have changed our histograms 1526 * and thus the weight. 1527 */ 1528 for (int t = 0; t < TXG_SIZE; t++) { 1529 if (txg_list_member(&vd->vdev_ms_list, msp, t)) 1530 return; 1531 } 1532 1533 /* 1534 * This verification checks that our in-memory state is consistent 1535 * with what's on disk. If the pool is read-only then there aren't 1536 * any changes and we just have the initially-loaded state. 
1537 */ 1538 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) 1539 return; 1540 1541 /* some extra verification for in-core tree if you can */ 1542 if (msp->ms_loaded) { 1543 range_tree_stat_verify(msp->ms_allocatable); 1544 VERIFY(space_map_histogram_verify(msp->ms_sm, 1545 msp->ms_allocatable)); 1546 } 1547 1548 uint64_t weight = msp->ms_weight; 1549 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1550 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); 1551 uint64_t frag = msp->ms_fragmentation; 1552 uint64_t max_segsize = msp->ms_max_size; 1553 1554 msp->ms_weight = 0; 1555 msp->ms_fragmentation = 0; 1556 msp->ms_max_size = 0; 1557 1558 /* 1559 * This function is used for verification purposes. Regardless of 1560 * whether metaslab_weight() thinks this metaslab should be active or 1561 * not, we want to ensure that the actual weight (and therefore the 1562 * value of ms_weight) would be the same if it was to be recalculated 1563 * at this point. 1564 */ 1565 msp->ms_weight = metaslab_weight(msp) | was_active; 1566 1567 VERIFY3U(max_segsize, ==, msp->ms_max_size); 1568 1569 /* 1570 * If the weight type changed then there is no point in doing 1571 * verification. Revert fields to their original values. 1572 */ 1573 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || 1574 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { 1575 msp->ms_fragmentation = frag; 1576 msp->ms_weight = weight; 1577 return; 1578 } 1579 1580 VERIFY3U(msp->ms_fragmentation, ==, frag); 1581 VERIFY3U(msp->ms_weight, ==, weight); 1582 } 1583 1584 /* 1585 * Wait for any in-progress metaslab loads to complete. 1586 */ 1587 static void 1588 metaslab_load_wait(metaslab_t *msp) 1589 { 1590 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1591 1592 while (msp->ms_loading) { 1593 ASSERT(!msp->ms_loaded); 1594 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1595 } 1596 } 1597 1598 static int 1599 metaslab_load_impl(metaslab_t *msp) 1600 { 1601 int error = 0; 1602 1603 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1604 ASSERT(msp->ms_loading); 1605 ASSERT(!msp->ms_condensing); 1606 1607 /* 1608 * We temporarily drop the lock to unblock other operations while we 1609 * are reading the space map. Therefore, metaslab_sync() and 1610 * metaslab_sync_done() can run at the same time as we do. 1611 * 1612 * metaslab_sync() can append to the space map while we are loading. 1613 * Therefore we load only entries that existed when we started the 1614 * load. Additionally, metaslab_sync_done() has to wait for the load 1615 * to complete because there are potential races like metaslab_load() 1616 * loading parts of the space map that are currently being appended 1617 * by metaslab_sync(). If we didn't, the ms_allocatable would have 1618 * entries that metaslab_sync_done() would try to re-add later. 1619 * 1620 * That's why before dropping the lock we remember the synced length 1621 * of the metaslab and read up to that point of the space map, 1622 * ignoring entries appended by metaslab_sync() that happen after we 1623 * drop the lock. 1624 */ 1625 uint64_t length = msp->ms_synced_length; 1626 mutex_exit(&msp->ms_lock); 1627 1628 if (msp->ms_sm != NULL) { 1629 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, 1630 SM_FREE, length); 1631 } else { 1632 /* 1633 * The space map has not been allocated yet, so treat 1634 * all the space in the metaslab as free and add it to the 1635 * ms_allocatable tree. 
                 */
                range_tree_add(msp->ms_allocatable,
                    msp->ms_start, msp->ms_size);
        }

        /*
         * We need to grab the ms_sync_lock to prevent metaslab_sync() from
         * changing the ms_sm and the metaslab's range trees while we are
         * about to use them and populate the ms_allocatable. The ms_lock
         * is insufficient for this because metaslab_sync() doesn't hold
         * the ms_lock while writing the ms_checkpointing tree to disk.
         */
        mutex_enter(&msp->ms_sync_lock);
        mutex_enter(&msp->ms_lock);
        ASSERT(!msp->ms_condensing);

        if (error != 0) {
                mutex_exit(&msp->ms_sync_lock);
                return (error);
        }

        ASSERT3P(msp->ms_group, !=, NULL);
        msp->ms_loaded = B_TRUE;

        /*
         * The ms_allocatable contains the segments that exist in the
         * ms_defer trees [see ms_synced_length]. Thus we need to remove
         * them from ms_allocatable as they will be added again in
         * metaslab_sync_done().
         */
        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                range_tree_walk(msp->ms_defer[t],
                    range_tree_remove, msp->ms_allocatable);
        }

        /*
         * Call metaslab_recalculate_weight_and_sort() now that the
         * metaslab is loaded so we get the metaslab's real weight.
         *
         * Unless this metaslab was created with older software and
         * has not yet been converted to use segment-based weight, we
         * expect the new weight to be better or equal to the weight
         * that the metaslab had while it was not loaded. This is
         * because the old weight does not take into account the
         * consolidation of adjacent segments between TXGs. [see
         * comment for ms_synchist and ms_deferhist[] for more info]
         */
        uint64_t weight = msp->ms_weight;
        metaslab_recalculate_weight_and_sort(msp);
        if (!WEIGHT_IS_SPACEBASED(weight))
                ASSERT3U(weight, <=, msp->ms_weight);
        msp->ms_max_size = metaslab_block_maxsize(msp);

        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
        metaslab_verify_space(msp, spa_syncing_txg(spa));
        mutex_exit(&msp->ms_sync_lock);

        return (0);
}

int
metaslab_load(metaslab_t *msp)
{
        ASSERT(MUTEX_HELD(&msp->ms_lock));

        /*
         * There may be another thread loading the same metaslab; if that's
         * the case, just wait until the other thread is done and return.
         */
        metaslab_load_wait(msp);
        if (msp->ms_loaded)
                return (0);
        VERIFY(!msp->ms_loading);
        ASSERT(!msp->ms_condensing);

        msp->ms_loading = B_TRUE;
        int error = metaslab_load_impl(msp);
        msp->ms_loading = B_FALSE;
        cv_broadcast(&msp->ms_load_cv);

        return (error);
}

void
metaslab_unload(metaslab_t *msp)
{
        ASSERT(MUTEX_HELD(&msp->ms_lock));

        metaslab_verify_weight_and_frag(msp);

        range_tree_vacate(msp->ms_allocatable, NULL, NULL);
        msp->ms_loaded = B_FALSE;

        msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
        msp->ms_max_size = 0;

        /*
         * We explicitly recalculate the metaslab's weight based on its space
         * map (as it is now not loaded). We want unloaded metaslabs to always
         * have their weights calculated from the space map histograms, while
         * loaded ones have it calculated from their in-core range tree
         * [see metaslab_load()].
This way, the weight reflects the information 1738 * available in-core, whether it is loaded or not 1739 * 1740 * If ms_group == NULL means that we came here from metaslab_fini(), 1741 * at which point it doesn't make sense for us to do the recalculation 1742 * and the sorting. 1743 */ 1744 if (msp->ms_group != NULL) 1745 metaslab_recalculate_weight_and_sort(msp); 1746 } 1747 1748 static void 1749 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 1750 int64_t defer_delta, int64_t space_delta) 1751 { 1752 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 1753 1754 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 1755 ASSERT(vd->vdev_ms_count != 0); 1756 1757 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 1758 vdev_deflated_space(vd, space_delta)); 1759 } 1760 1761 int 1762 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1763 metaslab_t **msp) 1764 { 1765 vdev_t *vd = mg->mg_vd; 1766 spa_t *spa = vd->vdev_spa; 1767 objset_t *mos = spa->spa_meta_objset; 1768 metaslab_t *ms; 1769 int error; 1770 1771 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1772 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1773 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1774 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1775 1776 ms->ms_id = id; 1777 ms->ms_start = id << vd->vdev_ms_shift; 1778 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1779 ms->ms_allocator = -1; 1780 ms->ms_new = B_TRUE; 1781 1782 /* 1783 * We only open space map objects that already exist. All others 1784 * will be opened when we finally allocate an object for it. 1785 * 1786 * Note: 1787 * When called from vdev_expand(), we can't call into the DMU as 1788 * we are holding the spa_config_lock as a writer and we would 1789 * deadlock [see relevant comment in vdev_metaslab_init()]. in 1790 * that case, the object parameter is zero though, so we won't 1791 * call into the DMU. 1792 */ 1793 if (object != 0) { 1794 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1795 ms->ms_size, vd->vdev_ashift); 1796 1797 if (error != 0) { 1798 kmem_free(ms, sizeof (metaslab_t)); 1799 return (error); 1800 } 1801 1802 ASSERT(ms->ms_sm != NULL); 1803 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); 1804 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 1805 } 1806 1807 /* 1808 * We create the ms_allocatable here, but we don't create the 1809 * other range trees until metaslab_sync_done(). This serves 1810 * two purposes: it allows metaslab_sync_done() to detect the 1811 * addition of new space; and for debugging, it ensures that 1812 * we'd data fault on any attempt to use this metaslab before 1813 * it's ready. 1814 */ 1815 ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, 1816 &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); 1817 metaslab_group_add(mg, ms); 1818 1819 metaslab_set_fragmentation(ms); 1820 1821 /* 1822 * If we're opening an existing pool (txg == 0) or creating 1823 * a new one (txg == TXG_INITIAL), all space is available now. 1824 * If we're adding space to an existing pool, the new space 1825 * does not become available until after this txg has synced. 1826 * The metaslab's weight will also be initialized when we sync 1827 * out this txg. This ensures that we don't attempt to allocate 1828 * from it before we have initialized it completely. 
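 * (For example, a metaslab added by a vdev expansion mid-life is only
 * accounted for, and only becomes eligible for allocation, once its
 * creation txg has synced, whereas metaslabs set up at pool open or
 * create time are accounted for right away through the
 * metaslab_sync_done(ms, 0) call below.)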
1829 */ 1830 if (txg <= TXG_INITIAL) { 1831 metaslab_sync_done(ms, 0); 1832 metaslab_space_update(vd, mg->mg_class, 1833 metaslab_allocated_space(ms), 0, 0); 1834 } 1835 1836 /* 1837 * If metaslab_debug_load is set and we're initializing a metaslab 1838 * that has an allocated space map object then load the space map 1839 * so that we can verify frees. 1840 */ 1841 if (metaslab_debug_load && ms->ms_sm != NULL) { 1842 mutex_enter(&ms->ms_lock); 1843 VERIFY0(metaslab_load(ms)); 1844 mutex_exit(&ms->ms_lock); 1845 } 1846 1847 if (txg != 0) { 1848 vdev_dirty(vd, 0, NULL, txg); 1849 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1850 } 1851 1852 *msp = ms; 1853 1854 return (0); 1855 } 1856 1857 void 1858 metaslab_fini(metaslab_t *msp) 1859 { 1860 metaslab_group_t *mg = msp->ms_group; 1861 vdev_t *vd = mg->mg_vd; 1862 1863 metaslab_group_remove(mg, msp); 1864 1865 mutex_enter(&msp->ms_lock); 1866 VERIFY(msp->ms_group == NULL); 1867 metaslab_space_update(vd, mg->mg_class, 1868 -metaslab_allocated_space(msp), 0, -msp->ms_size); 1869 1870 space_map_close(msp->ms_sm); 1871 1872 metaslab_unload(msp); 1873 1874 range_tree_destroy(msp->ms_allocatable); 1875 range_tree_destroy(msp->ms_freeing); 1876 range_tree_destroy(msp->ms_freed); 1877 1878 for (int t = 0; t < TXG_SIZE; t++) { 1879 range_tree_destroy(msp->ms_allocating[t]); 1880 } 1881 1882 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1883 range_tree_destroy(msp->ms_defer[t]); 1884 } 1885 ASSERT0(msp->ms_deferspace); 1886 1887 range_tree_destroy(msp->ms_checkpointing); 1888 1889 for (int t = 0; t < TXG_SIZE; t++) 1890 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 1891 1892 mutex_exit(&msp->ms_lock); 1893 cv_destroy(&msp->ms_load_cv); 1894 mutex_destroy(&msp->ms_lock); 1895 mutex_destroy(&msp->ms_sync_lock); 1896 ASSERT3U(msp->ms_allocator, ==, -1); 1897 1898 kmem_free(msp, sizeof (metaslab_t)); 1899 } 1900 1901 #define FRAGMENTATION_TABLE_SIZE 17 1902 1903 /* 1904 * This table defines a segment size based fragmentation metric that will 1905 * allow each metaslab to derive its own fragmentation value. This is done 1906 * by calculating the space in each bucket of the spacemap histogram and 1907 * multiplying that by the fragmentation metric in this table. Doing 1908 * this for all buckets and dividing it by the total amount of free 1909 * space in this metaslab (i.e. the total free space in all buckets) gives 1910 * us the fragmentation metric. This means that a high fragmentation metric 1911 * equates to most of the free space being comprised of small segments. 1912 * Conversely, if the metric is low, then most of the free space is in 1913 * large segments. A 10% change in fragmentation equates to approximately 1914 * double the number of segments. 1915 * 1916 * This table defines 0% fragmented space using 16MB segments. Testing has 1917 * shown that segments that are greater than or equal to 16MB do not suffer 1918 * from drastic performance problems. Using this value, we derive the rest 1919 * of the table. Since the fragmentation value is never stored on disk, it 1920 * is possible to change these calculations in the future. 
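 *
 * A worked example with hypothetical numbers: if half of a metaslab's
 * free space sits in 8K segments (factor 90 below) and the other half
 * in 1M segments (factor 20 below), its fragmentation metric is
 * (90 + 20) / 2 = 55%.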
1921 */ 1922 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1923 100, /* 512B */ 1924 100, /* 1K */ 1925 98, /* 2K */ 1926 95, /* 4K */ 1927 90, /* 8K */ 1928 80, /* 16K */ 1929 70, /* 32K */ 1930 60, /* 64K */ 1931 50, /* 128K */ 1932 40, /* 256K */ 1933 30, /* 512K */ 1934 20, /* 1M */ 1935 15, /* 2M */ 1936 10, /* 4M */ 1937 5, /* 8M */ 1938 0 /* 16M */ 1939 }; 1940 1941 /* 1942 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 1943 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 1944 * been upgraded and does not support this metric. Otherwise, the return 1945 * value should be in the range [0, 100]. 1946 */ 1947 static void 1948 metaslab_set_fragmentation(metaslab_t *msp) 1949 { 1950 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1951 uint64_t fragmentation = 0; 1952 uint64_t total = 0; 1953 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1954 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1955 1956 if (!feature_enabled) { 1957 msp->ms_fragmentation = ZFS_FRAG_INVALID; 1958 return; 1959 } 1960 1961 /* 1962 * A null space map means that the entire metaslab is free 1963 * and thus is not fragmented. 1964 */ 1965 if (msp->ms_sm == NULL) { 1966 msp->ms_fragmentation = 0; 1967 return; 1968 } 1969 1970 /* 1971 * If this metaslab's space map has not been upgraded, flag it 1972 * so that we upgrade next time we encounter it. 1973 */ 1974 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1975 uint64_t txg = spa_syncing_txg(spa); 1976 vdev_t *vd = msp->ms_group->mg_vd; 1977 1978 /* 1979 * If we've reached the final dirty txg, then we must 1980 * be shutting down the pool. We don't want to dirty 1981 * any data past this point so skip setting the condense 1982 * flag. We can retry this action the next time the pool 1983 * is imported. 1984 */ 1985 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 1986 msp->ms_condense_wanted = B_TRUE; 1987 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1988 zfs_dbgmsg("txg %llu, requesting force condense: " 1989 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 1990 vd->vdev_id); 1991 } 1992 msp->ms_fragmentation = ZFS_FRAG_INVALID; 1993 return; 1994 } 1995 1996 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1997 uint64_t space = 0; 1998 uint8_t shift = msp->ms_sm->sm_shift; 1999 2000 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2001 FRAGMENTATION_TABLE_SIZE - 1); 2002 2003 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2004 continue; 2005 2006 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2007 total += space; 2008 2009 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2010 fragmentation += space * zfs_frag_table[idx]; 2011 } 2012 2013 if (total > 0) 2014 fragmentation /= total; 2015 ASSERT3U(fragmentation, <=, 100); 2016 2017 msp->ms_fragmentation = fragmentation; 2018 } 2019 2020 /* 2021 * Compute a weight -- a selection preference value -- for the given metaslab. 2022 * This is based on the amount of free space, the level of fragmentation, 2023 * the LBA range, and whether the metaslab is loaded. 2024 */ 2025 static uint64_t 2026 metaslab_space_weight(metaslab_t *msp) 2027 { 2028 metaslab_group_t *mg = msp->ms_group; 2029 vdev_t *vd = mg->mg_vd; 2030 uint64_t weight, space; 2031 2032 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2033 ASSERT(!vd->vdev_removing); 2034 2035 /* 2036 * The baseline weight is the metaslab's free space. 
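 * As a rough illustration of the adjustments below: a metaslab that is
 * 50% fragmented has this baseline scaled to space * 51 / 100, and on a
 * rotational vdev with LBA weighting enabled the outermost metaslab
 * (ms_id 0) then receives the full 2x boost.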
2037 */ 2038 space = msp->ms_size - metaslab_allocated_space(msp); 2039 2040 if (metaslab_fragmentation_factor_enabled && 2041 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 2042 /* 2043 * Use the fragmentation information to inversely scale 2044 * down the baseline weight. We need to ensure that we 2045 * don't exclude this metaslab completely when it's 100% 2046 * fragmented. To avoid this we reduce the fragmented value 2047 * by 1. 2048 */ 2049 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 2050 2051 /* 2052 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 2053 * this metaslab again. The fragmentation metric may have 2054 * decreased the space to something smaller than 2055 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 2056 * so that we can consume any remaining space. 2057 */ 2058 if (space > 0 && space < SPA_MINBLOCKSIZE) 2059 space = SPA_MINBLOCKSIZE; 2060 } 2061 weight = space; 2062 2063 /* 2064 * Modern disks have uniform bit density and constant angular velocity. 2065 * Therefore, the outer recording zones are faster (higher bandwidth) 2066 * than the inner zones by the ratio of outer to inner track diameter, 2067 * which is typically around 2:1. We account for this by assigning 2068 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 2069 * In effect, this means that we'll select the metaslab with the most 2070 * free bandwidth rather than simply the one with the most free space. 2071 */ 2072 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 2073 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 2074 ASSERT(weight >= space && weight <= 2 * space); 2075 } 2076 2077 /* 2078 * If this metaslab is one we're actively using, adjust its 2079 * weight to make it preferable to any inactive metaslab so 2080 * we'll polish it off. If the fragmentation on this metaslab 2081 * has exceed our threshold, then don't mark it active. 2082 */ 2083 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 2084 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 2085 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 2086 } 2087 2088 WEIGHT_SET_SPACEBASED(weight); 2089 return (weight); 2090 } 2091 2092 /* 2093 * Return the weight of the specified metaslab, according to the segment-based 2094 * weighting algorithm. The metaslab must be loaded. This function can 2095 * be called within a sync pass since it relies only on the metaslab's 2096 * range tree which is always accurate when the metaslab is loaded. 2097 */ 2098 static uint64_t 2099 metaslab_weight_from_range_tree(metaslab_t *msp) 2100 { 2101 uint64_t weight = 0; 2102 uint32_t segments = 0; 2103 2104 ASSERT(msp->ms_loaded); 2105 2106 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 2107 i--) { 2108 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 2109 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2110 2111 segments <<= 1; 2112 segments += msp->ms_allocatable->rt_histogram[i]; 2113 2114 /* 2115 * The range tree provides more precision than the space map 2116 * and must be downgraded so that all values fit within the 2117 * space map's histogram. This allows us to compare loaded 2118 * vs. unloaded metaslabs to determine which metaslab is 2119 * considered "best". 
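 * Concretely, the running "segments <<= 1" below folds buckets above
 * max_idx into lower ones: a count found in bucket i is worth
 * 2^(i - max_idx) segments by the time the loop reaches max_idx, and
 * the first bucket at or below max_idx with a non-zero running count
 * is the one whose index gets encoded into the weight.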
2120 */ 2121 if (i > max_idx) 2122 continue; 2123 2124 if (segments != 0) { 2125 WEIGHT_SET_COUNT(weight, segments); 2126 WEIGHT_SET_INDEX(weight, i); 2127 WEIGHT_SET_ACTIVE(weight, 0); 2128 break; 2129 } 2130 } 2131 return (weight); 2132 } 2133 2134 /* 2135 * Calculate the weight based on the on-disk histogram. This should only 2136 * be called after a sync pass has completely finished since the on-disk 2137 * information is updated in metaslab_sync(). 2138 */ 2139 static uint64_t 2140 metaslab_weight_from_spacemap(metaslab_t *msp) 2141 { 2142 space_map_t *sm = msp->ms_sm; 2143 ASSERT(!msp->ms_loaded); 2144 ASSERT(sm != NULL); 2145 ASSERT3U(space_map_object(sm), !=, 0); 2146 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2147 2148 /* 2149 * Create a joint histogram from all the segments that have made 2150 * it to the metaslab's space map histogram, that are not yet 2151 * available for allocation because they are still in the freeing 2152 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 2153 * these segments from the space map's histogram to get a more 2154 * accurate weight. 2155 */ 2156 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 2157 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 2158 deferspace_histogram[i] += msp->ms_synchist[i]; 2159 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2160 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2161 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 2162 } 2163 } 2164 2165 uint64_t weight = 0; 2166 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 2167 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 2168 deferspace_histogram[i]); 2169 uint64_t count = 2170 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 2171 if (count != 0) { 2172 WEIGHT_SET_COUNT(weight, count); 2173 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 2174 WEIGHT_SET_ACTIVE(weight, 0); 2175 break; 2176 } 2177 } 2178 return (weight); 2179 } 2180 2181 /* 2182 * Compute a segment-based weight for the specified metaslab. The weight 2183 * is determined by highest bucket in the histogram. The information 2184 * for the highest bucket is encoded into the weight value. 2185 */ 2186 static uint64_t 2187 metaslab_segment_weight(metaslab_t *msp) 2188 { 2189 metaslab_group_t *mg = msp->ms_group; 2190 uint64_t weight = 0; 2191 uint8_t shift = mg->mg_vd->vdev_ashift; 2192 2193 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2194 2195 /* 2196 * The metaslab is completely free. 2197 */ 2198 if (metaslab_allocated_space(msp) == 0) { 2199 int idx = highbit64(msp->ms_size) - 1; 2200 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2201 2202 if (idx < max_idx) { 2203 WEIGHT_SET_COUNT(weight, 1ULL); 2204 WEIGHT_SET_INDEX(weight, idx); 2205 } else { 2206 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 2207 WEIGHT_SET_INDEX(weight, max_idx); 2208 } 2209 WEIGHT_SET_ACTIVE(weight, 0); 2210 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 2211 2212 return (weight); 2213 } 2214 2215 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2216 2217 /* 2218 * If the metaslab is fully allocated then just make the weight 0. 2219 */ 2220 if (metaslab_allocated_space(msp) == msp->ms_size) 2221 return (0); 2222 /* 2223 * If the metaslab is already loaded, then use the range tree to 2224 * determine the weight. Otherwise, we rely on the space map information 2225 * to generate the weight. 
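 * Either way the result encodes only the highest non-empty bucket; for
 * example, a weight with index 17 and count 3 means the largest free
 * segments fall in the [128K, 256K) range and there are three of them.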
2226 */ 2227 if (msp->ms_loaded) { 2228 weight = metaslab_weight_from_range_tree(msp); 2229 } else { 2230 weight = metaslab_weight_from_spacemap(msp); 2231 } 2232 2233 /* 2234 * If the metaslab was active the last time we calculated its weight 2235 * then keep it active. We want to consume the entire region that 2236 * is associated with this weight. 2237 */ 2238 if (msp->ms_activation_weight != 0 && weight != 0) 2239 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 2240 return (weight); 2241 } 2242 2243 /* 2244 * Determine if we should attempt to allocate from this metaslab. If the 2245 * metaslab has a maximum size then we can quickly determine if the desired 2246 * allocation size can be satisfied. Otherwise, if we're using segment-based 2247 * weighting then we can determine the maximum allocation that this metaslab 2248 * can accommodate based on the index encoded in the weight. If we're using 2249 * space-based weights then rely on the entire weight (excluding the weight 2250 * type bit). 2251 */ 2252 boolean_t 2253 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 2254 { 2255 boolean_t should_allocate; 2256 2257 if (msp->ms_max_size != 0) 2258 return (msp->ms_max_size >= asize); 2259 2260 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2261 /* 2262 * The metaslab segment weight indicates segments in the 2263 * range [2^i, 2^(i+1)), where i is the index in the weight. 2264 * Since the asize might be in the middle of the range, we 2265 * should attempt the allocation if asize < 2^(i+1). 2266 */ 2267 should_allocate = (asize < 2268 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2269 } else { 2270 should_allocate = (asize <= 2271 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2272 } 2273 return (should_allocate); 2274 } 2275 2276 static uint64_t 2277 metaslab_weight(metaslab_t *msp) 2278 { 2279 vdev_t *vd = msp->ms_group->mg_vd; 2280 spa_t *spa = vd->vdev_spa; 2281 uint64_t weight; 2282 2283 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2284 2285 /* 2286 * If this vdev is in the process of being removed, there is nothing 2287 * for us to do here. 2288 */ 2289 if (vd->vdev_removing) 2290 return (0); 2291 2292 metaslab_set_fragmentation(msp); 2293 2294 /* 2295 * Update the maximum size if the metaslab is loaded. This will 2296 * ensure that we get an accurate maximum size if newly freed space 2297 * has been added back into the free tree. 2298 */ 2299 if (msp->ms_loaded) 2300 msp->ms_max_size = metaslab_block_maxsize(msp); 2301 else 2302 ASSERT0(msp->ms_max_size); 2303 2304 /* 2305 * Segment-based weighting requires space map histogram support. 2306 */ 2307 if (zfs_metaslab_segment_weight_enabled && 2308 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2309 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2310 sizeof (space_map_phys_t))) { 2311 weight = metaslab_segment_weight(msp); 2312 } else { 2313 weight = metaslab_space_weight(msp); 2314 } 2315 return (weight); 2316 } 2317 2318 void 2319 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 2320 { 2321 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 2322 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2323 metaslab_group_sort(msp->ms_group, msp, 2324 metaslab_weight(msp) | was_active); 2325 } 2326 2327 static int 2328 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2329 int allocator, uint64_t activation_weight) 2330 { 2331 /* 2332 * If we're activating for the claim code, we don't want to actually 2333 * set the metaslab up for a specific allocator. 
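 * In that case we simply return below and never register the metaslab
 * in mg_primaries[] or mg_secondaries[].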
2334 */ 2335 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2336 return (0); 2337 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2338 mg->mg_primaries : mg->mg_secondaries); 2339 2340 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2341 mutex_enter(&mg->mg_lock); 2342 if (arr[allocator] != NULL) { 2343 mutex_exit(&mg->mg_lock); 2344 return (EEXIST); 2345 } 2346 2347 arr[allocator] = msp; 2348 ASSERT3S(msp->ms_allocator, ==, -1); 2349 msp->ms_allocator = allocator; 2350 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2351 mutex_exit(&mg->mg_lock); 2352 2353 return (0); 2354 } 2355 2356 static int 2357 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2358 { 2359 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2360 2361 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2362 int error = metaslab_load(msp); 2363 if (error != 0) { 2364 metaslab_group_sort(msp->ms_group, msp, 0); 2365 return (error); 2366 } 2367 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2368 /* 2369 * The metaslab was activated for another allocator 2370 * while we were waiting, we should reselect. 2371 */ 2372 return (EBUSY); 2373 } 2374 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2375 allocator, activation_weight)) != 0) { 2376 return (error); 2377 } 2378 2379 msp->ms_activation_weight = msp->ms_weight; 2380 metaslab_group_sort(msp->ms_group, msp, 2381 msp->ms_weight | activation_weight); 2382 } 2383 ASSERT(msp->ms_loaded); 2384 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2385 2386 return (0); 2387 } 2388 2389 static void 2390 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2391 uint64_t weight) 2392 { 2393 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2394 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2395 metaslab_group_sort(mg, msp, weight); 2396 return; 2397 } 2398 2399 mutex_enter(&mg->mg_lock); 2400 ASSERT3P(msp->ms_group, ==, mg); 2401 if (msp->ms_primary) { 2402 ASSERT3U(0, <=, msp->ms_allocator); 2403 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2404 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2405 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2406 mg->mg_primaries[msp->ms_allocator] = NULL; 2407 } else { 2408 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2409 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2410 mg->mg_secondaries[msp->ms_allocator] = NULL; 2411 } 2412 msp->ms_allocator = -1; 2413 metaslab_group_sort_impl(mg, msp, weight); 2414 mutex_exit(&mg->mg_lock); 2415 } 2416 2417 static void 2418 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2419 { 2420 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2421 2422 /* 2423 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2424 * this metaslab again. In that case, it had better be empty, 2425 * or we would be leaving space on the table. 2426 */ 2427 ASSERT(size >= SPA_MINBLOCKSIZE || 2428 range_tree_is_empty(msp->ms_allocatable)); 2429 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2430 2431 msp->ms_activation_weight = 0; 2432 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2433 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2434 } 2435 2436 /* 2437 * Segment-based metaslabs are activated once and remain active until 2438 * we either fail an allocation attempt (similar to space-based metaslabs) 2439 * or have exhausted the free space in zfs_metaslab_switch_threshold 2440 * buckets since the metaslab was activated. 
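 * (For instance, with a switch threshold of 2, a metaslab activated
 * when its largest free segments were in the 1M bucket is passivated
 * once they have shrunk to the 256K bucket or below.)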
This function checks to see 2441 * if we've exhaused the zfs_metaslab_switch_threshold buckets in the 2442 * metaslab and passivates it proactively. This will allow us to select a 2443 * metaslabs with larger contiguous region if any remaining within this 2444 * metaslab group. If we're in sync pass > 1, then we continue using this 2445 * metaslab so that we don't dirty more block and cause more sync passes. 2446 */ 2447 void 2448 metaslab_segment_may_passivate(metaslab_t *msp) 2449 { 2450 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2451 2452 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2453 return; 2454 2455 /* 2456 * Since we are in the middle of a sync pass, the most accurate 2457 * information that is accessible to us is the in-core range tree 2458 * histogram; calculate the new weight based on that information. 2459 */ 2460 uint64_t weight = metaslab_weight_from_range_tree(msp); 2461 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2462 int current_idx = WEIGHT_GET_INDEX(weight); 2463 2464 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2465 metaslab_passivate(msp, weight); 2466 } 2467 2468 static void 2469 metaslab_preload(void *arg) 2470 { 2471 metaslab_t *msp = arg; 2472 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2473 2474 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2475 2476 mutex_enter(&msp->ms_lock); 2477 (void) metaslab_load(msp); 2478 msp->ms_selected_txg = spa_syncing_txg(spa); 2479 mutex_exit(&msp->ms_lock); 2480 } 2481 2482 static void 2483 metaslab_group_preload(metaslab_group_t *mg) 2484 { 2485 spa_t *spa = mg->mg_vd->vdev_spa; 2486 metaslab_t *msp; 2487 avl_tree_t *t = &mg->mg_metaslab_tree; 2488 int m = 0; 2489 2490 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2491 taskq_wait(mg->mg_taskq); 2492 return; 2493 } 2494 2495 mutex_enter(&mg->mg_lock); 2496 2497 /* 2498 * Load the next potential metaslabs 2499 */ 2500 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2501 ASSERT3P(msp->ms_group, ==, mg); 2502 2503 /* 2504 * We preload only the maximum number of metaslabs specified 2505 * by metaslab_preload_limit. If a metaslab is being forced 2506 * to condense then we preload it too. This will ensure 2507 * that force condensing happens in the next txg. 2508 */ 2509 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2510 continue; 2511 } 2512 2513 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2514 msp, TQ_SLEEP) != TASKQID_INVALID); 2515 } 2516 mutex_exit(&mg->mg_lock); 2517 } 2518 2519 /* 2520 * Determine if the space map's on-disk footprint is past our tolerance 2521 * for inefficiency. We would like to use the following criteria to make 2522 * our decision: 2523 * 2524 * 1. The size of the space map object should not dramatically increase as a 2525 * result of writing out the free space range tree. 2526 * 2527 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2528 * times the size than the free space range tree representation 2529 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2530 * 2531 * 3. The on-disk size of the space map should actually decrease. 2532 * 2533 * Unfortunately, we cannot compute the on-disk size of the space map in this 2534 * context because we cannot accurately compute the effects of compression, etc. 
2535 * Instead, we apply the heuristic described in the block comment for 2536 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2537 * is greater than a threshold number of blocks. 2538 */ 2539 static boolean_t 2540 metaslab_should_condense(metaslab_t *msp) 2541 { 2542 space_map_t *sm = msp->ms_sm; 2543 vdev_t *vd = msp->ms_group->mg_vd; 2544 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2545 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2546 2547 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2548 ASSERT(msp->ms_loaded); 2549 2550 /* 2551 * Allocations and frees in early passes are generally more space 2552 * efficient (in terms of blocks described in space map entries) 2553 * than the ones in later passes (e.g. we don't compress after 2554 * sync pass 5) and condensing a metaslab multiple times in a txg 2555 * could degrade performance. 2556 * 2557 * Thus we prefer condensing each metaslab at most once every txg at 2558 * the earliest sync pass possible. If a metaslab is eligible for 2559 * condensing again after being considered for condensing within the 2560 * same txg, it will hopefully be dirty in the next txg where it will 2561 * be condensed at an earlier pass. 2562 */ 2563 if (msp->ms_condense_checked_txg == current_txg) 2564 return (B_FALSE); 2565 msp->ms_condense_checked_txg = current_txg; 2566 2567 /* 2568 * We always condense metaslabs that are empty and metaslabs for 2569 * which a condense request has been made. 2570 */ 2571 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2572 msp->ms_condense_wanted) 2573 return (B_TRUE); 2574 2575 uint64_t object_size = space_map_length(msp->ms_sm); 2576 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2577 msp->ms_allocatable, SM_NO_VDEVID); 2578 2579 dmu_object_info_t doi; 2580 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2581 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2582 2583 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2584 object_size > zfs_metaslab_condense_block_threshold * record_size); 2585 } 2586 2587 /* 2588 * Condense the on-disk space map representation to its minimized form. 2589 * The minimized form consists of a small number of allocations followed by 2590 * the entries of the free range tree. 2591 */ 2592 static void 2593 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2594 { 2595 range_tree_t *condense_tree; 2596 space_map_t *sm = msp->ms_sm; 2597 2598 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2599 ASSERT(msp->ms_loaded); 2600 2601 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2602 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2603 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2604 msp->ms_group->mg_vd->vdev_spa->spa_name, 2605 space_map_length(msp->ms_sm), 2606 avl_numnodes(&msp->ms_allocatable->rt_root), 2607 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2608 2609 msp->ms_condense_wanted = B_FALSE; 2610 2611 /* 2612 * Create an range tree that is 100% allocated. We remove segments 2613 * that have been freed in this txg, any deferred frees that exist, 2614 * and any allocation in the future. Removing segments should be 2615 * a relatively inexpensive operation since we expect these trees to 2616 * have a small number of nodes. 
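 *
 * In outline (a sketch of the code below, not new behavior):
 *
 *	condense_tree  = [ms_start, ms_start + ms_size)
 *	condense_tree -= ms_freeing, ms_freed
 *	condense_tree -= ms_defer[0 .. TXG_DEFER_SIZE)
 *	condense_tree -= ms_allocating[txg + 1 .. txg + TXG_CONCURRENT_STATES)
 *
 * The new space map contents are then condense_tree written as SM_ALLOC
 * entries followed by ms_allocatable written as SM_FREE entries.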
2617 */ 2618 condense_tree = range_tree_create(NULL, NULL); 2619 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2620 2621 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2622 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2623 2624 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2625 range_tree_walk(msp->ms_defer[t], 2626 range_tree_remove, condense_tree); 2627 } 2628 2629 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2630 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2631 range_tree_remove, condense_tree); 2632 } 2633 2634 /* 2635 * We're about to drop the metaslab's lock thus allowing 2636 * other consumers to change it's content. Set the 2637 * metaslab's ms_condensing flag to ensure that 2638 * allocations on this metaslab do not occur while we're 2639 * in the middle of committing it to disk. This is only critical 2640 * for ms_allocatable as all other range trees use per txg 2641 * views of their content. 2642 */ 2643 msp->ms_condensing = B_TRUE; 2644 2645 mutex_exit(&msp->ms_lock); 2646 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2647 2648 /* 2649 * While we would ideally like to create a space map representation 2650 * that consists only of allocation records, doing so can be 2651 * prohibitively expensive because the in-core free tree can be 2652 * large, and therefore computationally expensive to subtract 2653 * from the condense_tree. Instead we sync out two trees, a cheap 2654 * allocation only tree followed by the in-core free tree. While not 2655 * optimal, this is typically close to optimal, and much cheaper to 2656 * compute. 2657 */ 2658 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2659 range_tree_vacate(condense_tree, NULL, NULL); 2660 range_tree_destroy(condense_tree); 2661 2662 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2663 mutex_enter(&msp->ms_lock); 2664 msp->ms_condensing = B_FALSE; 2665 } 2666 2667 /* 2668 * Write a metaslab to disk in the context of the specified transaction group. 2669 */ 2670 void 2671 metaslab_sync(metaslab_t *msp, uint64_t txg) 2672 { 2673 metaslab_group_t *mg = msp->ms_group; 2674 vdev_t *vd = mg->mg_vd; 2675 spa_t *spa = vd->vdev_spa; 2676 objset_t *mos = spa_meta_objset(spa); 2677 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2678 dmu_tx_t *tx; 2679 uint64_t object = space_map_object(msp->ms_sm); 2680 2681 ASSERT(!vd->vdev_ishole); 2682 2683 /* 2684 * This metaslab has just been added so there's no work to do now. 2685 */ 2686 if (msp->ms_freeing == NULL) { 2687 ASSERT3P(alloctree, ==, NULL); 2688 return; 2689 } 2690 2691 ASSERT3P(alloctree, !=, NULL); 2692 ASSERT3P(msp->ms_freeing, !=, NULL); 2693 ASSERT3P(msp->ms_freed, !=, NULL); 2694 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2695 2696 /* 2697 * Normally, we don't want to process a metaslab if there are no 2698 * allocations or frees to perform. However, if the metaslab is being 2699 * forced to condense and it's loaded, we need to let it through. 2700 */ 2701 if (range_tree_is_empty(alloctree) && 2702 range_tree_is_empty(msp->ms_freeing) && 2703 range_tree_is_empty(msp->ms_checkpointing) && 2704 !(msp->ms_loaded && msp->ms_condense_wanted)) 2705 return; 2706 2707 2708 VERIFY(txg <= spa_final_dirty_txg(spa)); 2709 2710 /* 2711 * The only state that can actually be changing concurrently 2712 * with metaslab_sync() is the metaslab's ms_allocatable. No 2713 * other thread can be modifying this txg's alloc, freeing, 2714 * freed, or space_map_phys_t. 
We drop ms_lock whenever we 2715 * could call into the DMU, because the DMU can call down to 2716 * us (e.g. via zio_free()) at any time. 2717 * 2718 * The spa_vdev_remove_thread() can be reading metaslab state 2719 * concurrently, and it is locked out by the ms_sync_lock. 2720 * Note that the ms_lock is insufficient for this, because it 2721 * is dropped by space_map_write(). 2722 */ 2723 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2724 2725 if (msp->ms_sm == NULL) { 2726 uint64_t new_object; 2727 2728 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2729 VERIFY3U(new_object, !=, 0); 2730 2731 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2732 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2733 2734 ASSERT(msp->ms_sm != NULL); 2735 ASSERT0(metaslab_allocated_space(msp)); 2736 } 2737 2738 if (!range_tree_is_empty(msp->ms_checkpointing) && 2739 vd->vdev_checkpoint_sm == NULL) { 2740 ASSERT(spa_has_checkpoint(spa)); 2741 2742 uint64_t new_object = space_map_alloc(mos, 2743 vdev_standard_sm_blksz, tx); 2744 VERIFY3U(new_object, !=, 0); 2745 2746 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2747 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2748 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2749 2750 /* 2751 * We save the space map object as an entry in vdev_top_zap 2752 * so it can be retrieved when the pool is reopened after an 2753 * export or through zdb. 2754 */ 2755 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2756 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2757 sizeof (new_object), 1, &new_object, tx)); 2758 } 2759 2760 mutex_enter(&msp->ms_sync_lock); 2761 mutex_enter(&msp->ms_lock); 2762 2763 /* 2764 * Note: metaslab_condense() clears the space map's histogram. 2765 * Therefore we must verify and remove this histogram before 2766 * condensing. 2767 */ 2768 metaslab_group_histogram_verify(mg); 2769 metaslab_class_histogram_verify(mg->mg_class); 2770 metaslab_group_histogram_remove(mg, msp); 2771 2772 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2773 metaslab_condense(msp, txg, tx); 2774 } else { 2775 mutex_exit(&msp->ms_lock); 2776 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2777 SM_NO_VDEVID, tx); 2778 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2779 SM_NO_VDEVID, tx); 2780 mutex_enter(&msp->ms_lock); 2781 } 2782 2783 msp->ms_allocated_space += range_tree_space(alloctree); 2784 ASSERT3U(msp->ms_allocated_space, >=, 2785 range_tree_space(msp->ms_freeing)); 2786 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 2787 2788 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2789 ASSERT(spa_has_checkpoint(spa)); 2790 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2791 2792 /* 2793 * Since we are doing writes to disk and the ms_checkpointing 2794 * tree won't be changing during that time, we drop the 2795 * ms_lock while writing to the checkpoint space map. 2796 */ 2797 mutex_exit(&msp->ms_lock); 2798 space_map_write(vd->vdev_checkpoint_sm, 2799 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2800 mutex_enter(&msp->ms_lock); 2801 2802 spa->spa_checkpoint_info.sci_dspace += 2803 range_tree_space(msp->ms_checkpointing); 2804 vd->vdev_stat.vs_checkpoint_space += 2805 range_tree_space(msp->ms_checkpointing); 2806 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2807 -space_map_allocated(vd->vdev_checkpoint_sm)); 2808 2809 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2810 } 2811 2812 if (msp->ms_loaded) { 2813 /* 2814 * When the space map is loaded, we have an accurate 2815 * histogram in the range tree. 
This gives us an opportunity 2816 * to bring the space map's histogram up-to-date so we clear 2817 * it first before updating it. 2818 */ 2819 space_map_histogram_clear(msp->ms_sm); 2820 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2821 2822 /* 2823 * Since we've cleared the histogram we need to add back 2824 * any free space that has already been processed, plus 2825 * any deferred space. This allows the on-disk histogram 2826 * to accurately reflect all free space even if some space 2827 * is not yet available for allocation (i.e. deferred). 2828 */ 2829 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2830 2831 /* 2832 * Add back any deferred free space that has not been 2833 * added back into the in-core free tree yet. This will 2834 * ensure that we don't end up with a space map histogram 2835 * that is completely empty unless the metaslab is fully 2836 * allocated. 2837 */ 2838 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2839 space_map_histogram_add(msp->ms_sm, 2840 msp->ms_defer[t], tx); 2841 } 2842 } 2843 2844 /* 2845 * Always add the free space from this sync pass to the space 2846 * map histogram. We want to make sure that the on-disk histogram 2847 * accounts for all free space. If the space map is not loaded, 2848 * then we will lose some accuracy but will correct it the next 2849 * time we load the space map. 2850 */ 2851 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2852 metaslab_aux_histograms_update(msp); 2853 2854 metaslab_group_histogram_add(mg, msp); 2855 metaslab_group_histogram_verify(mg); 2856 metaslab_class_histogram_verify(mg->mg_class); 2857 2858 /* 2859 * For sync pass 1, we avoid traversing this txg's free range tree 2860 * and instead will just swap the pointers for freeing and freed. 2861 * We can safely do this since the freed_tree is guaranteed to be 2862 * empty on the initial pass. 2863 */ 2864 if (spa_sync_pass(spa) == 1) { 2865 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2866 ASSERT0(msp->ms_allocated_this_txg); 2867 } else { 2868 range_tree_vacate(msp->ms_freeing, 2869 range_tree_add, msp->ms_freed); 2870 } 2871 msp->ms_allocated_this_txg += range_tree_space(alloctree); 2872 range_tree_vacate(alloctree, NULL, NULL); 2873 2874 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2875 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2876 & TXG_MASK])); 2877 ASSERT0(range_tree_space(msp->ms_freeing)); 2878 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2879 2880 mutex_exit(&msp->ms_lock); 2881 2882 if (object != space_map_object(msp->ms_sm)) { 2883 object = space_map_object(msp->ms_sm); 2884 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2885 msp->ms_id, sizeof (uint64_t), &object, tx); 2886 } 2887 mutex_exit(&msp->ms_sync_lock); 2888 dmu_tx_commit(tx); 2889 } 2890 2891 /* 2892 * Called after a transaction group has completely synced to mark 2893 * all of the metaslab's free space as usable. 2894 */ 2895 void 2896 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2897 { 2898 metaslab_group_t *mg = msp->ms_group; 2899 vdev_t *vd = mg->mg_vd; 2900 spa_t *spa = vd->vdev_spa; 2901 range_tree_t **defer_tree; 2902 int64_t alloc_delta, defer_delta; 2903 boolean_t defer_allowed = B_TRUE; 2904 2905 ASSERT(!vd->vdev_ishole); 2906 2907 mutex_enter(&msp->ms_lock); 2908 2909 /* 2910 * If this metaslab is just becoming available, initialize its 2911 * range trees and add its capacity to the vdev. 
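 * This is the deferred half of metaslab_init(): only ms_allocatable is
 * created at init time, while ms_allocating[], ms_freeing, ms_freed,
 * ms_defer[] and ms_checkpointing are created here, in syncing context.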
2912 */ 2913 if (msp->ms_freed == NULL) { 2914 for (int t = 0; t < TXG_SIZE; t++) { 2915 ASSERT(msp->ms_allocating[t] == NULL); 2916 2917 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2918 } 2919 2920 ASSERT3P(msp->ms_freeing, ==, NULL); 2921 msp->ms_freeing = range_tree_create(NULL, NULL); 2922 2923 ASSERT3P(msp->ms_freed, ==, NULL); 2924 msp->ms_freed = range_tree_create(NULL, NULL); 2925 2926 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2927 ASSERT(msp->ms_defer[t] == NULL); 2928 2929 msp->ms_defer[t] = range_tree_create(NULL, NULL); 2930 } 2931 2932 ASSERT3P(msp->ms_checkpointing, ==, NULL); 2933 msp->ms_checkpointing = range_tree_create(NULL, NULL); 2934 2935 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 2936 } 2937 ASSERT0(range_tree_space(msp->ms_freeing)); 2938 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2939 2940 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2941 2942 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2943 metaslab_class_get_alloc(spa_normal_class(spa)); 2944 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2945 defer_allowed = B_FALSE; 2946 } 2947 2948 defer_delta = 0; 2949 alloc_delta = msp->ms_allocated_this_txg - 2950 range_tree_space(msp->ms_freed); 2951 if (defer_allowed) { 2952 defer_delta = range_tree_space(msp->ms_freed) - 2953 range_tree_space(*defer_tree); 2954 } else { 2955 defer_delta -= range_tree_space(*defer_tree); 2956 } 2957 2958 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 2959 defer_delta, 0); 2960 2961 /* 2962 * If there's a metaslab_load() in progress, wait for it to complete 2963 * so that we have a consistent view of the in-core space map. 2964 */ 2965 metaslab_load_wait(msp); 2966 2967 /* 2968 * Move the frees from the defer_tree back to the free 2969 * range tree (if it's loaded). Swap the freed_tree and 2970 * the defer_tree -- this is safe to do because we've 2971 * just emptied out the defer_tree. 2972 */ 2973 range_tree_vacate(*defer_tree, 2974 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 2975 if (defer_allowed) { 2976 range_tree_swap(&msp->ms_freed, defer_tree); 2977 } else { 2978 range_tree_vacate(msp->ms_freed, 2979 msp->ms_loaded ? range_tree_add : NULL, 2980 msp->ms_allocatable); 2981 } 2982 2983 msp->ms_synced_length = space_map_length(msp->ms_sm); 2984 2985 msp->ms_deferspace += defer_delta; 2986 ASSERT3S(msp->ms_deferspace, >=, 0); 2987 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2988 if (msp->ms_deferspace != 0) { 2989 /* 2990 * Keep syncing this metaslab until all deferred frees 2991 * are back in circulation. 2992 */ 2993 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2994 } 2995 metaslab_aux_histograms_update_done(msp, defer_allowed); 2996 2997 if (msp->ms_new) { 2998 msp->ms_new = B_FALSE; 2999 mutex_enter(&mg->mg_lock); 3000 mg->mg_ms_ready++; 3001 mutex_exit(&mg->mg_lock); 3002 } 3003 3004 /* 3005 * Re-sort metaslab within its group now that we've adjusted 3006 * its allocatable space. 3007 */ 3008 metaslab_recalculate_weight_and_sort(msp); 3009 3010 /* 3011 * If the metaslab is loaded and we've not tried to load or allocate 3012 * from it in 'metaslab_unload_delay' txgs, then unload it. 
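 * (ms_selected_txg is refreshed both by metaslab_preload() and whenever
 * the metaslab is selected for an allocation attempt, so only genuinely
 * idle metaslabs age out here.)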
3013 */ 3014 if (msp->ms_loaded && 3015 msp->ms_initializing == 0 && 3016 msp->ms_selected_txg + metaslab_unload_delay < txg) { 3017 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 3018 VERIFY0(range_tree_space( 3019 msp->ms_allocating[(txg + t) & TXG_MASK])); 3020 } 3021 if (msp->ms_allocator != -1) { 3022 metaslab_passivate(msp, msp->ms_weight & 3023 ~METASLAB_ACTIVE_MASK); 3024 } 3025 3026 if (!metaslab_debug_unload) 3027 metaslab_unload(msp); 3028 } 3029 3030 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3031 ASSERT0(range_tree_space(msp->ms_freeing)); 3032 ASSERT0(range_tree_space(msp->ms_freed)); 3033 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3034 3035 msp->ms_allocated_this_txg = 0; 3036 mutex_exit(&msp->ms_lock); 3037 } 3038 3039 void 3040 metaslab_sync_reassess(metaslab_group_t *mg) 3041 { 3042 spa_t *spa = mg->mg_class->mc_spa; 3043 3044 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3045 metaslab_group_alloc_update(mg); 3046 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 3047 3048 /* 3049 * Preload the next potential metaslabs but only on active 3050 * metaslab groups. We can get into a state where the metaslab 3051 * is no longer active since we dirty metaslabs as we remove a 3052 * a device, thus potentially making the metaslab group eligible 3053 * for preloading. 3054 */ 3055 if (mg->mg_activation_count > 0) { 3056 metaslab_group_preload(mg); 3057 } 3058 spa_config_exit(spa, SCL_ALLOC, FTAG); 3059 } 3060 3061 /* 3062 * When writing a ditto block (i.e. more than one DVA for a given BP) on 3063 * the same vdev as an existing DVA of this BP, then try to allocate it 3064 * on a different metaslab than existing DVAs (i.e. a unique metaslab). 3065 */ 3066 static boolean_t 3067 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 3068 { 3069 uint64_t dva_ms_id; 3070 3071 if (DVA_GET_ASIZE(dva) == 0) 3072 return (B_TRUE); 3073 3074 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 3075 return (B_TRUE); 3076 3077 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 3078 3079 return (msp->ms_id != dva_ms_id); 3080 } 3081 3082 /* 3083 * ========================================================================== 3084 * Metaslab allocation tracing facility 3085 * ========================================================================== 3086 */ 3087 kstat_t *metaslab_trace_ksp; 3088 kstat_named_t metaslab_trace_over_limit; 3089 3090 void 3091 metaslab_alloc_trace_init(void) 3092 { 3093 ASSERT(metaslab_alloc_trace_cache == NULL); 3094 metaslab_alloc_trace_cache = kmem_cache_create( 3095 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 3096 0, NULL, NULL, NULL, NULL, NULL, 0); 3097 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 3098 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 3099 if (metaslab_trace_ksp != NULL) { 3100 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 3101 kstat_named_init(&metaslab_trace_over_limit, 3102 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 3103 kstat_install(metaslab_trace_ksp); 3104 } 3105 } 3106 3107 void 3108 metaslab_alloc_trace_fini(void) 3109 { 3110 if (metaslab_trace_ksp != NULL) { 3111 kstat_delete(metaslab_trace_ksp); 3112 metaslab_trace_ksp = NULL; 3113 } 3114 kmem_cache_destroy(metaslab_alloc_trace_cache); 3115 metaslab_alloc_trace_cache = NULL; 3116 } 3117 3118 /* 3119 * Add an allocation trace element to the allocation tracing list. 
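 * Each entry records the group, metaslab, requested size, DVA index,
 * allocator and the metaslab's weight at the time, plus either the
 * allocated offset or a TRACE_* sentinel describing why the attempt
 * was skipped or failed.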
3120 */ 3121 static void 3122 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 3123 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 3124 int allocator) 3125 { 3126 if (!metaslab_trace_enabled) 3127 return; 3128 3129 /* 3130 * When the tracing list reaches its maximum we remove 3131 * the second element in the list before adding a new one. 3132 * By removing the second element we preserve the original 3133 * entry as a clue to what allocations steps have already been 3134 * performed. 3135 */ 3136 if (zal->zal_size == metaslab_trace_max_entries) { 3137 metaslab_alloc_trace_t *mat_next; 3138 #ifdef DEBUG 3139 panic("too many entries in allocation list"); 3140 #endif 3141 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 3142 zal->zal_size--; 3143 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 3144 list_remove(&zal->zal_list, mat_next); 3145 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 3146 } 3147 3148 metaslab_alloc_trace_t *mat = 3149 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 3150 list_link_init(&mat->mat_list_node); 3151 mat->mat_mg = mg; 3152 mat->mat_msp = msp; 3153 mat->mat_size = psize; 3154 mat->mat_dva_id = dva_id; 3155 mat->mat_offset = offset; 3156 mat->mat_weight = 0; 3157 mat->mat_allocator = allocator; 3158 3159 if (msp != NULL) 3160 mat->mat_weight = msp->ms_weight; 3161 3162 /* 3163 * The list is part of the zio so locking is not required. Only 3164 * a single thread will perform allocations for a given zio. 3165 */ 3166 list_insert_tail(&zal->zal_list, mat); 3167 zal->zal_size++; 3168 3169 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 3170 } 3171 3172 void 3173 metaslab_trace_init(zio_alloc_list_t *zal) 3174 { 3175 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 3176 offsetof(metaslab_alloc_trace_t, mat_list_node)); 3177 zal->zal_size = 0; 3178 } 3179 3180 void 3181 metaslab_trace_fini(zio_alloc_list_t *zal) 3182 { 3183 metaslab_alloc_trace_t *mat; 3184 3185 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 3186 kmem_cache_free(metaslab_alloc_trace_cache, mat); 3187 list_destroy(&zal->zal_list); 3188 zal->zal_size = 0; 3189 } 3190 3191 /* 3192 * ========================================================================== 3193 * Metaslab block operations 3194 * ========================================================================== 3195 */ 3196 3197 static void 3198 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 3199 int allocator) 3200 { 3201 if (!(flags & METASLAB_ASYNC_ALLOC) || 3202 (flags & METASLAB_DONT_THROTTLE)) 3203 return; 3204 3205 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3206 if (!mg->mg_class->mc_alloc_throttle_enabled) 3207 return; 3208 3209 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 3210 } 3211 3212 static void 3213 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 3214 { 3215 uint64_t max = mg->mg_max_alloc_queue_depth; 3216 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3217 while (cur < max) { 3218 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 3219 cur, cur + 1) == cur) { 3220 atomic_inc_64( 3221 &mg->mg_class->mc_alloc_max_slots[allocator]); 3222 return; 3223 } 3224 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3225 } 3226 } 3227 3228 void 3229 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 3230 int allocator, boolean_t io_complete) 3231 { 3232 if (!(flags & METASLAB_ASYNC_ALLOC) || 
3233 (flags & METASLAB_DONT_THROTTLE)) 3234 return; 3235 3236 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3237 if (!mg->mg_class->mc_alloc_throttle_enabled) 3238 return; 3239 3240 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 3241 if (io_complete) 3242 metaslab_group_increment_qdepth(mg, allocator); 3243 } 3244 3245 void 3246 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 3247 int allocator) 3248 { 3249 #ifdef ZFS_DEBUG 3250 const dva_t *dva = bp->blk_dva; 3251 int ndvas = BP_GET_NDVAS(bp); 3252 3253 for (int d = 0; d < ndvas; d++) { 3254 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 3255 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3256 VERIFY(zfs_refcount_not_held( 3257 &mg->mg_alloc_queue_depth[allocator], tag)); 3258 } 3259 #endif 3260 } 3261 3262 static uint64_t 3263 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 3264 { 3265 uint64_t start; 3266 range_tree_t *rt = msp->ms_allocatable; 3267 metaslab_class_t *mc = msp->ms_group->mg_class; 3268 3269 VERIFY(!msp->ms_condensing); 3270 VERIFY0(msp->ms_initializing); 3271 3272 start = mc->mc_ops->msop_alloc(msp, size); 3273 if (start != -1ULL) { 3274 metaslab_group_t *mg = msp->ms_group; 3275 vdev_t *vd = mg->mg_vd; 3276 3277 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 3278 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3279 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 3280 range_tree_remove(rt, start, size); 3281 3282 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3283 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 3284 3285 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 3286 3287 /* Track the last successful allocation */ 3288 msp->ms_alloc_txg = txg; 3289 metaslab_verify_space(msp, txg); 3290 } 3291 3292 /* 3293 * Now that we've attempted the allocation we need to update the 3294 * metaslab's maximum block size since it may have changed. 3295 */ 3296 msp->ms_max_size = metaslab_block_maxsize(msp); 3297 return (start); 3298 } 3299 3300 /* 3301 * Find the metaslab with the highest weight that is less than what we've 3302 * already tried. In the common case, this means that we will examine each 3303 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3304 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3305 * activated by another thread, and we fail to allocate from the metaslab we 3306 * have selected, we may not try the newly-activated metaslab, and instead 3307 * activate another metaslab. This is not optimal, but generally does not cause 3308 * any problems (a possible exception being if every metaslab is completely full 3309 * except for the the newly-activated metaslab which we fail to examine). 
3310 */ 3311 static metaslab_t * 3312 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3313 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 3314 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3315 { 3316 avl_index_t idx; 3317 avl_tree_t *t = &mg->mg_metaslab_tree; 3318 metaslab_t *msp = avl_find(t, search, &idx); 3319 if (msp == NULL) 3320 msp = avl_nearest(t, idx, AVL_AFTER); 3321 3322 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3323 int i; 3324 if (!metaslab_should_allocate(msp, asize)) { 3325 metaslab_trace_add(zal, mg, msp, asize, d, 3326 TRACE_TOO_SMALL, allocator); 3327 continue; 3328 } 3329 3330 /* 3331 * If the selected metaslab is condensing or being 3332 * initialized, skip it. 3333 */ 3334 if (msp->ms_condensing || msp->ms_initializing > 0) 3335 continue; 3336 3337 *was_active = msp->ms_allocator != -1; 3338 /* 3339 * If we're activating as primary, this is our first allocation 3340 * from this disk, so we don't need to check how close we are. 3341 * If the metaslab under consideration was already active, 3342 * we're getting desperate enough to steal another allocator's 3343 * metaslab, so we still don't care about distances. 3344 */ 3345 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3346 break; 3347 3348 for (i = 0; i < d; i++) { 3349 if (want_unique && 3350 !metaslab_is_unique(msp, &dva[i])) 3351 break; /* try another metaslab */ 3352 } 3353 if (i == d) 3354 break; 3355 } 3356 3357 if (msp != NULL) { 3358 search->ms_weight = msp->ms_weight; 3359 search->ms_start = msp->ms_start + 1; 3360 search->ms_allocator = msp->ms_allocator; 3361 search->ms_primary = msp->ms_primary; 3362 } 3363 return (msp); 3364 } 3365 3366 /* ARGSUSED */ 3367 static uint64_t 3368 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3369 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3370 int d, int allocator) 3371 { 3372 metaslab_t *msp = NULL; 3373 uint64_t offset = -1ULL; 3374 uint64_t activation_weight; 3375 3376 activation_weight = METASLAB_WEIGHT_PRIMARY; 3377 for (int i = 0; i < d; i++) { 3378 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3379 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3380 activation_weight = METASLAB_WEIGHT_SECONDARY; 3381 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3382 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3383 activation_weight = METASLAB_WEIGHT_CLAIM; 3384 break; 3385 } 3386 } 3387 3388 /* 3389 * If we don't have enough metaslabs active to fill the entire array, we 3390 * just use the 0th slot. 3391 */ 3392 if (mg->mg_ms_ready < mg->mg_allocators * 3) 3393 allocator = 0; 3394 3395 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3396 3397 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3398 search->ms_weight = UINT64_MAX; 3399 search->ms_start = 0; 3400 /* 3401 * At the end of the metaslab tree are the already-active metaslabs, 3402 * first the primaries, then the secondaries. When we resume searching 3403 * through the tree, we need to consider ms_allocator and ms_primary so 3404 * we start in the location right after where we left off, and don't 3405 * accidentally loop forever considering the same metaslabs. 
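 * find_valid_metaslab() copies the returned metaslab's weight, allocator
 * and primary flag into this search key, with ms_start bumped by one,
 * so each retry of the loop below resumes the AVL walk just past the
 * previous candidate.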
3406 */ 3407 search->ms_allocator = -1; 3408 search->ms_primary = B_TRUE; 3409 for (;;) { 3410 boolean_t was_active = B_FALSE; 3411 3412 mutex_enter(&mg->mg_lock); 3413 3414 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3415 mg->mg_primaries[allocator] != NULL) { 3416 msp = mg->mg_primaries[allocator]; 3417 was_active = B_TRUE; 3418 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3419 mg->mg_secondaries[allocator] != NULL) { 3420 msp = mg->mg_secondaries[allocator]; 3421 was_active = B_TRUE; 3422 } else { 3423 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3424 want_unique, asize, allocator, zal, search, 3425 &was_active); 3426 } 3427 3428 mutex_exit(&mg->mg_lock); 3429 if (msp == NULL) { 3430 kmem_free(search, sizeof (*search)); 3431 return (-1ULL); 3432 } 3433 3434 mutex_enter(&msp->ms_lock); 3435 /* 3436 * Ensure that the metaslab we have selected is still 3437 * capable of handling our request. It's possible that 3438 * another thread may have changed the weight while we 3439 * were blocked on the metaslab lock. We check the 3440 * active status first to see if we need to reselect 3441 * a new metaslab. 3442 */ 3443 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3444 mutex_exit(&msp->ms_lock); 3445 continue; 3446 } 3447 3448 /* 3449 * If the metaslab is freshly activated for an allocator that 3450 * isn't the one we're allocating from, or if it's a primary and 3451 * we're seeking a secondary (or vice versa), we go back and 3452 * select a new metaslab. 3453 */ 3454 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 3455 (msp->ms_allocator != -1) && 3456 (msp->ms_allocator != allocator || ((activation_weight == 3457 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 3458 mutex_exit(&msp->ms_lock); 3459 continue; 3460 } 3461 3462 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 3463 activation_weight != METASLAB_WEIGHT_CLAIM) { 3464 metaslab_passivate(msp, msp->ms_weight & 3465 ~METASLAB_WEIGHT_CLAIM); 3466 mutex_exit(&msp->ms_lock); 3467 continue; 3468 } 3469 3470 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 3471 mutex_exit(&msp->ms_lock); 3472 continue; 3473 } 3474 3475 msp->ms_selected_txg = txg; 3476 3477 /* 3478 * Now that we have the lock, recheck to see if we should 3479 * continue to use this metaslab for this allocation. The 3480 * metaslab is now loaded so metaslab_should_allocate() can 3481 * accurately determine if the allocation attempt should 3482 * proceed. 3483 */ 3484 if (!metaslab_should_allocate(msp, asize)) { 3485 /* Passivate this metaslab and select a new one. */ 3486 metaslab_trace_add(zal, mg, msp, asize, d, 3487 TRACE_TOO_SMALL, allocator); 3488 goto next; 3489 } 3490 3491 /* 3492 * If this metaslab is currently condensing then pick again as 3493 * we can't manipulate this metaslab until it's committed 3494 * to disk. If this metaslab is being initialized, we shouldn't 3495 * allocate from it since the allocated region might be 3496 * overwritten after allocation.
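 * In either case we passivate the metaslab (clearing
 * METASLAB_ACTIVE_MASK from its weight) before moving on, so it can be
 * selected again once the condense or initialize has finished.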
3497 */ 3498 if (msp->ms_condensing) { 3499 metaslab_trace_add(zal, mg, msp, asize, d, 3500 TRACE_CONDENSING, allocator); 3501 metaslab_passivate(msp, msp->ms_weight & 3502 ~METASLAB_ACTIVE_MASK); 3503 mutex_exit(&msp->ms_lock); 3504 continue; 3505 } else if (msp->ms_initializing > 0) { 3506 metaslab_trace_add(zal, mg, msp, asize, d, 3507 TRACE_INITIALIZING, allocator); 3508 metaslab_passivate(msp, msp->ms_weight & 3509 ~METASLAB_ACTIVE_MASK); 3510 mutex_exit(&msp->ms_lock); 3511 continue; 3512 } 3513 3514 offset = metaslab_block_alloc(msp, asize, txg); 3515 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 3516 3517 if (offset != -1ULL) { 3518 /* Proactively passivate the metaslab, if needed */ 3519 metaslab_segment_may_passivate(msp); 3520 break; 3521 } 3522 next: 3523 ASSERT(msp->ms_loaded); 3524 3525 /* 3526 * We were unable to allocate from this metaslab so determine 3527 * a new weight for this metaslab. Now that we have loaded 3528 * the metaslab we can provide a better hint to the metaslab 3529 * selector. 3530 * 3531 * For space-based metaslabs, we use the maximum block size. 3532 * This information is only available when the metaslab 3533 * is loaded and is more accurate than the generic free 3534 * space weight that was calculated by metaslab_weight(). 3535 * This information allows us to quickly compare the maximum 3536 * available allocation in the metaslab to the allocation 3537 * size being requested. 3538 * 3539 * For segment-based metaslabs, determine the new weight 3540 * based on the highest bucket in the range tree. We 3541 * explicitly use the loaded segment weight (i.e. the range 3542 * tree histogram) since it contains the space that is 3543 * currently available for allocation and is accurate 3544 * even within a sync pass. 3545 */ 3546 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3547 uint64_t weight = metaslab_block_maxsize(msp); 3548 WEIGHT_SET_SPACEBASED(weight); 3549 metaslab_passivate(msp, weight); 3550 } else { 3551 metaslab_passivate(msp, 3552 metaslab_weight_from_range_tree(msp)); 3553 } 3554 3555 /* 3556 * We have just failed an allocation attempt, check 3557 * that metaslab_should_allocate() agrees. Otherwise, 3558 * we may end up in an infinite loop retrying the same 3559 * metaslab. 3560 */ 3561 ASSERT(!metaslab_should_allocate(msp, asize)); 3562 3563 mutex_exit(&msp->ms_lock); 3564 } 3565 mutex_exit(&msp->ms_lock); 3566 kmem_free(search, sizeof (*search)); 3567 return (offset); 3568 } 3569 3570 static uint64_t 3571 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3572 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3573 int d, int allocator) 3574 { 3575 uint64_t offset; 3576 ASSERT(mg->mg_initialized); 3577 3578 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 3579 dva, d, allocator); 3580 3581 mutex_enter(&mg->mg_lock); 3582 if (offset == -1ULL) { 3583 mg->mg_failed_allocations++; 3584 metaslab_trace_add(zal, mg, NULL, asize, d, 3585 TRACE_GROUP_FAILURE, allocator); 3586 if (asize == SPA_GANGBLOCKSIZE) { 3587 /* 3588 * This metaslab group was unable to allocate 3589 * the minimum gang block size so it must be out of 3590 * space. We must notify the allocation throttle 3591 * to start skipping allocation attempts to this 3592 * metaslab group until more space becomes available. 3593 * Note: this failure cannot be caused by the 3594 * allocation throttle since the allocation throttle 3595 * is only responsible for skipping devices and 3596 * not failing block allocations. 
3597 */ 3598 mg->mg_no_free_space = B_TRUE; 3599 } 3600 } 3601 mg->mg_allocations++; 3602 mutex_exit(&mg->mg_lock); 3603 return (offset); 3604 } 3605 3606 /* 3607 * Allocate a block for the specified i/o. 3608 */ 3609 int 3610 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3611 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3612 zio_alloc_list_t *zal, int allocator) 3613 { 3614 metaslab_group_t *mg, *rotor; 3615 vdev_t *vd; 3616 boolean_t try_hard = B_FALSE; 3617 3618 ASSERT(!DVA_IS_VALID(&dva[d])); 3619 3620 /* 3621 * For testing, make some blocks above a certain size be gang blocks. 3622 * This will also test spilling from special to normal. 3623 */ 3624 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3625 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3626 allocator); 3627 return (SET_ERROR(ENOSPC)); 3628 } 3629 3630 /* 3631 * Start at the rotor and loop through all mgs until we find something. 3632 * Note that there's no locking on mc_rotor or mc_aliquot because 3633 * nothing actually breaks if we miss a few updates -- we just won't 3634 * allocate quite as evenly. It all balances out over time. 3635 * 3636 * If we are doing ditto or log blocks, try to spread them across 3637 * consecutive vdevs. If we're forced to reuse a vdev before we've 3638 * allocated all of our ditto blocks, then try and spread them out on 3639 * that vdev as much as possible. If it turns out to not be possible, 3640 * gradually lower our standards until anything becomes acceptable. 3641 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3642 * gives us hope of containing our fault domains to something we're 3643 * able to reason about. Otherwise, any two top-level vdev failures 3644 * will guarantee the loss of data. With consecutive allocation, 3645 * only two adjacent top-level vdev failures will result in data loss. 3646 * 3647 * If we are doing gang blocks (hintdva is non-NULL), try to keep 3648 * ourselves on the same vdev as our gang block header. That 3649 * way, we can hope for locality in vdev_cache, plus it makes our 3650 * fault domains something tractable. 3651 */ 3652 if (hintdva) { 3653 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3654 3655 /* 3656 * It's possible the vdev we're using as the hint no 3657 * longer exists or its mg has been closed (e.g. by 3658 * device removal). Consult the rotor when 3659 * all else fails. 3660 */ 3661 if (vd != NULL && vd->vdev_mg != NULL) { 3662 mg = vd->vdev_mg; 3663 3664 if (flags & METASLAB_HINTBP_AVOID && 3665 mg->mg_next != NULL) 3666 mg = mg->mg_next; 3667 } else { 3668 mg = mc->mc_rotor; 3669 } 3670 } else if (d != 0) { 3671 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3672 mg = vd->vdev_mg->mg_next; 3673 } else { 3674 ASSERT(mc->mc_rotor != NULL); 3675 mg = mc->mc_rotor; 3676 } 3677 3678 /* 3679 * If the hint put us into the wrong metaslab class, or into a 3680 * metaslab group that has been passivated, just follow the rotor. 3681 */ 3682 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3683 mg = mc->mc_rotor; 3684 3685 rotor = mg; 3686 top: 3687 do { 3688 boolean_t allocatable; 3689 3690 ASSERT(mg->mg_activation_count == 1); 3691 vd = mg->mg_vd; 3692 3693 /* 3694 * Don't allocate from faulted devices. 
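 *
 * When we are trying hard we take SCL_ZIO as reader around the
 * vdev_allocatable() check so the vdev state cannot change while we
 * look at it; on the first pass we skip the lock and accept a possibly
 * stale answer.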
3695 */ 3696 if (try_hard) { 3697 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3698 allocatable = vdev_allocatable(vd); 3699 spa_config_exit(spa, SCL_ZIO, FTAG); 3700 } else { 3701 allocatable = vdev_allocatable(vd); 3702 } 3703 3704 /* 3705 * Determine if the selected metaslab group is eligible 3706 * for allocations. If we're ganging then don't allow 3707 * this metaslab group to skip allocations since that would 3708 * inadvertently return ENOSPC and suspend the pool 3709 * even though space is still available. 3710 */ 3711 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3712 allocatable = metaslab_group_allocatable(mg, rotor, 3713 psize, allocator); 3714 } 3715 3716 if (!allocatable) { 3717 metaslab_trace_add(zal, mg, NULL, psize, d, 3718 TRACE_NOT_ALLOCATABLE, allocator); 3719 goto next; 3720 } 3721 3722 ASSERT(mg->mg_initialized); 3723 3724 /* 3725 * Avoid writing single-copy data to a failing, 3726 * non-redundant vdev, unless we've already tried all 3727 * other vdevs. 3728 */ 3729 if ((vd->vdev_stat.vs_write_errors > 0 || 3730 vd->vdev_state < VDEV_STATE_HEALTHY) && 3731 d == 0 && !try_hard && vd->vdev_children == 0) { 3732 metaslab_trace_add(zal, mg, NULL, psize, d, 3733 TRACE_VDEV_ERROR, allocator); 3734 goto next; 3735 } 3736 3737 ASSERT(mg->mg_class == mc); 3738 3739 uint64_t asize = vdev_psize_to_asize(vd, psize); 3740 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3741 3742 /* 3743 * If we don't need to try hard, then require that the 3744 * block be on a different metaslab from any other DVAs 3745 * in this BP (unique=true). If we are trying hard, then 3746 * allow any metaslab to be used (unique=false). 3747 */ 3748 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3749 !try_hard, dva, d, allocator); 3750 3751 if (offset != -1ULL) { 3752 /* 3753 * If we've just selected this metaslab group, 3754 * figure out whether the corresponding vdev is 3755 * over- or under-used relative to the pool, 3756 * and set an allocation bias to even it out. 3757 */ 3758 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3759 vdev_stat_t *vs = &vd->vdev_stat; 3760 int64_t vu, cu; 3761 3762 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3763 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3764 3765 /* 3766 * Calculate how much more or less we should 3767 * try to allocate from this device during 3768 * this iteration around the rotor. 3769 * For example, if a device is 80% full 3770 * and the pool is 20% full then we should 3771 * reduce allocations by 60% on this device. 3772 * 3773 * mg_bias = (20 - 80) * 512K / 100 = -307K 3774 * 3775 * This reduces allocations by 307K for this 3776 * iteration. 3777 */ 3778 mg->mg_bias = ((cu - vu) * 3779 (int64_t)mg->mg_aliquot) / 100; 3780 } else if (!metaslab_bias_enabled) { 3781 mg->mg_bias = 0; 3782 } 3783 3784 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3785 mg->mg_aliquot + mg->mg_bias) { 3786 mc->mc_rotor = mg->mg_next; 3787 mc->mc_aliquot = 0; 3788 } 3789 3790 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3791 DVA_SET_OFFSET(&dva[d], offset); 3792 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3793 DVA_SET_ASIZE(&dva[d], asize); 3794 3795 return (0); 3796 } 3797 next: 3798 mc->mc_rotor = mg->mg_next; 3799 mc->mc_aliquot = 0; 3800 } while ((mg = mg->mg_next) != rotor); 3801 3802 /* 3803 * If we haven't tried hard, do so now.
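 *
 * Trying hard relaxes the first-pass constraints: vdev_allocatable()
 * is rechecked under SCL_ZIO, metaslab_group_allocatable() is no
 * longer consulted, the check that avoids writing single-copy data to
 * failing, non-redundant vdevs is waived, and a DVA may share a
 * metaslab with the other DVAs of this BP (want_unique is passed as
 * B_FALSE).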
3804 */ 3805 if (!try_hard) { 3806 try_hard = B_TRUE; 3807 goto top; 3808 } 3809 3810 bzero(&dva[d], sizeof (dva_t)); 3811 3812 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 3813 return (SET_ERROR(ENOSPC)); 3814 } 3815 3816 void 3817 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3818 boolean_t checkpoint) 3819 { 3820 metaslab_t *msp; 3821 spa_t *spa = vd->vdev_spa; 3822 3823 ASSERT(vdev_is_concrete(vd)); 3824 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3825 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3826 3827 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3828 3829 VERIFY(!msp->ms_condensing); 3830 VERIFY3U(offset, >=, msp->ms_start); 3831 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 3832 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3833 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 3834 3835 metaslab_check_free_impl(vd, offset, asize); 3836 3837 mutex_enter(&msp->ms_lock); 3838 if (range_tree_is_empty(msp->ms_freeing) && 3839 range_tree_is_empty(msp->ms_checkpointing)) { 3840 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 3841 } 3842 3843 if (checkpoint) { 3844 ASSERT(spa_has_checkpoint(spa)); 3845 range_tree_add(msp->ms_checkpointing, offset, asize); 3846 } else { 3847 range_tree_add(msp->ms_freeing, offset, asize); 3848 } 3849 mutex_exit(&msp->ms_lock); 3850 } 3851 3852 /* ARGSUSED */ 3853 void 3854 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3855 uint64_t size, void *arg) 3856 { 3857 boolean_t *checkpoint = arg; 3858 3859 ASSERT3P(checkpoint, !=, NULL); 3860 3861 if (vd->vdev_ops->vdev_op_remap != NULL) 3862 vdev_indirect_mark_obsolete(vd, offset, size); 3863 else 3864 metaslab_free_impl(vd, offset, size, *checkpoint); 3865 } 3866 3867 static void 3868 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 3869 boolean_t checkpoint) 3870 { 3871 spa_t *spa = vd->vdev_spa; 3872 3873 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3874 3875 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 3876 return; 3877 3878 if (spa->spa_vdev_removal != NULL && 3879 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 3880 vdev_is_concrete(vd)) { 3881 /* 3882 * Note: we check if the vdev is concrete because when 3883 * we complete the removal, we first change the vdev to be 3884 * an indirect vdev (in open context), and then (in syncing 3885 * context) clear spa_vdev_removal. 3886 */ 3887 free_from_removing_vdev(vd, offset, size); 3888 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 3889 vdev_indirect_mark_obsolete(vd, offset, size); 3890 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3891 metaslab_free_impl_cb, &checkpoint); 3892 } else { 3893 metaslab_free_concrete(vd, offset, size, checkpoint); 3894 } 3895 } 3896 3897 typedef struct remap_blkptr_cb_arg { 3898 blkptr_t *rbca_bp; 3899 spa_remap_cb_t rbca_cb; 3900 vdev_t *rbca_remap_vd; 3901 uint64_t rbca_remap_offset; 3902 void *rbca_cb_arg; 3903 } remap_blkptr_cb_arg_t; 3904 3905 void 3906 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3907 uint64_t size, void *arg) 3908 { 3909 remap_blkptr_cb_arg_t *rbca = arg; 3910 blkptr_t *bp = rbca->rbca_bp; 3911 3912 /* We can not remap split blocks. 
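A split block is one whose DVA maps to more than one segment of the indirect vdev's mapping, so the callback sees a size smaller than the asize recorded in dva[0].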
*/ 3913 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 3914 return; 3915 ASSERT0(inner_offset); 3916 3917 if (rbca->rbca_cb != NULL) { 3918 /* 3919 * At this point we know that we are not handling split 3920 * blocks and we invoke the callback on the previous 3921 * vdev which must be indirect. 3922 */ 3923 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 3924 3925 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 3926 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 3927 3928 /* set up remap_blkptr_cb_arg for the next call */ 3929 rbca->rbca_remap_vd = vd; 3930 rbca->rbca_remap_offset = offset; 3931 } 3932 3933 /* 3934 * The phys birth time is that of dva[0]. This ensures that we know 3935 * when each dva was written, so that resilver can determine which 3936 * blocks need to be scrubbed (i.e. those written during the time 3937 * the vdev was offline). It also ensures that the key used in 3938 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 3939 * we didn't change the phys_birth, a lookup in the ARC for a 3940 * remapped BP could find the data that was previously stored at 3941 * this vdev + offset. 3942 */ 3943 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 3944 DVA_GET_VDEV(&bp->blk_dva[0])); 3945 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 3946 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 3947 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 3948 3949 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 3950 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 3951 } 3952 3953 /* 3954 * If the block pointer contains any indirect DVAs, modify them to refer to 3955 * concrete DVAs. Note that this will sometimes not be possible, leaving 3956 * the indirect DVA in place. This happens if the indirect DVA spans multiple 3957 * segments in the mapping (i.e. it is a "split block"). 3958 * 3959 * If the BP was remapped, calls the callback on the original dva (note the 3960 * callback can be called multiple times if the original indirect DVA refers 3961 * to another indirect DVA, etc). 3962 * 3963 * Returns TRUE if the BP was remapped. 3964 */ 3965 boolean_t 3966 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 3967 { 3968 remap_blkptr_cb_arg_t rbca; 3969 3970 if (!zfs_remap_blkptr_enable) 3971 return (B_FALSE); 3972 3973 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 3974 return (B_FALSE); 3975 3976 /* 3977 * Dedup BP's can not be remapped, because ddt_phys_select() depends 3978 * on DVA[0] being the same in the BP as in the DDT (dedup table). 3979 */ 3980 if (BP_GET_DEDUP(bp)) 3981 return (B_FALSE); 3982 3983 /* 3984 * Gang blocks can not be remapped, because 3985 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 3986 * the BP used to read the gang block header (GBH) being the same 3987 * as the DVA[0] that we allocated for the GBH. 3988 */ 3989 if (BP_IS_GANG(bp)) 3990 return (B_FALSE); 3991 3992 /* 3993 * Embedded BP's have no DVA to remap. 3994 */ 3995 if (BP_GET_NDVAS(bp) < 1) 3996 return (B_FALSE); 3997 3998 /* 3999 * Note: we only remap dva[0]. If we remapped other dvas, we 4000 * would no longer know what their phys birth txg is. 
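 *
 * (The BP carries a single blk_phys_birth field, which is updated here
 * when dva[0] is remapped; if we also remapped dva[1] or dva[2] there
 * would be nowhere to record their birth times.)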
4001 */ 4002 dva_t *dva = &bp->blk_dva[0]; 4003 4004 uint64_t offset = DVA_GET_OFFSET(dva); 4005 uint64_t size = DVA_GET_ASIZE(dva); 4006 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 4007 4008 if (vd->vdev_ops->vdev_op_remap == NULL) 4009 return (B_FALSE); 4010 4011 rbca.rbca_bp = bp; 4012 rbca.rbca_cb = callback; 4013 rbca.rbca_remap_vd = vd; 4014 rbca.rbca_remap_offset = offset; 4015 rbca.rbca_cb_arg = arg; 4016 4017 /* 4018 * remap_blkptr_cb() will be called in order for each level of 4019 * indirection, until a concrete vdev is reached or a split block is 4020 * encountered. old_vd and old_offset are updated within the callback 4021 * as we go from the one indirect vdev to the next one (either concrete 4022 * or indirect again) in that order. 4023 */ 4024 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 4025 4026 /* Check if the DVA wasn't remapped because it is a split block */ 4027 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 4028 return (B_FALSE); 4029 4030 return (B_TRUE); 4031 } 4032 4033 /* 4034 * Undo the allocation of a DVA which happened in the given transaction group. 4035 */ 4036 void 4037 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4038 { 4039 metaslab_t *msp; 4040 vdev_t *vd; 4041 uint64_t vdev = DVA_GET_VDEV(dva); 4042 uint64_t offset = DVA_GET_OFFSET(dva); 4043 uint64_t size = DVA_GET_ASIZE(dva); 4044 4045 ASSERT(DVA_IS_VALID(dva)); 4046 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4047 4048 if (txg > spa_freeze_txg(spa)) 4049 return; 4050 4051 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 4052 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 4053 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 4054 (u_longlong_t)vdev, (u_longlong_t)offset); 4055 ASSERT(0); 4056 return; 4057 } 4058 4059 ASSERT(!vd->vdev_removing); 4060 ASSERT(vdev_is_concrete(vd)); 4061 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 4062 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 4063 4064 if (DVA_GET_GANG(dva)) 4065 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4066 4067 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4068 4069 mutex_enter(&msp->ms_lock); 4070 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 4071 offset, size); 4072 4073 VERIFY(!msp->ms_condensing); 4074 VERIFY3U(offset, >=, msp->ms_start); 4075 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 4076 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 4077 msp->ms_size); 4078 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4079 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4080 range_tree_add(msp->ms_allocatable, offset, size); 4081 mutex_exit(&msp->ms_lock); 4082 } 4083 4084 /* 4085 * Free the block represented by the given DVA. 4086 */ 4087 void 4088 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 4089 { 4090 uint64_t vdev = DVA_GET_VDEV(dva); 4091 uint64_t offset = DVA_GET_OFFSET(dva); 4092 uint64_t size = DVA_GET_ASIZE(dva); 4093 vdev_t *vd = vdev_lookup_top(spa, vdev); 4094 4095 ASSERT(DVA_IS_VALID(dva)); 4096 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4097 4098 if (DVA_GET_GANG(dva)) { 4099 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4100 } 4101 4102 metaslab_free_impl(vd, offset, size, checkpoint); 4103 } 4104 4105 /* 4106 * Reserve some allocation slots. The reservation system must be called 4107 * before we call into the allocator. 
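Each allocator has its own limit (mc_alloc_max_slots[allocator]) and its own refcount of outstanding reservations (mc_alloc_slots[allocator]).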
If there aren't any available slots 4108 * then the I/O will be throttled until an I/O completes and its slots are 4109 * freed up. The function returns true if it was successful in placing 4110 * the reservation. 4111 */ 4112 boolean_t 4113 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 4114 zio_t *zio, int flags) 4115 { 4116 uint64_t available_slots = 0; 4117 boolean_t slot_reserved = B_FALSE; 4118 uint64_t max = mc->mc_alloc_max_slots[allocator]; 4119 4120 ASSERT(mc->mc_alloc_throttle_enabled); 4121 mutex_enter(&mc->mc_lock); 4122 4123 uint64_t reserved_slots = 4124 zfs_refcount_count(&mc->mc_alloc_slots[allocator]); 4125 if (reserved_slots < max) 4126 available_slots = max - reserved_slots; 4127 4128 if (slots <= available_slots || GANG_ALLOCATION(flags) || 4129 flags & METASLAB_MUST_RESERVE) { 4130 /* 4131 * We reserve the slots individually so that we can unreserve 4132 * them individually when an I/O completes. 4133 */ 4134 for (int d = 0; d < slots; d++) { 4135 reserved_slots = 4136 zfs_refcount_add(&mc->mc_alloc_slots[allocator], 4137 zio); 4138 } 4139 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 4140 slot_reserved = B_TRUE; 4141 } 4142 4143 mutex_exit(&mc->mc_lock); 4144 return (slot_reserved); 4145 } 4146 4147 void 4148 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 4149 int allocator, zio_t *zio) 4150 { 4151 ASSERT(mc->mc_alloc_throttle_enabled); 4152 mutex_enter(&mc->mc_lock); 4153 for (int d = 0; d < slots; d++) { 4154 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], 4155 zio); 4156 } 4157 mutex_exit(&mc->mc_lock); 4158 } 4159 4160 static int 4161 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 4162 uint64_t txg) 4163 { 4164 metaslab_t *msp; 4165 spa_t *spa = vd->vdev_spa; 4166 int error = 0; 4167 4168 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 4169 return (ENXIO); 4170 4171 ASSERT3P(vd->vdev_ms, !=, NULL); 4172 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4173 4174 mutex_enter(&msp->ms_lock); 4175 4176 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 4177 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 4178 /* 4179 * No need to fail in that case; someone else has activated the 4180 * metaslab, but that doesn't preclude us from using it. 
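 * (metaslab_activate() returns EBUSY when the metaslab is already
 * active; since all we need here is METASLAB_WEIGHT_CLAIM, we can go
 * ahead and claim from it once it is loaded.)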
4181 */ 4182 if (error == EBUSY) 4183 error = 0; 4184 4185 if (error == 0 && 4186 !range_tree_contains(msp->ms_allocatable, offset, size)) 4187 error = SET_ERROR(ENOENT); 4188 4189 if (error || txg == 0) { /* txg == 0 indicates dry run */ 4190 mutex_exit(&msp->ms_lock); 4191 return (error); 4192 } 4193 4194 VERIFY(!msp->ms_condensing); 4195 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4196 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4197 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 4198 msp->ms_size); 4199 range_tree_remove(msp->ms_allocatable, offset, size); 4200 4201 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 4202 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4203 vdev_dirty(vd, VDD_METASLAB, msp, txg); 4204 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 4205 offset, size); 4206 } 4207 4208 mutex_exit(&msp->ms_lock); 4209 4210 return (0); 4211 } 4212 4213 typedef struct metaslab_claim_cb_arg_t { 4214 uint64_t mcca_txg; 4215 int mcca_error; 4216 } metaslab_claim_cb_arg_t; 4217 4218 /* ARGSUSED */ 4219 static void 4220 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4221 uint64_t size, void *arg) 4222 { 4223 metaslab_claim_cb_arg_t *mcca_arg = arg; 4224 4225 if (mcca_arg->mcca_error == 0) { 4226 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 4227 size, mcca_arg->mcca_txg); 4228 } 4229 } 4230 4231 int 4232 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 4233 { 4234 if (vd->vdev_ops->vdev_op_remap != NULL) { 4235 metaslab_claim_cb_arg_t arg; 4236 4237 /* 4238 * Only zdb(1M) can claim on indirect vdevs. This is used 4239 * to detect leaks of mapped space (that are not accounted 4240 * for in the obsolete counts, spacemap, or bpobj). 4241 */ 4242 ASSERT(!spa_writeable(vd->vdev_spa)); 4243 arg.mcca_error = 0; 4244 arg.mcca_txg = txg; 4245 4246 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4247 metaslab_claim_impl_cb, &arg); 4248 4249 if (arg.mcca_error == 0) { 4250 arg.mcca_error = metaslab_claim_concrete(vd, 4251 offset, size, txg); 4252 } 4253 return (arg.mcca_error); 4254 } else { 4255 return (metaslab_claim_concrete(vd, offset, size, txg)); 4256 } 4257 } 4258 4259 /* 4260 * Intent log support: upon opening the pool after a crash, notify the SPA 4261 * of blocks that the intent log has allocated for immediate write, but 4262 * which are still considered free by the SPA because the last transaction 4263 * group didn't commit yet. 4264 */ 4265 static int 4266 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4267 { 4268 uint64_t vdev = DVA_GET_VDEV(dva); 4269 uint64_t offset = DVA_GET_OFFSET(dva); 4270 uint64_t size = DVA_GET_ASIZE(dva); 4271 vdev_t *vd; 4272 4273 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 4274 return (SET_ERROR(ENXIO)); 4275 } 4276 4277 ASSERT(DVA_IS_VALID(dva)); 4278 4279 if (DVA_GET_GANG(dva)) 4280 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4281 4282 return (metaslab_claim_impl(vd, offset, size, txg)); 4283 } 4284 4285 int 4286 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 4287 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 4288 zio_alloc_list_t *zal, zio_t *zio, int allocator) 4289 { 4290 dva_t *dva = bp->blk_dva; 4291 dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; 4292 int error = 0; 4293 4294 ASSERT(bp->blk_birth == 0); 4295 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 4296 4297 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4298 4299 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 4300 spa_config_exit(spa, SCL_ALLOC, FTAG); 4301 return (SET_ERROR(ENOSPC)); 4302 } 4303 4304 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 4305 ASSERT(BP_GET_NDVAS(bp) == 0); 4306 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 4307 ASSERT3P(zal, !=, NULL); 4308 4309 for (int d = 0; d < ndvas; d++) { 4310 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 4311 txg, flags, zal, allocator); 4312 if (error != 0) { 4313 for (d--; d >= 0; d--) { 4314 metaslab_unalloc_dva(spa, &dva[d], txg); 4315 metaslab_group_alloc_decrement(spa, 4316 DVA_GET_VDEV(&dva[d]), zio, flags, 4317 allocator, B_FALSE); 4318 bzero(&dva[d], sizeof (dva_t)); 4319 } 4320 spa_config_exit(spa, SCL_ALLOC, FTAG); 4321 return (error); 4322 } else { 4323 /* 4324 * Update the metaslab group's queue depth 4325 * based on the newly allocated dva. 4326 */ 4327 metaslab_group_alloc_increment(spa, 4328 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 4329 } 4330 4331 } 4332 ASSERT(error == 0); 4333 ASSERT(BP_GET_NDVAS(bp) == ndvas); 4334 4335 spa_config_exit(spa, SCL_ALLOC, FTAG); 4336 4337 BP_SET_BIRTH(bp, txg, txg); 4338 4339 return (0); 4340 } 4341 4342 void 4343 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 4344 { 4345 const dva_t *dva = bp->blk_dva; 4346 int ndvas = BP_GET_NDVAS(bp); 4347 4348 ASSERT(!BP_IS_HOLE(bp)); 4349 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 4350 4351 /* 4352 * If we have a checkpoint for the pool we need to make sure that 4353 * the blocks that we free that are part of the checkpoint won't be 4354 * reused until the checkpoint is discarded or we revert to it. 4355 * 4356 * The checkpoint flag is passed down the metaslab_free code path 4357 * and is set whenever we want to add a block to the checkpoint's 4358 * accounting. That is, we "checkpoint" blocks that existed at the 4359 * time the checkpoint was created and are therefore referenced by 4360 * the checkpointed uberblock. 4361 * 4362 * Note that, we don't checkpoint any blocks if the current 4363 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 4364 * normally as they will be referenced by the checkpointed uberblock. 4365 */ 4366 boolean_t checkpoint = B_FALSE; 4367 if (bp->blk_birth <= spa->spa_checkpoint_txg && 4368 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 4369 /* 4370 * At this point, if the block is part of the checkpoint 4371 * there is no way it was created in the current txg. 4372 */ 4373 ASSERT(!now); 4374 ASSERT3U(spa_syncing_txg(spa), ==, txg); 4375 checkpoint = B_TRUE; 4376 } 4377 4378 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 4379 4380 for (int d = 0; d < ndvas; d++) { 4381 if (now) { 4382 metaslab_unalloc_dva(spa, &dva[d], txg); 4383 } else { 4384 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 4385 metaslab_free_dva(spa, &dva[d], checkpoint); 4386 } 4387 } 4388 4389 spa_config_exit(spa, SCL_FREE, FTAG); 4390 } 4391 4392 int 4393 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 4394 { 4395 const dva_t *dva = bp->blk_dva; 4396 int ndvas = BP_GET_NDVAS(bp); 4397 int error = 0; 4398 4399 ASSERT(!BP_IS_HOLE(bp)); 4400 4401 if (txg != 0) { 4402 /* 4403 * First do a dry run to make sure all DVAs are claimable, 4404 * so we don't have to unwind from partial failures below. 
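 * A dry run is requested by passing txg == 0: metaslab_claim_concrete()
 * verifies that each range is still free but returns before removing
 * anything from ms_allocatable.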
4405 */ 4406 if ((error = metaslab_claim(spa, bp, 0)) != 0) 4407 return (error); 4408 } 4409 4410 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4411 4412 for (int d = 0; d < ndvas; d++) { 4413 error = metaslab_claim_dva(spa, &dva[d], txg); 4414 if (error != 0) 4415 break; 4416 } 4417 4418 spa_config_exit(spa, SCL_ALLOC, FTAG); 4419 4420 ASSERT(error == 0 || txg == 0); 4421 4422 return (error); 4423 } 4424 4425 /* ARGSUSED */ 4426 static void 4427 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 4428 uint64_t size, void *arg) 4429 { 4430 if (vd->vdev_ops == &vdev_indirect_ops) 4431 return; 4432 4433 metaslab_check_free_impl(vd, offset, size); 4434 } 4435 4436 static void 4437 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 4438 { 4439 metaslab_t *msp; 4440 spa_t *spa = vd->vdev_spa; 4441 4442 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4443 return; 4444 4445 if (vd->vdev_ops->vdev_op_remap != NULL) { 4446 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4447 metaslab_check_free_impl_cb, NULL); 4448 return; 4449 } 4450 4451 ASSERT(vdev_is_concrete(vd)); 4452 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 4453 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4454 4455 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4456 4457 mutex_enter(&msp->ms_lock); 4458 if (msp->ms_loaded) { 4459 range_tree_verify_not_present(msp->ms_allocatable, 4460 offset, size); 4461 } 4462 4463 range_tree_verify_not_present(msp->ms_freeing, offset, size); 4464 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 4465 range_tree_verify_not_present(msp->ms_freed, offset, size); 4466 for (int j = 0; j < TXG_DEFER_SIZE; j++) 4467 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 4468 mutex_exit(&msp->ms_lock); 4469 } 4470 4471 void 4472 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 4473 { 4474 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 4475 return; 4476 4477 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4478 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 4479 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 4480 vdev_t *vd = vdev_lookup_top(spa, vdev); 4481 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 4482 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 4483 4484 if (DVA_GET_GANG(&bp->blk_dva[i])) 4485 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4486 4487 ASSERT3P(vd, !=, NULL); 4488 4489 metaslab_check_free_impl(vd, offset, size); 4490 } 4491 spa_config_exit(spa, SCL_VDEV, FTAG); 4492 } 4493