1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2019 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2017, Intel Corporation. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/space_map.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zfeature.h> 38 #include <sys/vdev_indirect_mapping.h> 39 #include <sys/zap.h> 40 #include <sys/btree.h> 41 42 #define WITH_DF_BLOCK_ALLOCATOR 43 44 #define GANG_ALLOCATION(flags) \ 45 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 46 47 /* 48 * Metaslab granularity, in bytes. This is roughly similar to what would be 49 * referred to as the "stripe size" in traditional RAID arrays. In normal 50 * operation, we will try to write this amount of data to a top-level vdev 51 * before moving on to the next one. 52 */ 53 unsigned long metaslab_aliquot = 512 << 10; 54 55 /* 56 * For testing, make some blocks above a certain size be gang blocks. 57 */ 58 unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; 59 60 /* 61 * In pools where the log space map feature is not enabled we touch 62 * multiple metaslabs (and their respective space maps) with each 63 * transaction group. Thus, we benefit from having a small space map 64 * block size since it allows us to issue more I/O operations scattered 65 * around the disk. So a sane default for the space map block size 66 * is 8~16K. 67 */ 68 int zfs_metaslab_sm_blksz_no_log = (1 << 14); 69 70 /* 71 * When the log space map feature is enabled, we accumulate a lot of 72 * changes per metaslab that are flushed once in a while so we benefit 73 * from a bigger block size like 128K for the metaslab space maps. 74 */ 75 int zfs_metaslab_sm_blksz_with_log = (1 << 17); 76 77 /* 78 * The in-core space map representation is more compact than its on-disk form. 79 * The zfs_condense_pct determines how much more compact the in-core 80 * space map representation must be before we compact it on-disk. 81 * Values should be greater than or equal to 100. 82 */ 83 int zfs_condense_pct = 200; 84 85 /* 86 * Condensing a metaslab is not guaranteed to actually reduce the amount of 87 * space used on disk. In particular, a space map uses data in increments of 88 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the 89 * same number of blocks after condensing. 
Since the goal of condensing is to 90 * reduce the number of IOPs required to read the space map, we only want to 91 * condense when we can be sure we will reduce the number of blocks used by the 92 * space map. Unfortunately, we cannot precisely compute whether or not this is 93 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 94 * we apply the following heuristic: do not condense a spacemap unless the 95 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 96 * blocks. 97 */ 98 int zfs_metaslab_condense_block_threshold = 4; 99 100 /* 101 * The zfs_mg_noalloc_threshold defines which metaslab groups should 102 * be eligible for allocation. The value is defined as a percentage of 103 * free space. Metaslab groups that have more free space than 104 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 105 * a metaslab group's free space is less than or equal to the 106 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 107 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 108 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 109 * groups are allowed to accept allocations. Gang blocks are always 110 * eligible to allocate on any metaslab group. The default value of 0 means 111 * no metaslab group will be excluded based on this criterion. 112 */ 113 int zfs_mg_noalloc_threshold = 0; 114 115 /* 116 * Metaslab groups are considered eligible for allocations if their 117 * fragmentation metric (measured as a percentage) is less than or 118 * equal to zfs_mg_fragmentation_threshold. If a metaslab group 119 * exceeds this threshold then it will be skipped unless all metaslab 120 * groups within the metaslab class have also crossed this threshold. 121 * 122 * This tunable was introduced to avoid edge cases where we continue 123 * allocating from very fragmented disks in our pool while other, less 124 * fragmented disks, exists. On the other hand, if all disks in the 125 * pool are uniformly approaching the threshold, the threshold can 126 * be a speed bump in performance, where we keep switching the disks 127 * that we allocate from (e.g. we allocate some segments from disk A 128 * making it bypassing the threshold while freeing segments from disk 129 * B getting its fragmentation below the threshold). 130 * 131 * Empirically, we've seen that our vdev selection for allocations is 132 * good enough that fragmentation increases uniformly across all vdevs 133 * the majority of the time. Thus we set the threshold percentage high 134 * enough to avoid hitting the speed bump on pools that are being pushed 135 * to the edge. 136 */ 137 int zfs_mg_fragmentation_threshold = 95; 138 139 /* 140 * Allow metaslabs to keep their active state as long as their fragmentation 141 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 142 * active metaslab that exceeds this threshold will no longer keep its active 143 * status allowing better metaslabs to be selected. 144 */ 145 int zfs_metaslab_fragmentation_threshold = 70; 146 147 /* 148 * When set will load all metaslabs when pool is first opened. 149 */ 150 int metaslab_debug_load = 0; 151 152 /* 153 * When set will prevent metaslabs from being unloaded. 154 */ 155 int metaslab_debug_unload = 0; 156 157 /* 158 * Minimum size which forces the dynamic allocator to change 159 * it's allocation strategy. 
Once the space map cannot satisfy 160 * an allocation of this size then it switches to using more 161 * aggressive strategy (i.e search by size rather than offset). 162 */ 163 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 164 165 /* 166 * The minimum free space, in percent, which must be available 167 * in a space map to continue allocations in a first-fit fashion. 168 * Once the space map's free space drops below this level we dynamically 169 * switch to using best-fit allocations. 170 */ 171 int metaslab_df_free_pct = 4; 172 173 /* 174 * Maximum distance to search forward from the last offset. Without this 175 * limit, fragmented pools can see >100,000 iterations and 176 * metaslab_block_picker() becomes the performance limiting factor on 177 * high-performance storage. 178 * 179 * With the default setting of 16MB, we typically see less than 500 180 * iterations, even with very fragmented, ashift=9 pools. The maximum number 181 * of iterations possible is: 182 * metaslab_df_max_search / (2 * (1<<ashift)) 183 * With the default setting of 16MB this is 16*1024 (with ashift=9) or 184 * 2048 (with ashift=12). 185 */ 186 int metaslab_df_max_search = 16 * 1024 * 1024; 187 188 /* 189 * Forces the metaslab_block_picker function to search for at least this many 190 * segments forwards until giving up on finding a segment that the allocation 191 * will fit into. 192 */ 193 uint32_t metaslab_min_search_count = 100; 194 195 /* 196 * If we are not searching forward (due to metaslab_df_max_search, 197 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable 198 * controls what segment is used. If it is set, we will use the largest free 199 * segment. If it is not set, we will use a segment of exactly the requested 200 * size (or larger). 201 */ 202 int metaslab_df_use_largest_segment = B_FALSE; 203 204 /* 205 * Percentage of all cpus that can be used by the metaslab taskq. 206 */ 207 int metaslab_load_pct = 50; 208 209 /* 210 * These tunables control how long a metaslab will remain loaded after the 211 * last allocation from it. A metaslab can't be unloaded until at least 212 * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds 213 * have elapsed. However, zfs_metaslab_mem_limit may cause it to be 214 * unloaded sooner. These settings are intended to be generous -- to keep 215 * metaslabs loaded for a long time, reducing the rate of metaslab loading. 216 */ 217 int metaslab_unload_delay = 32; 218 int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ 219 220 /* 221 * Max number of metaslabs per group to preload. 222 */ 223 int metaslab_preload_limit = 10; 224 225 /* 226 * Enable/disable preloading of metaslab. 227 */ 228 int metaslab_preload_enabled = B_TRUE; 229 230 /* 231 * Enable/disable fragmentation weighting on metaslabs. 232 */ 233 int metaslab_fragmentation_factor_enabled = B_TRUE; 234 235 /* 236 * Enable/disable lba weighting (i.e. outer tracks are given preference). 237 */ 238 int metaslab_lba_weighting_enabled = B_TRUE; 239 240 /* 241 * Enable/disable metaslab group biasing. 242 */ 243 int metaslab_bias_enabled = B_TRUE; 244 245 /* 246 * Enable/disable remapping of indirect DVAs to their concrete vdevs. 247 */ 248 boolean_t zfs_remap_blkptr_enable = B_TRUE; 249 250 /* 251 * Enable/disable segment-based metaslab selection. 
252 */ 253 int zfs_metaslab_segment_weight_enabled = B_TRUE; 254 255 /* 256 * When using segment-based metaslab selection, we will continue 257 * allocating from the active metaslab until we have exhausted 258 * zfs_metaslab_switch_threshold of its buckets. 259 */ 260 int zfs_metaslab_switch_threshold = 2; 261 262 /* 263 * Internal switch to enable/disable the metaslab allocation tracing 264 * facility. 265 */ 266 #ifdef _METASLAB_TRACING 267 boolean_t metaslab_trace_enabled = B_TRUE; 268 #endif 269 270 /* 271 * Maximum entries that the metaslab allocation tracing facility will keep 272 * in a given list when running in non-debug mode. We limit the number 273 * of entries in non-debug mode to prevent us from using up too much memory. 274 * The limit should be sufficiently large that we don't expect any allocation 275 * to every exceed this value. In debug mode, the system will panic if this 276 * limit is ever reached allowing for further investigation. 277 */ 278 #ifdef _METASLAB_TRACING 279 uint64_t metaslab_trace_max_entries = 5000; 280 #endif 281 282 /* 283 * Maximum number of metaslabs per group that can be disabled 284 * simultaneously. 285 */ 286 int max_disabled_ms = 3; 287 288 /* 289 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. 290 * To avoid 64-bit overflow, don't set above UINT32_MAX. 291 */ 292 unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ 293 294 /* 295 * Maximum percentage of memory to use on storing loaded metaslabs. If loading 296 * a metaslab would take it over this percentage, the oldest selected metaslab 297 * is automatically unloaded. 298 */ 299 int zfs_metaslab_mem_limit = 75; 300 301 /* 302 * Force the per-metaslab range trees to use 64-bit integers to store 303 * segments. Used for debugging purposes. 304 */ 305 boolean_t zfs_metaslab_force_large_segs = B_FALSE; 306 307 /* 308 * By default we only store segments over a certain size in the size-sorted 309 * metaslab trees (ms_allocatable_by_size and 310 * ms_unflushed_frees_by_size). This dramatically reduces memory usage and 311 * improves load and unload times at the cost of causing us to use slightly 312 * larger segments than we would otherwise in some cases. 
313 */ 314 uint32_t metaslab_by_size_min_shift = 14; 315 316 static uint64_t metaslab_weight(metaslab_t *, boolean_t); 317 static void metaslab_set_fragmentation(metaslab_t *, boolean_t); 318 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 319 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 320 321 static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 322 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); 323 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); 324 static unsigned int metaslab_idx_func(multilist_t *, void *); 325 static void metaslab_evict(metaslab_t *, uint64_t); 326 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); 327 #ifdef _METASLAB_TRACING 328 kmem_cache_t *metaslab_alloc_trace_cache; 329 330 typedef struct metaslab_stats { 331 kstat_named_t metaslabstat_trace_over_limit; 332 kstat_named_t metaslabstat_df_find_under_floor; 333 kstat_named_t metaslabstat_reload_tree; 334 } metaslab_stats_t; 335 336 static metaslab_stats_t metaslab_stats = { 337 { "trace_over_limit", KSTAT_DATA_UINT64 }, 338 { "df_find_under_floor", KSTAT_DATA_UINT64 }, 339 { "reload_tree", KSTAT_DATA_UINT64 }, 340 }; 341 342 #define METASLABSTAT_BUMP(stat) \ 343 atomic_inc_64(&metaslab_stats.stat.value.ui64); 344 345 346 kstat_t *metaslab_ksp; 347 348 void 349 metaslab_stat_init(void) 350 { 351 ASSERT(metaslab_alloc_trace_cache == NULL); 352 metaslab_alloc_trace_cache = kmem_cache_create( 353 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 354 0, NULL, NULL, NULL, NULL, NULL, 0); 355 metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", 356 "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / 357 sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 358 if (metaslab_ksp != NULL) { 359 metaslab_ksp->ks_data = &metaslab_stats; 360 kstat_install(metaslab_ksp); 361 } 362 } 363 364 void 365 metaslab_stat_fini(void) 366 { 367 if (metaslab_ksp != NULL) { 368 kstat_delete(metaslab_ksp); 369 metaslab_ksp = NULL; 370 } 371 372 kmem_cache_destroy(metaslab_alloc_trace_cache); 373 metaslab_alloc_trace_cache = NULL; 374 } 375 #else 376 377 void 378 metaslab_stat_init(void) 379 { 380 } 381 382 void 383 metaslab_stat_fini(void) 384 { 385 } 386 #endif 387 388 /* 389 * ========================================================================== 390 * Metaslab classes 391 * ========================================================================== 392 */ 393 metaslab_class_t * 394 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 395 { 396 metaslab_class_t *mc; 397 398 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 399 400 mc->mc_spa = spa; 401 mc->mc_rotor = NULL; 402 mc->mc_ops = ops; 403 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 404 mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), 405 offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); 406 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * 407 sizeof (zfs_refcount_t), KM_SLEEP); 408 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * 409 sizeof (uint64_t), KM_SLEEP); 410 for (int i = 0; i < spa->spa_alloc_count; i++) 411 zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); 412 413 return (mc); 414 } 415 416 void 417 metaslab_class_destroy(metaslab_class_t *mc) 418 { 419 ASSERT(mc->mc_rotor == NULL); 420 ASSERT(mc->mc_alloc == 0); 421 ASSERT(mc->mc_deferred == 0); 422 ASSERT(mc->mc_space == 0); 423 ASSERT(mc->mc_dspace == 0); 424 425 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) 426 
zfs_refcount_destroy(&mc->mc_alloc_slots[i]); 427 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * 428 sizeof (zfs_refcount_t)); 429 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * 430 sizeof (uint64_t)); 431 mutex_destroy(&mc->mc_lock); 432 multilist_destroy(mc->mc_metaslab_txg_list); 433 kmem_free(mc, sizeof (metaslab_class_t)); 434 } 435 436 int 437 metaslab_class_validate(metaslab_class_t *mc) 438 { 439 metaslab_group_t *mg; 440 vdev_t *vd; 441 442 /* 443 * Must hold one of the spa_config locks. 444 */ 445 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 446 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 447 448 if ((mg = mc->mc_rotor) == NULL) 449 return (0); 450 451 do { 452 vd = mg->mg_vd; 453 ASSERT(vd->vdev_mg != NULL); 454 ASSERT3P(vd->vdev_top, ==, vd); 455 ASSERT3P(mg->mg_class, ==, mc); 456 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 457 } while ((mg = mg->mg_next) != mc->mc_rotor); 458 459 return (0); 460 } 461 462 static void 463 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 464 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 465 { 466 atomic_add_64(&mc->mc_alloc, alloc_delta); 467 atomic_add_64(&mc->mc_deferred, defer_delta); 468 atomic_add_64(&mc->mc_space, space_delta); 469 atomic_add_64(&mc->mc_dspace, dspace_delta); 470 } 471 472 uint64_t 473 metaslab_class_get_alloc(metaslab_class_t *mc) 474 { 475 return (mc->mc_alloc); 476 } 477 478 uint64_t 479 metaslab_class_get_deferred(metaslab_class_t *mc) 480 { 481 return (mc->mc_deferred); 482 } 483 484 uint64_t 485 metaslab_class_get_space(metaslab_class_t *mc) 486 { 487 return (mc->mc_space); 488 } 489 490 uint64_t 491 metaslab_class_get_dspace(metaslab_class_t *mc) 492 { 493 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 494 } 495 496 void 497 metaslab_class_histogram_verify(metaslab_class_t *mc) 498 { 499 spa_t *spa = mc->mc_spa; 500 vdev_t *rvd = spa->spa_root_vdev; 501 uint64_t *mc_hist; 502 int i; 503 504 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 505 return; 506 507 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 508 KM_SLEEP); 509 510 for (int c = 0; c < rvd->vdev_children; c++) { 511 vdev_t *tvd = rvd->vdev_child[c]; 512 metaslab_group_t *mg = tvd->vdev_mg; 513 514 /* 515 * Skip any holes, uninitialized top-levels, or 516 * vdevs that are not in this metalab class. 517 */ 518 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 519 mg->mg_class != mc) { 520 continue; 521 } 522 523 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 524 mc_hist[i] += mg->mg_histogram[i]; 525 } 526 527 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 528 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 529 530 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 531 } 532 533 /* 534 * Calculate the metaslab class's fragmentation metric. The metric 535 * is weighted based on the space contribution of each metaslab group. 536 * The return value will be a number between 0 and 100 (inclusive), or 537 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 538 * zfs_frag_table for more information about the metric. 
539 */ 540 uint64_t 541 metaslab_class_fragmentation(metaslab_class_t *mc) 542 { 543 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 544 uint64_t fragmentation = 0; 545 546 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 547 548 for (int c = 0; c < rvd->vdev_children; c++) { 549 vdev_t *tvd = rvd->vdev_child[c]; 550 metaslab_group_t *mg = tvd->vdev_mg; 551 552 /* 553 * Skip any holes, uninitialized top-levels, 554 * or vdevs that are not in this metalab class. 555 */ 556 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 557 mg->mg_class != mc) { 558 continue; 559 } 560 561 /* 562 * If a metaslab group does not contain a fragmentation 563 * metric then just bail out. 564 */ 565 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 566 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 567 return (ZFS_FRAG_INVALID); 568 } 569 570 /* 571 * Determine how much this metaslab_group is contributing 572 * to the overall pool fragmentation metric. 573 */ 574 fragmentation += mg->mg_fragmentation * 575 metaslab_group_get_space(mg); 576 } 577 fragmentation /= metaslab_class_get_space(mc); 578 579 ASSERT3U(fragmentation, <=, 100); 580 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 581 return (fragmentation); 582 } 583 584 /* 585 * Calculate the amount of expandable space that is available in 586 * this metaslab class. If a device is expanded then its expandable 587 * space will be the amount of allocatable space that is currently not 588 * part of this metaslab class. 589 */ 590 uint64_t 591 metaslab_class_expandable_space(metaslab_class_t *mc) 592 { 593 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 594 uint64_t space = 0; 595 596 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 597 for (int c = 0; c < rvd->vdev_children; c++) { 598 vdev_t *tvd = rvd->vdev_child[c]; 599 metaslab_group_t *mg = tvd->vdev_mg; 600 601 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 602 mg->mg_class != mc) { 603 continue; 604 } 605 606 /* 607 * Calculate if we have enough space to add additional 608 * metaslabs. We report the expandable space in terms 609 * of the metaslab size since that's the unit of expansion. 610 */ 611 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, 612 1ULL << tvd->vdev_ms_shift); 613 } 614 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 615 return (space); 616 } 617 618 void 619 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) 620 { 621 multilist_t *ml = mc->mc_metaslab_txg_list; 622 for (int i = 0; i < multilist_get_num_sublists(ml); i++) { 623 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 624 metaslab_t *msp = multilist_sublist_head(mls); 625 multilist_sublist_unlock(mls); 626 while (msp != NULL) { 627 mutex_enter(&msp->ms_lock); 628 629 /* 630 * If the metaslab has been removed from the list 631 * (which could happen if we were at the memory limit 632 * and it was evicted during this loop), then we can't 633 * proceed and we should restart the sublist. 634 */ 635 if (!multilist_link_active(&msp->ms_class_txg_node)) { 636 mutex_exit(&msp->ms_lock); 637 i--; 638 break; 639 } 640 mls = multilist_sublist_lock(ml, i); 641 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 642 multilist_sublist_unlock(mls); 643 if (txg > 644 msp->ms_selected_txg + metaslab_unload_delay && 645 gethrtime() > msp->ms_selected_time + 646 (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { 647 metaslab_evict(msp, txg); 648 } else { 649 /* 650 * Once we've hit a metaslab selected too 651 * recently to evict, we're done evicting for 652 * now. 
653 */ 654 mutex_exit(&msp->ms_lock); 655 break; 656 } 657 mutex_exit(&msp->ms_lock); 658 msp = next_msp; 659 } 660 } 661 } 662 663 static int 664 metaslab_compare(const void *x1, const void *x2) 665 { 666 const metaslab_t *m1 = (const metaslab_t *)x1; 667 const metaslab_t *m2 = (const metaslab_t *)x2; 668 669 int sort1 = 0; 670 int sort2 = 0; 671 if (m1->ms_allocator != -1 && m1->ms_primary) 672 sort1 = 1; 673 else if (m1->ms_allocator != -1 && !m1->ms_primary) 674 sort1 = 2; 675 if (m2->ms_allocator != -1 && m2->ms_primary) 676 sort2 = 1; 677 else if (m2->ms_allocator != -1 && !m2->ms_primary) 678 sort2 = 2; 679 680 /* 681 * Sort inactive metaslabs first, then primaries, then secondaries. When 682 * selecting a metaslab to allocate from, an allocator first tries its 683 * primary, then secondary active metaslab. If it doesn't have active 684 * metaslabs, or can't allocate from them, it searches for an inactive 685 * metaslab to activate. If it can't find a suitable one, it will steal 686 * a primary or secondary metaslab from another allocator. 687 */ 688 if (sort1 < sort2) 689 return (-1); 690 if (sort1 > sort2) 691 return (1); 692 693 int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); 694 if (likely(cmp)) 695 return (cmp); 696 697 IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); 698 699 return (TREE_CMP(m1->ms_start, m2->ms_start)); 700 } 701 702 /* 703 * ========================================================================== 704 * Metaslab groups 705 * ========================================================================== 706 */ 707 /* 708 * Update the allocatable flag and the metaslab group's capacity. 709 * The allocatable flag is set to true if the capacity is below 710 * the zfs_mg_noalloc_threshold or has a fragmentation value that is 711 * greater than zfs_mg_fragmentation_threshold. If a metaslab group 712 * transitions from allocatable to non-allocatable or vice versa then the 713 * metaslab group's class is updated to reflect the transition. 714 */ 715 static void 716 metaslab_group_alloc_update(metaslab_group_t *mg) 717 { 718 vdev_t *vd = mg->mg_vd; 719 metaslab_class_t *mc = mg->mg_class; 720 vdev_stat_t *vs = &vd->vdev_stat; 721 boolean_t was_allocatable; 722 boolean_t was_initialized; 723 724 ASSERT(vd == vd->vdev_top); 725 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, 726 SCL_ALLOC); 727 728 mutex_enter(&mg->mg_lock); 729 was_allocatable = mg->mg_allocatable; 730 was_initialized = mg->mg_initialized; 731 732 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 733 (vs->vs_space + 1); 734 735 mutex_enter(&mc->mc_lock); 736 737 /* 738 * If the metaslab group was just added then it won't 739 * have any space until we finish syncing out this txg. 740 * At that point we will consider it initialized and available 741 * for allocations. We also don't consider non-activated 742 * metaslab groups (e.g. vdevs that are in the middle of being removed) 743 * to be initialized, because they can't be used for allocation. 744 */ 745 mg->mg_initialized = metaslab_group_initialized(mg); 746 if (!was_initialized && mg->mg_initialized) { 747 mc->mc_groups++; 748 } else if (was_initialized && !mg->mg_initialized) { 749 ASSERT3U(mc->mc_groups, >, 0); 750 mc->mc_groups--; 751 } 752 if (mg->mg_initialized) 753 mg->mg_no_free_space = B_FALSE; 754 755 /* 756 * A metaslab group is considered allocatable if it has plenty 757 * of free space or is not heavily fragmented. 
We only take 758 * fragmentation into account if the metaslab group has a valid 759 * fragmentation metric (i.e. a value between 0 and 100). 760 */ 761 mg->mg_allocatable = (mg->mg_activation_count > 0 && 762 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 763 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 764 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 765 766 /* 767 * The mc_alloc_groups maintains a count of the number of 768 * groups in this metaslab class that are still above the 769 * zfs_mg_noalloc_threshold. This is used by the allocating 770 * threads to determine if they should avoid allocations to 771 * a given group. The allocator will avoid allocations to a group 772 * if that group has reached or is below the zfs_mg_noalloc_threshold 773 * and there are still other groups that are above the threshold. 774 * When a group transitions from allocatable to non-allocatable or 775 * vice versa we update the metaslab class to reflect that change. 776 * When the mc_alloc_groups value drops to 0 that means that all 777 * groups have reached the zfs_mg_noalloc_threshold making all groups 778 * eligible for allocations. This effectively means that all devices 779 * are balanced again. 780 */ 781 if (was_allocatable && !mg->mg_allocatable) 782 mc->mc_alloc_groups--; 783 else if (!was_allocatable && mg->mg_allocatable) 784 mc->mc_alloc_groups++; 785 mutex_exit(&mc->mc_lock); 786 787 mutex_exit(&mg->mg_lock); 788 } 789 790 int 791 metaslab_sort_by_flushed(const void *va, const void *vb) 792 { 793 const metaslab_t *a = va; 794 const metaslab_t *b = vb; 795 796 int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); 797 if (likely(cmp)) 798 return (cmp); 799 800 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; 801 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; 802 cmp = TREE_CMP(a_vdev_id, b_vdev_id); 803 if (cmp) 804 return (cmp); 805 806 return (TREE_CMP(a->ms_id, b->ms_id)); 807 } 808 809 metaslab_group_t * 810 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 811 { 812 metaslab_group_t *mg; 813 814 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 815 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 816 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); 817 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); 818 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 819 sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); 820 mg->mg_vd = vd; 821 mg->mg_class = mc; 822 mg->mg_activation_count = 0; 823 mg->mg_initialized = B_FALSE; 824 mg->mg_no_free_space = B_TRUE; 825 mg->mg_allocators = allocators; 826 827 mg->mg_allocator = kmem_zalloc(allocators * 828 sizeof (metaslab_group_allocator_t), KM_SLEEP); 829 for (int i = 0; i < allocators; i++) { 830 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; 831 zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); 832 } 833 834 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 835 maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); 836 837 return (mg); 838 } 839 840 void 841 metaslab_group_destroy(metaslab_group_t *mg) 842 { 843 ASSERT(mg->mg_prev == NULL); 844 ASSERT(mg->mg_next == NULL); 845 /* 846 * We may have gone below zero with the activation count 847 * either because we never activated in the first place or 848 * because we're done, and possibly removing the vdev. 
849 */ 850 ASSERT(mg->mg_activation_count <= 0); 851 852 taskq_destroy(mg->mg_taskq); 853 avl_destroy(&mg->mg_metaslab_tree); 854 mutex_destroy(&mg->mg_lock); 855 mutex_destroy(&mg->mg_ms_disabled_lock); 856 cv_destroy(&mg->mg_ms_disabled_cv); 857 858 for (int i = 0; i < mg->mg_allocators; i++) { 859 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; 860 zfs_refcount_destroy(&mga->mga_alloc_queue_depth); 861 } 862 kmem_free(mg->mg_allocator, mg->mg_allocators * 863 sizeof (metaslab_group_allocator_t)); 864 865 kmem_free(mg, sizeof (metaslab_group_t)); 866 } 867 868 void 869 metaslab_group_activate(metaslab_group_t *mg) 870 { 871 metaslab_class_t *mc = mg->mg_class; 872 metaslab_group_t *mgprev, *mgnext; 873 874 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 875 876 ASSERT(mc->mc_rotor != mg); 877 ASSERT(mg->mg_prev == NULL); 878 ASSERT(mg->mg_next == NULL); 879 ASSERT(mg->mg_activation_count <= 0); 880 881 if (++mg->mg_activation_count <= 0) 882 return; 883 884 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 885 metaslab_group_alloc_update(mg); 886 887 if ((mgprev = mc->mc_rotor) == NULL) { 888 mg->mg_prev = mg; 889 mg->mg_next = mg; 890 } else { 891 mgnext = mgprev->mg_next; 892 mg->mg_prev = mgprev; 893 mg->mg_next = mgnext; 894 mgprev->mg_next = mg; 895 mgnext->mg_prev = mg; 896 } 897 mc->mc_rotor = mg; 898 } 899 900 /* 901 * Passivate a metaslab group and remove it from the allocation rotor. 902 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 903 * a metaslab group. This function will momentarily drop spa_config_locks 904 * that are lower than the SCL_ALLOC lock (see comment below). 905 */ 906 void 907 metaslab_group_passivate(metaslab_group_t *mg) 908 { 909 metaslab_class_t *mc = mg->mg_class; 910 spa_t *spa = mc->mc_spa; 911 metaslab_group_t *mgprev, *mgnext; 912 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 913 914 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 915 (SCL_ALLOC | SCL_ZIO)); 916 917 if (--mg->mg_activation_count != 0) { 918 ASSERT(mc->mc_rotor != mg); 919 ASSERT(mg->mg_prev == NULL); 920 ASSERT(mg->mg_next == NULL); 921 ASSERT(mg->mg_activation_count < 0); 922 return; 923 } 924 925 /* 926 * The spa_config_lock is an array of rwlocks, ordered as 927 * follows (from highest to lowest): 928 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 929 * SCL_ZIO > SCL_FREE > SCL_VDEV 930 * (For more information about the spa_config_lock see spa_misc.c) 931 * The higher the lock, the broader its coverage. When we passivate 932 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 933 * config locks. However, the metaslab group's taskq might be trying 934 * to preload metaslabs so we must drop the SCL_ZIO lock and any 935 * lower locks to allow the I/O to complete. At a minimum, 936 * we continue to hold the SCL_ALLOC lock, which prevents any future 937 * allocations from taking place and any changes to the vdev tree. 
938 */ 939 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 940 taskq_wait_outstanding(mg->mg_taskq, 0); 941 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 942 metaslab_group_alloc_update(mg); 943 for (int i = 0; i < mg->mg_allocators; i++) { 944 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; 945 metaslab_t *msp = mga->mga_primary; 946 if (msp != NULL) { 947 mutex_enter(&msp->ms_lock); 948 metaslab_passivate(msp, 949 metaslab_weight_from_range_tree(msp)); 950 mutex_exit(&msp->ms_lock); 951 } 952 msp = mga->mga_secondary; 953 if (msp != NULL) { 954 mutex_enter(&msp->ms_lock); 955 metaslab_passivate(msp, 956 metaslab_weight_from_range_tree(msp)); 957 mutex_exit(&msp->ms_lock); 958 } 959 } 960 961 mgprev = mg->mg_prev; 962 mgnext = mg->mg_next; 963 964 if (mg == mgnext) { 965 mc->mc_rotor = NULL; 966 } else { 967 mc->mc_rotor = mgnext; 968 mgprev->mg_next = mgnext; 969 mgnext->mg_prev = mgprev; 970 } 971 972 mg->mg_prev = NULL; 973 mg->mg_next = NULL; 974 } 975 976 boolean_t 977 metaslab_group_initialized(metaslab_group_t *mg) 978 { 979 vdev_t *vd = mg->mg_vd; 980 vdev_stat_t *vs = &vd->vdev_stat; 981 982 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 983 } 984 985 uint64_t 986 metaslab_group_get_space(metaslab_group_t *mg) 987 { 988 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 989 } 990 991 void 992 metaslab_group_histogram_verify(metaslab_group_t *mg) 993 { 994 uint64_t *mg_hist; 995 vdev_t *vd = mg->mg_vd; 996 uint64_t ashift = vd->vdev_ashift; 997 int i; 998 999 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 1000 return; 1001 1002 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 1003 KM_SLEEP); 1004 1005 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 1006 SPACE_MAP_HISTOGRAM_SIZE + ashift); 1007 1008 for (int m = 0; m < vd->vdev_ms_count; m++) { 1009 metaslab_t *msp = vd->vdev_ms[m]; 1010 1011 /* skip if not active or not a member */ 1012 if (msp->ms_sm == NULL || msp->ms_group != mg) 1013 continue; 1014 1015 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 1016 mg_hist[i + ashift] += 1017 msp->ms_sm->sm_phys->smp_histogram[i]; 1018 } 1019 1020 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 1021 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 1022 1023 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 1024 } 1025 1026 static void 1027 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 1028 { 1029 metaslab_class_t *mc = mg->mg_class; 1030 uint64_t ashift = mg->mg_vd->vdev_ashift; 1031 1032 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1033 if (msp->ms_sm == NULL) 1034 return; 1035 1036 mutex_enter(&mg->mg_lock); 1037 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1038 mg->mg_histogram[i + ashift] += 1039 msp->ms_sm->sm_phys->smp_histogram[i]; 1040 mc->mc_histogram[i + ashift] += 1041 msp->ms_sm->sm_phys->smp_histogram[i]; 1042 } 1043 mutex_exit(&mg->mg_lock); 1044 } 1045 1046 void 1047 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 1048 { 1049 metaslab_class_t *mc = mg->mg_class; 1050 uint64_t ashift = mg->mg_vd->vdev_ashift; 1051 1052 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1053 if (msp->ms_sm == NULL) 1054 return; 1055 1056 mutex_enter(&mg->mg_lock); 1057 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1058 ASSERT3U(mg->mg_histogram[i + ashift], >=, 1059 msp->ms_sm->sm_phys->smp_histogram[i]); 1060 ASSERT3U(mc->mc_histogram[i + ashift], >=, 1061 msp->ms_sm->sm_phys->smp_histogram[i]); 1062 1063 mg->mg_histogram[i + ashift] -= 1064 
msp->ms_sm->sm_phys->smp_histogram[i]; 1065 mc->mc_histogram[i + ashift] -= 1066 msp->ms_sm->sm_phys->smp_histogram[i]; 1067 } 1068 mutex_exit(&mg->mg_lock); 1069 } 1070 1071 static void 1072 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 1073 { 1074 ASSERT(msp->ms_group == NULL); 1075 mutex_enter(&mg->mg_lock); 1076 msp->ms_group = mg; 1077 msp->ms_weight = 0; 1078 avl_add(&mg->mg_metaslab_tree, msp); 1079 mutex_exit(&mg->mg_lock); 1080 1081 mutex_enter(&msp->ms_lock); 1082 metaslab_group_histogram_add(mg, msp); 1083 mutex_exit(&msp->ms_lock); 1084 } 1085 1086 static void 1087 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 1088 { 1089 mutex_enter(&msp->ms_lock); 1090 metaslab_group_histogram_remove(mg, msp); 1091 mutex_exit(&msp->ms_lock); 1092 1093 mutex_enter(&mg->mg_lock); 1094 ASSERT(msp->ms_group == mg); 1095 avl_remove(&mg->mg_metaslab_tree, msp); 1096 1097 metaslab_class_t *mc = msp->ms_group->mg_class; 1098 multilist_sublist_t *mls = 1099 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 1100 if (multilist_link_active(&msp->ms_class_txg_node)) 1101 multilist_sublist_remove(mls, msp); 1102 multilist_sublist_unlock(mls); 1103 1104 msp->ms_group = NULL; 1105 mutex_exit(&mg->mg_lock); 1106 } 1107 1108 static void 1109 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1110 { 1111 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1112 ASSERT(MUTEX_HELD(&mg->mg_lock)); 1113 ASSERT(msp->ms_group == mg); 1114 1115 avl_remove(&mg->mg_metaslab_tree, msp); 1116 msp->ms_weight = weight; 1117 avl_add(&mg->mg_metaslab_tree, msp); 1118 1119 } 1120 1121 static void 1122 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1123 { 1124 /* 1125 * Although in principle the weight can be any value, in 1126 * practice we do not use values in the range [1, 511]. 1127 */ 1128 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 1129 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1130 1131 mutex_enter(&mg->mg_lock); 1132 metaslab_group_sort_impl(mg, msp, weight); 1133 mutex_exit(&mg->mg_lock); 1134 } 1135 1136 /* 1137 * Calculate the fragmentation for a given metaslab group. We can use 1138 * a simple average here since all metaslabs within the group must have 1139 * the same size. The return value will be a value between 0 and 100 1140 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 1141 * group have a fragmentation metric. 1142 */ 1143 uint64_t 1144 metaslab_group_fragmentation(metaslab_group_t *mg) 1145 { 1146 vdev_t *vd = mg->mg_vd; 1147 uint64_t fragmentation = 0; 1148 uint64_t valid_ms = 0; 1149 1150 for (int m = 0; m < vd->vdev_ms_count; m++) { 1151 metaslab_t *msp = vd->vdev_ms[m]; 1152 1153 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 1154 continue; 1155 if (msp->ms_group != mg) 1156 continue; 1157 1158 valid_ms++; 1159 fragmentation += msp->ms_fragmentation; 1160 } 1161 1162 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) 1163 return (ZFS_FRAG_INVALID); 1164 1165 fragmentation /= valid_ms; 1166 ASSERT3U(fragmentation, <=, 100); 1167 return (fragmentation); 1168 } 1169 1170 /* 1171 * Determine if a given metaslab group should skip allocations. A metaslab 1172 * group should avoid allocations if its free capacity is less than the 1173 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 1174 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1175 * that can still handle allocations. 
If the allocation throttle is enabled 1176 * then we skip allocations to devices that have reached their maximum 1177 * allocation queue depth unless the selected metaslab group is the only 1178 * eligible group remaining. 1179 */ 1180 static boolean_t 1181 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 1182 uint64_t psize, int allocator, int d) 1183 { 1184 spa_t *spa = mg->mg_vd->vdev_spa; 1185 metaslab_class_t *mc = mg->mg_class; 1186 1187 /* 1188 * We can only consider skipping this metaslab group if it's 1189 * in the normal metaslab class and there are other metaslab 1190 * groups to select from. Otherwise, we always consider it eligible 1191 * for allocations. 1192 */ 1193 if ((mc != spa_normal_class(spa) && 1194 mc != spa_special_class(spa) && 1195 mc != spa_dedup_class(spa)) || 1196 mc->mc_groups <= 1) 1197 return (B_TRUE); 1198 1199 /* 1200 * If the metaslab group's mg_allocatable flag is set (see comments 1201 * in metaslab_group_alloc_update() for more information) and 1202 * the allocation throttle is disabled then allow allocations to this 1203 * device. However, if the allocation throttle is enabled then 1204 * check if we have reached our allocation limit (mg_alloc_queue_depth) 1205 * to determine if we should allow allocations to this metaslab group. 1206 * If all metaslab groups are no longer considered allocatable 1207 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 1208 * gang block size then we allow allocations on this metaslab group 1209 * regardless of the mg_allocatable or throttle settings. 1210 */ 1211 if (mg->mg_allocatable) { 1212 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 1213 int64_t qdepth; 1214 uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; 1215 1216 if (!mc->mc_alloc_throttle_enabled) 1217 return (B_TRUE); 1218 1219 /* 1220 * If this metaslab group does not have any free space, then 1221 * there is no point in looking further. 1222 */ 1223 if (mg->mg_no_free_space) 1224 return (B_FALSE); 1225 1226 /* 1227 * Relax allocation throttling for ditto blocks. Due to 1228 * random imbalances in allocation it tends to push copies 1229 * to one vdev, that looks a bit better at the moment. 1230 */ 1231 qmax = qmax * (4 + d) / 4; 1232 1233 qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); 1234 1235 /* 1236 * If this metaslab group is below its qmax or it's 1237 * the only allocatable metasable group, then attempt 1238 * to allocate from it. 1239 */ 1240 if (qdepth < qmax || mc->mc_alloc_groups == 1) 1241 return (B_TRUE); 1242 ASSERT3U(mc->mc_alloc_groups, >, 1); 1243 1244 /* 1245 * Since this metaslab group is at or over its qmax, we 1246 * need to determine if there are metaslab groups after this 1247 * one that might be able to handle this allocation. This is 1248 * racy since we can't hold the locks for all metaslab 1249 * groups at the same time when we make this check. 1250 */ 1251 for (metaslab_group_t *mgp = mg->mg_next; 1252 mgp != rotor; mgp = mgp->mg_next) { 1253 metaslab_group_allocator_t *mgap = 1254 &mgp->mg_allocator[allocator]; 1255 qmax = mgap->mga_cur_max_alloc_queue_depth; 1256 qmax = qmax * (4 + d) / 4; 1257 qdepth = 1258 zfs_refcount_count(&mgap->mga_alloc_queue_depth); 1259 1260 /* 1261 * If there is another metaslab group that 1262 * might be able to handle the allocation, then 1263 * we return false so that we skip this group. 
1264 */ 1265 if (qdepth < qmax && !mgp->mg_no_free_space) 1266 return (B_FALSE); 1267 } 1268 1269 /* 1270 * We didn't find another group to handle the allocation 1271 * so we can't skip this metaslab group even though 1272 * we are at or over our qmax. 1273 */ 1274 return (B_TRUE); 1275 1276 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 1277 return (B_TRUE); 1278 } 1279 return (B_FALSE); 1280 } 1281 1282 /* 1283 * ========================================================================== 1284 * Range tree callbacks 1285 * ========================================================================== 1286 */ 1287 1288 /* 1289 * Comparison function for the private size-ordered tree using 32-bit 1290 * ranges. Tree is sorted by size, larger sizes at the end of the tree. 1291 */ 1292 static int 1293 metaslab_rangesize32_compare(const void *x1, const void *x2) 1294 { 1295 const range_seg32_t *r1 = x1; 1296 const range_seg32_t *r2 = x2; 1297 1298 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1299 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1300 1301 int cmp = TREE_CMP(rs_size1, rs_size2); 1302 if (likely(cmp)) 1303 return (cmp); 1304 1305 return (TREE_CMP(r1->rs_start, r2->rs_start)); 1306 } 1307 1308 /* 1309 * Comparison function for the private size-ordered tree using 64-bit 1310 * ranges. Tree is sorted by size, larger sizes at the end of the tree. 1311 */ 1312 static int 1313 metaslab_rangesize64_compare(const void *x1, const void *x2) 1314 { 1315 const range_seg64_t *r1 = x1; 1316 const range_seg64_t *r2 = x2; 1317 1318 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1319 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1320 1321 int cmp = TREE_CMP(rs_size1, rs_size2); 1322 if (likely(cmp)) 1323 return (cmp); 1324 1325 return (TREE_CMP(r1->rs_start, r2->rs_start)); 1326 } 1327 typedef struct metaslab_rt_arg { 1328 zfs_btree_t *mra_bt; 1329 uint32_t mra_floor_shift; 1330 } metaslab_rt_arg_t; 1331 1332 struct mssa_arg { 1333 range_tree_t *rt; 1334 metaslab_rt_arg_t *mra; 1335 }; 1336 1337 static void 1338 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) 1339 { 1340 struct mssa_arg *mssap = arg; 1341 range_tree_t *rt = mssap->rt; 1342 metaslab_rt_arg_t *mrap = mssap->mra; 1343 range_seg_max_t seg = {0}; 1344 rs_set_start(&seg, rt, start); 1345 rs_set_end(&seg, rt, start + size); 1346 metaslab_rt_add(rt, &seg, mrap); 1347 } 1348 1349 static void 1350 metaslab_size_tree_full_load(range_tree_t *rt) 1351 { 1352 metaslab_rt_arg_t *mrap = rt->rt_arg; 1353 #ifdef _METASLAB_TRACING 1354 METASLABSTAT_BUMP(metaslabstat_reload_tree); 1355 #endif 1356 ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); 1357 mrap->mra_floor_shift = 0; 1358 struct mssa_arg arg = {0}; 1359 arg.rt = rt; 1360 arg.mra = mrap; 1361 range_tree_walk(rt, metaslab_size_sorted_add, &arg); 1362 } 1363 1364 /* 1365 * Create any block allocator specific components. The current allocators 1366 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
1367 */ 1368 /* ARGSUSED */ 1369 static void 1370 metaslab_rt_create(range_tree_t *rt, void *arg) 1371 { 1372 metaslab_rt_arg_t *mrap = arg; 1373 zfs_btree_t *size_tree = mrap->mra_bt; 1374 1375 size_t size; 1376 int (*compare) (const void *, const void *); 1377 switch (rt->rt_type) { 1378 case RANGE_SEG32: 1379 size = sizeof (range_seg32_t); 1380 compare = metaslab_rangesize32_compare; 1381 break; 1382 case RANGE_SEG64: 1383 size = sizeof (range_seg64_t); 1384 compare = metaslab_rangesize64_compare; 1385 break; 1386 default: 1387 panic("Invalid range seg type %d", rt->rt_type); 1388 } 1389 zfs_btree_create(size_tree, compare, size); 1390 mrap->mra_floor_shift = metaslab_by_size_min_shift; 1391 } 1392 1393 /* ARGSUSED */ 1394 static void 1395 metaslab_rt_destroy(range_tree_t *rt, void *arg) 1396 { 1397 metaslab_rt_arg_t *mrap = arg; 1398 zfs_btree_t *size_tree = mrap->mra_bt; 1399 1400 zfs_btree_destroy(size_tree); 1401 kmem_free(mrap, sizeof (*mrap)); 1402 } 1403 1404 /* ARGSUSED */ 1405 static void 1406 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 1407 { 1408 metaslab_rt_arg_t *mrap = arg; 1409 zfs_btree_t *size_tree = mrap->mra_bt; 1410 1411 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < 1412 (1 << mrap->mra_floor_shift)) 1413 return; 1414 1415 zfs_btree_add(size_tree, rs); 1416 } 1417 1418 /* ARGSUSED */ 1419 static void 1420 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 1421 { 1422 metaslab_rt_arg_t *mrap = arg; 1423 zfs_btree_t *size_tree = mrap->mra_bt; 1424 1425 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << 1426 mrap->mra_floor_shift)) 1427 return; 1428 1429 zfs_btree_remove(size_tree, rs); 1430 } 1431 1432 /* ARGSUSED */ 1433 static void 1434 metaslab_rt_vacate(range_tree_t *rt, void *arg) 1435 { 1436 metaslab_rt_arg_t *mrap = arg; 1437 zfs_btree_t *size_tree = mrap->mra_bt; 1438 zfs_btree_clear(size_tree); 1439 zfs_btree_destroy(size_tree); 1440 1441 metaslab_rt_create(rt, arg); 1442 } 1443 1444 static range_tree_ops_t metaslab_rt_ops = { 1445 .rtop_create = metaslab_rt_create, 1446 .rtop_destroy = metaslab_rt_destroy, 1447 .rtop_add = metaslab_rt_add, 1448 .rtop_remove = metaslab_rt_remove, 1449 .rtop_vacate = metaslab_rt_vacate 1450 }; 1451 1452 /* 1453 * ========================================================================== 1454 * Common allocator routines 1455 * ========================================================================== 1456 */ 1457 1458 /* 1459 * Return the maximum contiguous segment within the metaslab. 1460 */ 1461 uint64_t 1462 metaslab_largest_allocatable(metaslab_t *msp) 1463 { 1464 zfs_btree_t *t = &msp->ms_allocatable_by_size; 1465 range_seg_t *rs; 1466 1467 if (t == NULL) 1468 return (0); 1469 if (zfs_btree_numnodes(t) == 0) 1470 metaslab_size_tree_full_load(msp->ms_allocatable); 1471 1472 rs = zfs_btree_last(t, NULL); 1473 if (rs == NULL) 1474 return (0); 1475 1476 return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, 1477 msp->ms_allocatable)); 1478 } 1479 1480 /* 1481 * Return the maximum contiguous segment within the unflushed frees of this 1482 * metaslab. 
1483 */ 1484 static uint64_t 1485 metaslab_largest_unflushed_free(metaslab_t *msp) 1486 { 1487 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1488 1489 if (msp->ms_unflushed_frees == NULL) 1490 return (0); 1491 1492 if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) 1493 metaslab_size_tree_full_load(msp->ms_unflushed_frees); 1494 range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, 1495 NULL); 1496 if (rs == NULL) 1497 return (0); 1498 1499 /* 1500 * When a range is freed from the metaslab, that range is added to 1501 * both the unflushed frees and the deferred frees. While the block 1502 * will eventually be usable, if the metaslab were loaded the range 1503 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE 1504 * txgs had passed. As a result, when attempting to estimate an upper 1505 * bound for the largest currently-usable free segment in the 1506 * metaslab, we need to not consider any ranges currently in the defer 1507 * trees. This algorithm approximates the largest available chunk in 1508 * the largest range in the unflushed_frees tree by taking the first 1509 * chunk. While this may be a poor estimate, it should only remain so 1510 * briefly and should eventually self-correct as frees are no longer 1511 * deferred. Similar logic applies to the ms_freed tree. See 1512 * metaslab_load() for more details. 1513 * 1514 * There are two primary sources of inaccuracy in this estimate. Both 1515 * are tolerated for performance reasons. The first source is that we 1516 * only check the largest segment for overlaps. Smaller segments may 1517 * have more favorable overlaps with the other trees, resulting in 1518 * larger usable chunks. Second, we only look at the first chunk in 1519 * the largest segment; there may be other usable chunks in the 1520 * largest segment, but we ignore them. 1521 */ 1522 uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); 1523 uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; 1524 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1525 uint64_t start = 0; 1526 uint64_t size = 0; 1527 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, 1528 rsize, &start, &size); 1529 if (found) { 1530 if (rstart == start) 1531 return (0); 1532 rsize = start - rstart; 1533 } 1534 } 1535 1536 uint64_t start = 0; 1537 uint64_t size = 0; 1538 boolean_t found = range_tree_find_in(msp->ms_freed, rstart, 1539 rsize, &start, &size); 1540 if (found) 1541 rsize = start - rstart; 1542 1543 return (rsize); 1544 } 1545 1546 static range_seg_t * 1547 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, 1548 uint64_t size, zfs_btree_index_t *where) 1549 { 1550 range_seg_t *rs; 1551 range_seg_max_t rsearch; 1552 1553 rs_set_start(&rsearch, rt, start); 1554 rs_set_end(&rsearch, rt, start + size); 1555 1556 rs = zfs_btree_find(t, &rsearch, where); 1557 if (rs == NULL) { 1558 rs = zfs_btree_next(t, where, where); 1559 } 1560 1561 return (rs); 1562 } 1563 1564 #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ 1565 defined(WITH_CF_BLOCK_ALLOCATOR) 1566 /* 1567 * This is a helper function that can be used by the allocator to find a 1568 * suitable block to allocate. This will search the specified B-tree looking 1569 * for a block that matches the specified criteria. 
1570 */ 1571 static uint64_t 1572 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, 1573 uint64_t max_search) 1574 { 1575 if (*cursor == 0) 1576 *cursor = rt->rt_start; 1577 zfs_btree_t *bt = &rt->rt_root; 1578 zfs_btree_index_t where; 1579 range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); 1580 uint64_t first_found; 1581 int count_searched = 0; 1582 1583 if (rs != NULL) 1584 first_found = rs_get_start(rs, rt); 1585 1586 while (rs != NULL && (rs_get_start(rs, rt) - first_found <= 1587 max_search || count_searched < metaslab_min_search_count)) { 1588 uint64_t offset = rs_get_start(rs, rt); 1589 if (offset + size <= rs_get_end(rs, rt)) { 1590 *cursor = offset + size; 1591 return (offset); 1592 } 1593 rs = zfs_btree_next(bt, &where, &where); 1594 count_searched++; 1595 } 1596 1597 *cursor = 0; 1598 return (-1ULL); 1599 } 1600 #endif /* WITH_DF/CF_BLOCK_ALLOCATOR */ 1601 1602 #if defined(WITH_DF_BLOCK_ALLOCATOR) 1603 /* 1604 * ========================================================================== 1605 * Dynamic Fit (df) block allocator 1606 * 1607 * Search for a free chunk of at least this size, starting from the last 1608 * offset (for this alignment of block) looking for up to 1609 * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not 1610 * found within 16MB, then return a free chunk of exactly the requested size (or 1611 * larger). 1612 * 1613 * If it seems like searching from the last offset will be unproductive, skip 1614 * that and just return a free chunk of exactly the requested size (or larger). 1615 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This 1616 * mechanism is probably not very useful and may be removed in the future. 1617 * 1618 * The behavior when not searching can be changed to return the largest free 1619 * chunk, instead of a free chunk of exactly the requested size, by setting 1620 * metaslab_df_use_largest_segment. 1621 * ========================================================================== 1622 */ 1623 static uint64_t 1624 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1625 { 1626 /* 1627 * Find the largest power of 2 block size that evenly divides the 1628 * requested size. This is used to try to allocate blocks with similar 1629 * alignment from the same area of the metaslab (i.e. same cursor 1630 * bucket) but it does not guarantee that other allocations sizes 1631 * may exist in the same region. 1632 */ 1633 uint64_t align = size & -size; 1634 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1635 range_tree_t *rt = msp->ms_allocatable; 1636 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1637 uint64_t offset; 1638 1639 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1640 1641 /* 1642 * If we're running low on space, find a segment based on size, 1643 * rather than iterating based on offset. 
1644 */ 1645 if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || 1646 free_pct < metaslab_df_free_pct) { 1647 offset = -1; 1648 } else { 1649 offset = metaslab_block_picker(rt, 1650 cursor, size, metaslab_df_max_search); 1651 } 1652 1653 if (offset == -1) { 1654 range_seg_t *rs; 1655 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) 1656 metaslab_size_tree_full_load(msp->ms_allocatable); 1657 if (metaslab_df_use_largest_segment) { 1658 /* use largest free segment */ 1659 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); 1660 } else { 1661 zfs_btree_index_t where; 1662 /* use segment of this size, or next largest */ 1663 #ifdef _METASLAB_TRACING 1664 metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; 1665 if (size < (1 << mrap->mra_floor_shift)) { 1666 METASLABSTAT_BUMP( 1667 metaslabstat_df_find_under_floor); 1668 } 1669 #endif 1670 rs = metaslab_block_find(&msp->ms_allocatable_by_size, 1671 rt, msp->ms_start, size, &where); 1672 } 1673 if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, 1674 rt)) { 1675 offset = rs_get_start(rs, rt); 1676 *cursor = offset + size; 1677 } 1678 } 1679 1680 return (offset); 1681 } 1682 1683 static metaslab_ops_t metaslab_df_ops = { 1684 metaslab_df_alloc 1685 }; 1686 1687 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1688 #endif /* WITH_DF_BLOCK_ALLOCATOR */ 1689 1690 #if defined(WITH_CF_BLOCK_ALLOCATOR) 1691 /* 1692 * ========================================================================== 1693 * Cursor fit block allocator - 1694 * Select the largest region in the metaslab, set the cursor to the beginning 1695 * of the range and the cursor_end to the end of the range. As allocations 1696 * are made advance the cursor. Continue allocating from the cursor until 1697 * the range is exhausted and then find a new range. 1698 * ========================================================================== 1699 */ 1700 static uint64_t 1701 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1702 { 1703 range_tree_t *rt = msp->ms_allocatable; 1704 zfs_btree_t *t = &msp->ms_allocatable_by_size; 1705 uint64_t *cursor = &msp->ms_lbas[0]; 1706 uint64_t *cursor_end = &msp->ms_lbas[1]; 1707 uint64_t offset = 0; 1708 1709 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1710 1711 ASSERT3U(*cursor_end, >=, *cursor); 1712 1713 if ((*cursor + size) > *cursor_end) { 1714 range_seg_t *rs; 1715 1716 if (zfs_btree_numnodes(t) == 0) 1717 metaslab_size_tree_full_load(msp->ms_allocatable); 1718 rs = zfs_btree_last(t, NULL); 1719 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < 1720 size) 1721 return (-1ULL); 1722 1723 *cursor = rs_get_start(rs, rt); 1724 *cursor_end = rs_get_end(rs, rt); 1725 } 1726 1727 offset = *cursor; 1728 *cursor += size; 1729 1730 return (offset); 1731 } 1732 1733 static metaslab_ops_t metaslab_cf_ops = { 1734 metaslab_cf_alloc 1735 }; 1736 1737 metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops; 1738 #endif /* WITH_CF_BLOCK_ALLOCATOR */ 1739 1740 #if defined(WITH_NDF_BLOCK_ALLOCATOR) 1741 /* 1742 * ========================================================================== 1743 * New dynamic fit allocator - 1744 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1745 * contiguous blocks. If no region is found then just use the largest segment 1746 * that remains. 
1747 * ========================================================================== 1748 */ 1749 1750 /* 1751 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1752 * to request from the allocator. 1753 */ 1754 uint64_t metaslab_ndf_clump_shift = 4; 1755 1756 static uint64_t 1757 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1758 { 1759 zfs_btree_t *t = &msp->ms_allocatable->rt_root; 1760 range_tree_t *rt = msp->ms_allocatable; 1761 zfs_btree_index_t where; 1762 range_seg_t *rs; 1763 range_seg_max_t rsearch; 1764 uint64_t hbit = highbit64(size); 1765 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1766 uint64_t max_size = metaslab_largest_allocatable(msp); 1767 1768 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1769 1770 if (max_size < size) 1771 return (-1ULL); 1772 1773 rs_set_start(&rsearch, rt, *cursor); 1774 rs_set_end(&rsearch, rt, *cursor + size); 1775 1776 rs = zfs_btree_find(t, &rsearch, &where); 1777 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { 1778 t = &msp->ms_allocatable_by_size; 1779 1780 rs_set_start(&rsearch, rt, 0); 1781 rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + 1782 metaslab_ndf_clump_shift))); 1783 1784 rs = zfs_btree_find(t, &rsearch, &where); 1785 if (rs == NULL) 1786 rs = zfs_btree_next(t, &where, &where); 1787 ASSERT(rs != NULL); 1788 } 1789 1790 if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { 1791 *cursor = rs_get_start(rs, rt) + size; 1792 return (rs_get_start(rs, rt)); 1793 } 1794 return (-1ULL); 1795 } 1796 1797 static metaslab_ops_t metaslab_ndf_ops = { 1798 metaslab_ndf_alloc 1799 }; 1800 1801 metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; 1802 #endif /* WITH_NDF_BLOCK_ALLOCATOR */ 1803 1804 1805 /* 1806 * ========================================================================== 1807 * Metaslabs 1808 * ========================================================================== 1809 */ 1810 1811 /* 1812 * Wait for any in-progress metaslab loads to complete. 1813 */ 1814 static void 1815 metaslab_load_wait(metaslab_t *msp) 1816 { 1817 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1818 1819 while (msp->ms_loading) { 1820 ASSERT(!msp->ms_loaded); 1821 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1822 } 1823 } 1824 1825 /* 1826 * Wait for any in-progress flushing to complete. 1827 */ 1828 static void 1829 metaslab_flush_wait(metaslab_t *msp) 1830 { 1831 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1832 1833 while (msp->ms_flushing) 1834 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); 1835 } 1836 1837 static unsigned int 1838 metaslab_idx_func(multilist_t *ml, void *arg) 1839 { 1840 metaslab_t *msp = arg; 1841 return (msp->ms_id % multilist_get_num_sublists(ml)); 1842 } 1843 1844 uint64_t 1845 metaslab_allocated_space(metaslab_t *msp) 1846 { 1847 return (msp->ms_allocated_space); 1848 } 1849 1850 /* 1851 * Verify that the space accounting on disk matches the in-core range_trees. 1852 */ 1853 static void 1854 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 1855 { 1856 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1857 uint64_t allocating = 0; 1858 uint64_t sm_free_space, msp_free_space; 1859 1860 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1861 ASSERT(!msp->ms_condensing); 1862 1863 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1864 return; 1865 1866 /* 1867 * We can only verify the metaslab space when we're called 1868 * from syncing context with a loaded metaslab that has an 1869 * allocated space map. 
Calling this in non-syncing context 1870 * does not provide a consistent view of the metaslab since 1871 * we're performing allocations in the future. 1872 */ 1873 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 1874 !msp->ms_loaded) 1875 return; 1876 1877 /* 1878 * Even though the smp_alloc field can get negative, 1879 * when it comes to a metaslab's space map, that should 1880 * never be the case. 1881 */ 1882 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); 1883 1884 ASSERT3U(space_map_allocated(msp->ms_sm), >=, 1885 range_tree_space(msp->ms_unflushed_frees)); 1886 1887 ASSERT3U(metaslab_allocated_space(msp), ==, 1888 space_map_allocated(msp->ms_sm) + 1889 range_tree_space(msp->ms_unflushed_allocs) - 1890 range_tree_space(msp->ms_unflushed_frees)); 1891 1892 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); 1893 1894 /* 1895 * Account for future allocations since we would have 1896 * already deducted that space from the ms_allocatable. 1897 */ 1898 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 1899 allocating += 1900 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 1901 } 1902 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, 1903 msp->ms_allocating_total); 1904 1905 ASSERT3U(msp->ms_deferspace, ==, 1906 range_tree_space(msp->ms_defer[0]) + 1907 range_tree_space(msp->ms_defer[1])); 1908 1909 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + 1910 msp->ms_deferspace + range_tree_space(msp->ms_freed); 1911 1912 VERIFY3U(sm_free_space, ==, msp_free_space); 1913 } 1914 1915 static void 1916 metaslab_aux_histograms_clear(metaslab_t *msp) 1917 { 1918 /* 1919 * Auxiliary histograms are only cleared when resetting them, 1920 * which can only happen while the metaslab is loaded. 1921 */ 1922 ASSERT(msp->ms_loaded); 1923 1924 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1925 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1926 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); 1927 } 1928 1929 static void 1930 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, 1931 range_tree_t *rt) 1932 { 1933 /* 1934 * This is modeled after space_map_histogram_add(), so refer to that 1935 * function for implementation details. We want this to work like 1936 * the space map histogram, and not the range tree histogram, as we 1937 * are essentially constructing a delta that will be later subtracted 1938 * from the space map histogram. 1939 */ 1940 int idx = 0; 1941 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 1942 ASSERT3U(i, >=, idx + shift); 1943 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); 1944 1945 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { 1946 ASSERT3U(idx + shift, ==, i); 1947 idx++; 1948 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); 1949 } 1950 } 1951 } 1952 1953 /* 1954 * Called at every sync pass that the metaslab gets synced. 1955 * 1956 * The reason is that we want our auxiliary histograms to be updated 1957 * wherever the metaslab's space map histogram is updated. This way 1958 * we stay consistent on which parts of the metaslab space map's 1959 * histogram are currently not available for allocations (e.g because 1960 * they are in the defer, freed, and freeing trees). 1961 */ 1962 static void 1963 metaslab_aux_histograms_update(metaslab_t *msp) 1964 { 1965 space_map_t *sm = msp->ms_sm; 1966 ASSERT(sm != NULL); 1967 1968 /* 1969 * This is similar to the metaslab's space map histogram updates 1970 * that take place in metaslab_sync(). 
The only difference is that 1971 * we only care about segments that haven't made it into the 1972 * ms_allocatable tree yet. 1973 */ 1974 if (msp->ms_loaded) { 1975 metaslab_aux_histograms_clear(msp); 1976 1977 metaslab_aux_histogram_add(msp->ms_synchist, 1978 sm->sm_shift, msp->ms_freed); 1979 1980 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1981 metaslab_aux_histogram_add(msp->ms_deferhist[t], 1982 sm->sm_shift, msp->ms_defer[t]); 1983 } 1984 } 1985 1986 metaslab_aux_histogram_add(msp->ms_synchist, 1987 sm->sm_shift, msp->ms_freeing); 1988 } 1989 1990 /* 1991 * Called every time we are done syncing (writing to) the metaslab, 1992 * i.e. at the end of each sync pass. 1993 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] 1994 */ 1995 static void 1996 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) 1997 { 1998 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1999 space_map_t *sm = msp->ms_sm; 2000 2001 if (sm == NULL) { 2002 /* 2003 * We came here from metaslab_init() when creating/opening a 2004 * pool, looking at a metaslab that hasn't had any allocations 2005 * yet. 2006 */ 2007 return; 2008 } 2009 2010 /* 2011 * This is similar to the actions that we take for the ms_freed 2012 * and ms_defer trees in metaslab_sync_done(). 2013 */ 2014 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; 2015 if (defer_allowed) { 2016 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], 2017 sizeof (msp->ms_synchist)); 2018 } else { 2019 bzero(msp->ms_deferhist[hist_index], 2020 sizeof (msp->ms_deferhist[hist_index])); 2021 } 2022 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 2023 } 2024 2025 /* 2026 * Ensure that the metaslab's weight and fragmentation are consistent 2027 * with the contents of the histogram (either the range tree's histogram 2028 * or the space map's depending whether the metaslab is loaded). 2029 */ 2030 static void 2031 metaslab_verify_weight_and_frag(metaslab_t *msp) 2032 { 2033 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2034 2035 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 2036 return; 2037 2038 /* 2039 * We can end up here from vdev_remove_complete(), in which case we 2040 * cannot do these assertions because we hold spa config locks and 2041 * thus we are not allowed to read from the DMU. 2042 * 2043 * We check if the metaslab group has been removed and if that's 2044 * the case we return immediately as that would mean that we are 2045 * here from the aforementioned code path. 2046 */ 2047 if (msp->ms_group == NULL) 2048 return; 2049 2050 /* 2051 * Devices being removed always return a weight of 0 and leave 2052 * fragmentation and ms_max_size as is - there is nothing for 2053 * us to verify here. 2054 */ 2055 vdev_t *vd = msp->ms_group->mg_vd; 2056 if (vd->vdev_removing) 2057 return; 2058 2059 /* 2060 * If the metaslab is dirty it probably means that we've done 2061 * some allocations or frees that have changed our histograms 2062 * and thus the weight. 2063 */ 2064 for (int t = 0; t < TXG_SIZE; t++) { 2065 if (txg_list_member(&vd->vdev_ms_list, msp, t)) 2066 return; 2067 } 2068 2069 /* 2070 * This verification checks that our in-memory state is consistent 2071 * with what's on disk. If the pool is read-only then there aren't 2072 * any changes and we just have the initially-loaded state. 
2073 */ 2074 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) 2075 return; 2076 2077 /* some extra verification for in-core tree if you can */ 2078 if (msp->ms_loaded) { 2079 range_tree_stat_verify(msp->ms_allocatable); 2080 VERIFY(space_map_histogram_verify(msp->ms_sm, 2081 msp->ms_allocatable)); 2082 } 2083 2084 uint64_t weight = msp->ms_weight; 2085 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2086 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); 2087 uint64_t frag = msp->ms_fragmentation; 2088 uint64_t max_segsize = msp->ms_max_size; 2089 2090 msp->ms_weight = 0; 2091 msp->ms_fragmentation = 0; 2092 2093 /* 2094 * This function is used for verification purposes and thus should 2095 * not introduce any side-effects/mutations on the system's state. 2096 * 2097 * Regardless of whether metaslab_weight() thinks this metaslab 2098 * should be active or not, we want to ensure that the actual weight 2099 * (and therefore the value of ms_weight) would be the same if it 2100 * was to be recalculated at this point. 2101 * 2102 * In addition we set the nodirty flag so metaslab_weight() does 2103 * not dirty the metaslab for future TXGs (e.g. when trying to 2104 * force condensing to upgrade the metaslab spacemaps). 2105 */ 2106 msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; 2107 2108 VERIFY3U(max_segsize, ==, msp->ms_max_size); 2109 2110 /* 2111 * If the weight type changed then there is no point in doing 2112 * verification. Revert fields to their original values. 2113 */ 2114 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || 2115 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { 2116 msp->ms_fragmentation = frag; 2117 msp->ms_weight = weight; 2118 return; 2119 } 2120 2121 VERIFY3U(msp->ms_fragmentation, ==, frag); 2122 VERIFY3U(msp->ms_weight, ==, weight); 2123 } 2124 2125 /* 2126 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from 2127 * this class that was used longest ago, and attempt to unload it. We don't 2128 * want to spend too much time in this loop to prevent performance 2129 * degradation, and we expect that most of the time this operation will 2130 * succeed. Between that and the normal unloading processing during txg sync, 2131 * we expect this to keep the metaslab memory usage under control. 
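 *
 * As a rough sketch of the loop below: while the btree leaf cache is
 * using more than zfs_metaslab_mem_limit percent of memory, and we
 * have tried fewer than twice the number of txg-list sublists, pick a
 * random sublist and walk it, unloading each metaslab that is loaded,
 * has a space map, is not active on any allocator, and has no
 * outstanding allocations.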
2132 */ 2133 static void 2134 metaslab_potentially_evict(metaslab_class_t *mc) 2135 { 2136 #ifdef _KERNEL 2137 uint64_t allmem = arc_all_memory(); 2138 uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); 2139 uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); 2140 int tries = 0; 2141 for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && 2142 tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; 2143 tries++) { 2144 unsigned int idx = multilist_get_random_index( 2145 mc->mc_metaslab_txg_list); 2146 multilist_sublist_t *mls = 2147 multilist_sublist_lock(mc->mc_metaslab_txg_list, idx); 2148 metaslab_t *msp = multilist_sublist_head(mls); 2149 multilist_sublist_unlock(mls); 2150 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < 2151 inuse * size) { 2152 VERIFY3P(mls, ==, multilist_sublist_lock( 2153 mc->mc_metaslab_txg_list, idx)); 2154 ASSERT3U(idx, ==, 2155 metaslab_idx_func(mc->mc_metaslab_txg_list, msp)); 2156 2157 if (!multilist_link_active(&msp->ms_class_txg_node)) { 2158 multilist_sublist_unlock(mls); 2159 break; 2160 } 2161 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 2162 multilist_sublist_unlock(mls); 2163 /* 2164 * If the metaslab is currently loading there are two 2165 * cases. If it's the metaslab we're evicting, we 2166 * can't continue on or we'll panic when we attempt to 2167 * recursively lock the mutex. If it's another 2168 * metaslab that's loading, it can be safely skipped, 2169 * since we know it's very new and therefore not a 2170 * good eviction candidate. We check later once the 2171 * lock is held that the metaslab is fully loaded 2172 * before actually unloading it. 2173 */ 2174 if (msp->ms_loading) { 2175 msp = next_msp; 2176 inuse = 2177 spl_kmem_cache_inuse(zfs_btree_leaf_cache); 2178 continue; 2179 } 2180 /* 2181 * We can't unload metaslabs with no spacemap because 2182 * they're not ready to be unloaded yet. We can't 2183 * unload metaslabs with outstanding allocations 2184 * because doing so could cause the metaslab's weight 2185 * to decrease while it's unloaded, which violates an 2186 * invariant that we use to prevent unnecessary 2187 * loading. We also don't unload metaslabs that are 2188 * currently active because they are high-weight 2189 * metaslabs that are likely to be used in the near 2190 * future. 2191 */ 2192 mutex_enter(&msp->ms_lock); 2193 if (msp->ms_allocator == -1 && msp->ms_sm != NULL && 2194 msp->ms_allocating_total == 0) { 2195 metaslab_unload(msp); 2196 } 2197 mutex_exit(&msp->ms_lock); 2198 msp = next_msp; 2199 inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); 2200 } 2201 } 2202 #endif 2203 } 2204 2205 static int 2206 metaslab_load_impl(metaslab_t *msp) 2207 { 2208 int error = 0; 2209 2210 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2211 ASSERT(msp->ms_loading); 2212 ASSERT(!msp->ms_condensing); 2213 2214 /* 2215 * We temporarily drop the lock to unblock other operations while we 2216 * are reading the space map. Therefore, metaslab_sync() and 2217 * metaslab_sync_done() can run at the same time as we do. 2218 * 2219 * If we are using the log space maps, metaslab_sync() can't write to 2220 * the metaslab's space map while we are loading as we only write to 2221 * it when we are flushing the metaslab, and that can't happen while 2222 * we are loading it. 2223 * 2224 * If we are not using log space maps though, metaslab_sync() can 2225 * append to the space map while we are loading. Therefore we load 2226 * only entries that existed when we started the load. 
Additionally, 2227 * metaslab_sync_done() has to wait for the load to complete because 2228 * there are potential races like metaslab_load() loading parts of the 2229 * space map that are currently being appended by metaslab_sync(). If 2230 * we didn't, the ms_allocatable would have entries that 2231 * metaslab_sync_done() would try to re-add later. 2232 * 2233 * That's why before dropping the lock we remember the synced length 2234 * of the metaslab and read up to that point of the space map, 2235 * ignoring entries appended by metaslab_sync() that happen after we 2236 * drop the lock. 2237 */ 2238 uint64_t length = msp->ms_synced_length; 2239 mutex_exit(&msp->ms_lock); 2240 2241 hrtime_t load_start = gethrtime(); 2242 metaslab_rt_arg_t *mrap; 2243 if (msp->ms_allocatable->rt_arg == NULL) { 2244 mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); 2245 } else { 2246 mrap = msp->ms_allocatable->rt_arg; 2247 msp->ms_allocatable->rt_ops = NULL; 2248 msp->ms_allocatable->rt_arg = NULL; 2249 } 2250 mrap->mra_bt = &msp->ms_allocatable_by_size; 2251 mrap->mra_floor_shift = metaslab_by_size_min_shift; 2252 2253 if (msp->ms_sm != NULL) { 2254 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, 2255 SM_FREE, length); 2256 2257 /* Now, populate the size-sorted tree. */ 2258 metaslab_rt_create(msp->ms_allocatable, mrap); 2259 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; 2260 msp->ms_allocatable->rt_arg = mrap; 2261 2262 struct mssa_arg arg = {0}; 2263 arg.rt = msp->ms_allocatable; 2264 arg.mra = mrap; 2265 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, 2266 &arg); 2267 } else { 2268 /* 2269 * Add the size-sorted tree first, since we don't need to load 2270 * the metaslab from the spacemap. 2271 */ 2272 metaslab_rt_create(msp->ms_allocatable, mrap); 2273 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; 2274 msp->ms_allocatable->rt_arg = mrap; 2275 /* 2276 * The space map has not been allocated yet, so treat 2277 * all the space in the metaslab as free and add it to the 2278 * ms_allocatable tree. 2279 */ 2280 range_tree_add(msp->ms_allocatable, 2281 msp->ms_start, msp->ms_size); 2282 2283 if (msp->ms_freed != NULL) { 2284 /* 2285 * If the ms_sm doesn't exist, this means that this 2286 * metaslab hasn't gone through metaslab_sync() and 2287 * thus has never been dirtied. So we shouldn't 2288 * expect any unflushed allocs or frees from previous 2289 * TXGs. 2290 * 2291 * Note: ms_freed and all the other trees except for 2292 * the ms_allocatable, can be NULL at this point only 2293 * if this is a new metaslab of a vdev that just got 2294 * expanded. 2295 */ 2296 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 2297 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 2298 } 2299 } 2300 2301 /* 2302 * We need to grab the ms_sync_lock to prevent metaslab_sync() from 2303 * changing the ms_sm (or log_sm) and the metaslab's range trees 2304 * while we are about to use them and populate the ms_allocatable. 2305 * The ms_lock is insufficient for this because metaslab_sync() doesn't 2306 * hold the ms_lock while writing the ms_checkpointing tree to disk. 
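 *
 * Hence the ordering below: take the ms_sync_lock first, then
 * re-acquire the ms_lock, and only then check the load error and
 * apply the unflushed changes to ms_allocatable.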
2307 */ 2308 mutex_enter(&msp->ms_sync_lock); 2309 mutex_enter(&msp->ms_lock); 2310 2311 ASSERT(!msp->ms_condensing); 2312 ASSERT(!msp->ms_flushing); 2313 2314 if (error != 0) { 2315 mutex_exit(&msp->ms_sync_lock); 2316 return (error); 2317 } 2318 2319 ASSERT3P(msp->ms_group, !=, NULL); 2320 msp->ms_loaded = B_TRUE; 2321 2322 /* 2323 * Apply all the unflushed changes to ms_allocatable right 2324 * away so any manipulations we do below have a clear view 2325 * of what is allocated and what is free. 2326 */ 2327 range_tree_walk(msp->ms_unflushed_allocs, 2328 range_tree_remove, msp->ms_allocatable); 2329 range_tree_walk(msp->ms_unflushed_frees, 2330 range_tree_add, msp->ms_allocatable); 2331 2332 msp->ms_loaded = B_TRUE; 2333 2334 ASSERT3P(msp->ms_group, !=, NULL); 2335 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2336 if (spa_syncing_log_sm(spa) != NULL) { 2337 ASSERT(spa_feature_is_enabled(spa, 2338 SPA_FEATURE_LOG_SPACEMAP)); 2339 2340 /* 2341 * If we use a log space map we add all the segments 2342 * that are in ms_unflushed_frees so they are available 2343 * for allocation. 2344 * 2345 * ms_allocatable needs to contain all free segments 2346 * that are ready for allocations (thus not segments 2347 * from ms_freeing, ms_freed, and the ms_defer trees). 2348 * But if we grab the lock in this code path at a sync 2349 * pass later that 1, then it also contains the 2350 * segments of ms_freed (they were added to it earlier 2351 * in this path through ms_unflushed_frees). So we 2352 * need to remove all the segments that exist in 2353 * ms_freed from ms_allocatable as they will be added 2354 * later in metaslab_sync_done(). 2355 * 2356 * When there's no log space map, the ms_allocatable 2357 * correctly doesn't contain any segments that exist 2358 * in ms_freed [see ms_synced_length]. 2359 */ 2360 range_tree_walk(msp->ms_freed, 2361 range_tree_remove, msp->ms_allocatable); 2362 } 2363 2364 /* 2365 * If we are not using the log space map, ms_allocatable 2366 * contains the segments that exist in the ms_defer trees 2367 * [see ms_synced_length]. Thus we need to remove them 2368 * from ms_allocatable as they will be added again in 2369 * metaslab_sync_done(). 2370 * 2371 * If we are using the log space map, ms_allocatable still 2372 * contains the segments that exist in the ms_defer trees. 2373 * Not because it read them through the ms_sm though. But 2374 * because these segments are part of ms_unflushed_frees 2375 * whose segments we add to ms_allocatable earlier in this 2376 * code path. 2377 */ 2378 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2379 range_tree_walk(msp->ms_defer[t], 2380 range_tree_remove, msp->ms_allocatable); 2381 } 2382 2383 /* 2384 * Call metaslab_recalculate_weight_and_sort() now that the 2385 * metaslab is loaded so we get the metaslab's real weight. 2386 * 2387 * Unless this metaslab was created with older software and 2388 * has not yet been converted to use segment-based weight, we 2389 * expect the new weight to be better or equal to the weight 2390 * that the metaslab had while it was not loaded. This is 2391 * because the old weight does not take into account the 2392 * consolidation of adjacent segments between TXGs. 
[see 2393 * comment for ms_synchist and ms_deferhist[] for more info] 2394 */ 2395 uint64_t weight = msp->ms_weight; 2396 uint64_t max_size = msp->ms_max_size; 2397 metaslab_recalculate_weight_and_sort(msp); 2398 if (!WEIGHT_IS_SPACEBASED(weight)) 2399 ASSERT3U(weight, <=, msp->ms_weight); 2400 msp->ms_max_size = metaslab_largest_allocatable(msp); 2401 ASSERT3U(max_size, <=, msp->ms_max_size); 2402 hrtime_t load_end = gethrtime(); 2403 msp->ms_load_time = load_end; 2404 zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, " 2405 "ms_id %llu, smp_length %llu, " 2406 "unflushed_allocs %llu, unflushed_frees %llu, " 2407 "freed %llu, defer %llu + %llu, unloaded time %llu ms, " 2408 "loading_time %lld ms, ms_max_size %llu, " 2409 "max size error %lld, " 2410 "old_weight %llx, new_weight %llx", 2411 spa_syncing_txg(spa), spa_name(spa), 2412 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 2413 space_map_length(msp->ms_sm), 2414 range_tree_space(msp->ms_unflushed_allocs), 2415 range_tree_space(msp->ms_unflushed_frees), 2416 range_tree_space(msp->ms_freed), 2417 range_tree_space(msp->ms_defer[0]), 2418 range_tree_space(msp->ms_defer[1]), 2419 (longlong_t)((load_start - msp->ms_unload_time) / 1000000), 2420 (longlong_t)((load_end - load_start) / 1000000), 2421 msp->ms_max_size, msp->ms_max_size - max_size, 2422 weight, msp->ms_weight); 2423 2424 metaslab_verify_space(msp, spa_syncing_txg(spa)); 2425 mutex_exit(&msp->ms_sync_lock); 2426 return (0); 2427 } 2428 2429 int 2430 metaslab_load(metaslab_t *msp) 2431 { 2432 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2433 2434 /* 2435 * There may be another thread loading the same metaslab, if that's 2436 * the case just wait until the other thread is done and return. 2437 */ 2438 metaslab_load_wait(msp); 2439 if (msp->ms_loaded) 2440 return (0); 2441 VERIFY(!msp->ms_loading); 2442 ASSERT(!msp->ms_condensing); 2443 2444 /* 2445 * We set the loading flag BEFORE potentially dropping the lock to 2446 * wait for an ongoing flush (see ms_flushing below). This way other 2447 * threads know that there is already a thread that is loading this 2448 * metaslab. 2449 */ 2450 msp->ms_loading = B_TRUE; 2451 2452 /* 2453 * Wait for any in-progress flushing to finish as we drop the ms_lock 2454 * both here (during space_map_load()) and in metaslab_flush() (when 2455 * we flush our changes to the ms_sm). 2456 */ 2457 if (msp->ms_flushing) 2458 metaslab_flush_wait(msp); 2459 2460 /* 2461 * In the possibility that we were waiting for the metaslab to be 2462 * flushed (where we temporarily dropped the ms_lock), ensure that 2463 * no one else loaded the metaslab somehow. 2464 */ 2465 ASSERT(!msp->ms_loaded); 2466 2467 /* 2468 * If we're loading a metaslab in the normal class, consider evicting 2469 * another one to keep our memory usage under the limit defined by the 2470 * zfs_metaslab_mem_limit tunable. 2471 */ 2472 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == 2473 msp->ms_group->mg_class) { 2474 metaslab_potentially_evict(msp->ms_group->mg_class); 2475 } 2476 2477 int error = metaslab_load_impl(msp); 2478 2479 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2480 msp->ms_loading = B_FALSE; 2481 cv_broadcast(&msp->ms_load_cv); 2482 2483 return (error); 2484 } 2485 2486 void 2487 metaslab_unload(metaslab_t *msp) 2488 { 2489 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2490 2491 /* 2492 * This can happen if a metaslab is selected for eviction (in 2493 * metaslab_potentially_evict) and then unloaded during spa_sync (via 2494 * metaslab_class_evict_old). 
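 * In that case ms_loaded is already B_FALSE by the time we get here,
 * so the check below simply returns without doing any work.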
2495 */ 2496 if (!msp->ms_loaded) 2497 return; 2498 2499 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 2500 msp->ms_loaded = B_FALSE; 2501 msp->ms_unload_time = gethrtime(); 2502 2503 msp->ms_activation_weight = 0; 2504 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 2505 2506 if (msp->ms_group != NULL) { 2507 metaslab_class_t *mc = msp->ms_group->mg_class; 2508 multilist_sublist_t *mls = 2509 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 2510 if (multilist_link_active(&msp->ms_class_txg_node)) 2511 multilist_sublist_remove(mls, msp); 2512 multilist_sublist_unlock(mls); 2513 2514 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2515 zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " 2516 "ms_id %llu, weight %llx, " 2517 "selected txg %llu (%llu ms ago), alloc_txg %llu, " 2518 "loaded %llu ms ago, max_size %llu", 2519 spa_syncing_txg(spa), spa_name(spa), 2520 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 2521 msp->ms_weight, 2522 msp->ms_selected_txg, 2523 (msp->ms_unload_time - msp->ms_selected_time) / 1000 / 1000, 2524 msp->ms_alloc_txg, 2525 (msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000, 2526 msp->ms_max_size); 2527 } 2528 2529 /* 2530 * We explicitly recalculate the metaslab's weight based on its space 2531 * map (as it is now not loaded). We want unloaded metaslabs to always 2532 * have their weights calculated from the space map histograms, while 2533 * loaded ones have it calculated from their in-core range tree 2534 * [see metaslab_load()]. This way, the weight reflects the information 2535 * available in-core, whether it is loaded or not. 2536 * 2537 * If ms_group == NULL, it means that we came here from metaslab_fini(), 2538 * at which point it doesn't make sense for us to do the recalculation 2539 * and the sorting. 2540 */ 2541 if (msp->ms_group != NULL) 2542 metaslab_recalculate_weight_and_sort(msp); 2543 } 2544 2545 /* 2546 * We want to optimize the memory use of the per-metaslab range 2547 * trees. To do this, we store the segments in the range trees in 2548 * units of sectors, zero-indexing from the start of the metaslab. If 2549 * vdev_ms_shift - vdev_ashift is less than 32, we can store 2550 * the ranges using two uint32_ts, rather than two uint64_ts.
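 *
 * For example (illustrative numbers only), a vdev with 16GB metaslabs
 * (vdev_ms_shift = 34) and 4K sectors (vdev_ashift = 12) gives
 * 34 - 12 = 22, which is less than 32, so every offset within a
 * metaslab fits in a 32-bit sector count and RANGE_SEG32 is used;
 * otherwise the function below falls back to RANGE_SEG64.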
2551 */ 2552 range_seg_type_t 2553 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, 2554 uint64_t *start, uint64_t *shift) 2555 { 2556 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && 2557 !zfs_metaslab_force_large_segs) { 2558 *shift = vdev->vdev_ashift; 2559 *start = msp->ms_start; 2560 return (RANGE_SEG32); 2561 } else { 2562 *shift = 0; 2563 *start = 0; 2564 return (RANGE_SEG64); 2565 } 2566 } 2567 2568 void 2569 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) 2570 { 2571 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2572 metaslab_class_t *mc = msp->ms_group->mg_class; 2573 multilist_sublist_t *mls = 2574 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 2575 if (multilist_link_active(&msp->ms_class_txg_node)) 2576 multilist_sublist_remove(mls, msp); 2577 msp->ms_selected_txg = txg; 2578 msp->ms_selected_time = gethrtime(); 2579 multilist_sublist_insert_tail(mls, msp); 2580 multilist_sublist_unlock(mls); 2581 } 2582 2583 void 2584 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 2585 int64_t defer_delta, int64_t space_delta) 2586 { 2587 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 2588 2589 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 2590 ASSERT(vd->vdev_ms_count != 0); 2591 2592 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 2593 vdev_deflated_space(vd, space_delta)); 2594 } 2595 2596 int 2597 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, 2598 uint64_t txg, metaslab_t **msp) 2599 { 2600 vdev_t *vd = mg->mg_vd; 2601 spa_t *spa = vd->vdev_spa; 2602 objset_t *mos = spa->spa_meta_objset; 2603 metaslab_t *ms; 2604 int error; 2605 2606 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 2607 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 2608 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 2609 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 2610 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); 2611 multilist_link_init(&ms->ms_class_txg_node); 2612 2613 ms->ms_id = id; 2614 ms->ms_start = id << vd->vdev_ms_shift; 2615 ms->ms_size = 1ULL << vd->vdev_ms_shift; 2616 ms->ms_allocator = -1; 2617 ms->ms_new = B_TRUE; 2618 2619 /* 2620 * We only open space map objects that already exist. All others 2621 * will be opened when we finally allocate an object for it. 2622 * 2623 * Note: 2624 * When called from vdev_expand(), we can't call into the DMU as 2625 * we are holding the spa_config_lock as a writer and we would 2626 * deadlock [see relevant comment in vdev_metaslab_init()]. in 2627 * that case, the object parameter is zero though, so we won't 2628 * call into the DMU. 2629 */ 2630 if (object != 0) { 2631 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 2632 ms->ms_size, vd->vdev_ashift); 2633 2634 if (error != 0) { 2635 kmem_free(ms, sizeof (metaslab_t)); 2636 return (error); 2637 } 2638 2639 ASSERT(ms->ms_sm != NULL); 2640 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 2641 } 2642 2643 range_seg_type_t type; 2644 uint64_t shift, start; 2645 type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); 2646 2647 /* 2648 * We create the ms_allocatable here, but we don't create the 2649 * other range trees until metaslab_sync_done(). This serves 2650 * two purposes: it allows metaslab_sync_done() to detect the 2651 * addition of new space; and for debugging, it ensures that 2652 * we'd data fault on any attempt to use this metaslab before 2653 * it's ready. 
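 * (Note that ms_trim is also created here, immediately below; it is
 * the remaining trees, e.g. ms_allocating, ms_freeing, ms_freed, and
 * ms_defer, that are deferred to metaslab_sync_done().)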
2654 */ 2655 ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); 2656 2657 ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); 2658 2659 metaslab_group_add(mg, ms); 2660 metaslab_set_fragmentation(ms, B_FALSE); 2661 2662 /* 2663 * If we're opening an existing pool (txg == 0) or creating 2664 * a new one (txg == TXG_INITIAL), all space is available now. 2665 * If we're adding space to an existing pool, the new space 2666 * does not become available until after this txg has synced. 2667 * The metaslab's weight will also be initialized when we sync 2668 * out this txg. This ensures that we don't attempt to allocate 2669 * from it before we have initialized it completely. 2670 */ 2671 if (txg <= TXG_INITIAL) { 2672 metaslab_sync_done(ms, 0); 2673 metaslab_space_update(vd, mg->mg_class, 2674 metaslab_allocated_space(ms), 0, 0); 2675 } 2676 2677 if (txg != 0) { 2678 vdev_dirty(vd, 0, NULL, txg); 2679 vdev_dirty(vd, VDD_METASLAB, ms, txg); 2680 } 2681 2682 *msp = ms; 2683 2684 return (0); 2685 } 2686 2687 static void 2688 metaslab_fini_flush_data(metaslab_t *msp) 2689 { 2690 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2691 2692 if (metaslab_unflushed_txg(msp) == 0) { 2693 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), 2694 ==, NULL); 2695 return; 2696 } 2697 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 2698 2699 mutex_enter(&spa->spa_flushed_ms_lock); 2700 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 2701 mutex_exit(&spa->spa_flushed_ms_lock); 2702 2703 spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2704 spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2705 } 2706 2707 uint64_t 2708 metaslab_unflushed_changes_memused(metaslab_t *ms) 2709 { 2710 return ((range_tree_numsegs(ms->ms_unflushed_allocs) + 2711 range_tree_numsegs(ms->ms_unflushed_frees)) * 2712 ms->ms_unflushed_allocs->rt_root.bt_elem_size); 2713 } 2714 2715 void 2716 metaslab_fini(metaslab_t *msp) 2717 { 2718 metaslab_group_t *mg = msp->ms_group; 2719 vdev_t *vd = mg->mg_vd; 2720 spa_t *spa = vd->vdev_spa; 2721 2722 metaslab_fini_flush_data(msp); 2723 2724 metaslab_group_remove(mg, msp); 2725 2726 mutex_enter(&msp->ms_lock); 2727 VERIFY(msp->ms_group == NULL); 2728 metaslab_space_update(vd, mg->mg_class, 2729 -metaslab_allocated_space(msp), 0, -msp->ms_size); 2730 2731 space_map_close(msp->ms_sm); 2732 msp->ms_sm = NULL; 2733 2734 metaslab_unload(msp); 2735 range_tree_destroy(msp->ms_allocatable); 2736 range_tree_destroy(msp->ms_freeing); 2737 range_tree_destroy(msp->ms_freed); 2738 2739 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 2740 metaslab_unflushed_changes_memused(msp)); 2741 spa->spa_unflushed_stats.sus_memused -= 2742 metaslab_unflushed_changes_memused(msp); 2743 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 2744 range_tree_destroy(msp->ms_unflushed_allocs); 2745 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 2746 range_tree_destroy(msp->ms_unflushed_frees); 2747 2748 for (int t = 0; t < TXG_SIZE; t++) { 2749 range_tree_destroy(msp->ms_allocating[t]); 2750 } 2751 2752 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2753 range_tree_destroy(msp->ms_defer[t]); 2754 } 2755 ASSERT0(msp->ms_deferspace); 2756 2757 range_tree_destroy(msp->ms_checkpointing); 2758 2759 for (int t = 0; t < TXG_SIZE; t++) 2760 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 2761 2762 range_tree_vacate(msp->ms_trim, NULL, NULL); 2763 range_tree_destroy(msp->ms_trim); 2764 2765 mutex_exit(&msp->ms_lock); 2766 
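/*
 * With the range trees destroyed and the metaslab unloaded, all that
 * is left is to tear down the CVs and locks and free the structure.
 */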
cv_destroy(&msp->ms_load_cv); 2767 cv_destroy(&msp->ms_flush_cv); 2768 mutex_destroy(&msp->ms_lock); 2769 mutex_destroy(&msp->ms_sync_lock); 2770 ASSERT3U(msp->ms_allocator, ==, -1); 2771 2772 kmem_free(msp, sizeof (metaslab_t)); 2773 } 2774 2775 #define FRAGMENTATION_TABLE_SIZE 17 2776 2777 /* 2778 * This table defines a segment size based fragmentation metric that will 2779 * allow each metaslab to derive its own fragmentation value. This is done 2780 * by calculating the space in each bucket of the spacemap histogram and 2781 * multiplying that by the fragmentation metric in this table. Doing 2782 * this for all buckets and dividing it by the total amount of free 2783 * space in this metaslab (i.e. the total free space in all buckets) gives 2784 * us the fragmentation metric. This means that a high fragmentation metric 2785 * equates to most of the free space being comprised of small segments. 2786 * Conversely, if the metric is low, then most of the free space is in 2787 * large segments. A 10% change in fragmentation equates to approximately 2788 * double the number of segments. 2789 * 2790 * This table defines 0% fragmented space using 16MB segments. Testing has 2791 * shown that segments that are greater than or equal to 16MB do not suffer 2792 * from drastic performance problems. Using this value, we derive the rest 2793 * of the table. Since the fragmentation value is never stored on disk, it 2794 * is possible to change these calculations in the future. 2795 */ 2796 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 2797 100, /* 512B */ 2798 100, /* 1K */ 2799 98, /* 2K */ 2800 95, /* 4K */ 2801 90, /* 8K */ 2802 80, /* 16K */ 2803 70, /* 32K */ 2804 60, /* 64K */ 2805 50, /* 128K */ 2806 40, /* 256K */ 2807 30, /* 512K */ 2808 20, /* 1M */ 2809 15, /* 2M */ 2810 10, /* 4M */ 2811 5, /* 8M */ 2812 0 /* 16M */ 2813 }; 2814 2815 /* 2816 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 2817 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 2818 * been upgraded and does not support this metric. Otherwise, the return 2819 * value should be in the range [0, 100]. 2820 */ 2821 static void 2822 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty) 2823 { 2824 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2825 uint64_t fragmentation = 0; 2826 uint64_t total = 0; 2827 boolean_t feature_enabled = spa_feature_is_enabled(spa, 2828 SPA_FEATURE_SPACEMAP_HISTOGRAM); 2829 2830 if (!feature_enabled) { 2831 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2832 return; 2833 } 2834 2835 /* 2836 * A null space map means that the entire metaslab is free 2837 * and thus is not fragmented. 2838 */ 2839 if (msp->ms_sm == NULL) { 2840 msp->ms_fragmentation = 0; 2841 return; 2842 } 2843 2844 /* 2845 * If this metaslab's space map has not been upgraded, flag it 2846 * so that we upgrade next time we encounter it. 2847 */ 2848 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 2849 uint64_t txg = spa_syncing_txg(spa); 2850 vdev_t *vd = msp->ms_group->mg_vd; 2851 2852 /* 2853 * If we've reached the final dirty txg, then we must 2854 * be shutting down the pool. We don't want to dirty 2855 * any data past this point so skip setting the condense 2856 * flag. We can retry this action the next time the pool 2857 * is imported. We also skip marking this metaslab for 2858 * condensing if the caller has explicitly set nodirty. 
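 * Either way, ms_fragmentation is left at ZFS_FRAG_INVALID below and
 * only gets a real value once the space map has been upgraded (e.g.
 * by the condense we may have just requested).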
2859 */ 2860 if (!nodirty && 2861 spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2862 msp->ms_condense_wanted = B_TRUE; 2863 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2864 zfs_dbgmsg("txg %llu, requesting force condense: " 2865 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 2866 vd->vdev_id); 2867 } 2868 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2869 return; 2870 } 2871 2872 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2873 uint64_t space = 0; 2874 uint8_t shift = msp->ms_sm->sm_shift; 2875 2876 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2877 FRAGMENTATION_TABLE_SIZE - 1); 2878 2879 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2880 continue; 2881 2882 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2883 total += space; 2884 2885 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2886 fragmentation += space * zfs_frag_table[idx]; 2887 } 2888 2889 if (total > 0) 2890 fragmentation /= total; 2891 ASSERT3U(fragmentation, <=, 100); 2892 2893 msp->ms_fragmentation = fragmentation; 2894 } 2895 2896 /* 2897 * Compute a weight -- a selection preference value -- for the given metaslab. 2898 * This is based on the amount of free space, the level of fragmentation, 2899 * the LBA range, and whether the metaslab is loaded. 2900 */ 2901 static uint64_t 2902 metaslab_space_weight(metaslab_t *msp) 2903 { 2904 metaslab_group_t *mg = msp->ms_group; 2905 vdev_t *vd = mg->mg_vd; 2906 uint64_t weight, space; 2907 2908 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2909 2910 /* 2911 * The baseline weight is the metaslab's free space. 2912 */ 2913 space = msp->ms_size - metaslab_allocated_space(msp); 2914 2915 if (metaslab_fragmentation_factor_enabled && 2916 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 2917 /* 2918 * Use the fragmentation information to inversely scale 2919 * down the baseline weight. We need to ensure that we 2920 * don't exclude this metaslab completely when it's 100% 2921 * fragmented. To avoid this we reduce the fragmented value 2922 * by 1. 2923 */ 2924 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 2925 2926 /* 2927 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 2928 * this metaslab again. The fragmentation metric may have 2929 * decreased the space to something smaller than 2930 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 2931 * so that we can consume any remaining space. 2932 */ 2933 if (space > 0 && space < SPA_MINBLOCKSIZE) 2934 space = SPA_MINBLOCKSIZE; 2935 } 2936 weight = space; 2937 2938 /* 2939 * Modern disks have uniform bit density and constant angular velocity. 2940 * Therefore, the outer recording zones are faster (higher bandwidth) 2941 * than the inner zones by the ratio of outer to inner track diameter, 2942 * which is typically around 2:1. We account for this by assigning 2943 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 2944 * In effect, this means that we'll select the metaslab with the most 2945 * free bandwidth rather than simply the one with the most free space. 2946 */ 2947 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 2948 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 2949 ASSERT(weight >= space && weight <= 2 * space); 2950 } 2951 2952 /* 2953 * If this metaslab is one we're actively using, adjust its 2954 * weight to make it preferable to any inactive metaslab so 2955 * we'll polish it off. If the fragmentation on this metaslab 2956 * has exceed our threshold, then don't mark it active. 
2957 */ 2958 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 2959 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 2960 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 2961 } 2962 2963 WEIGHT_SET_SPACEBASED(weight); 2964 return (weight); 2965 } 2966 2967 /* 2968 * Return the weight of the specified metaslab, according to the segment-based 2969 * weighting algorithm. The metaslab must be loaded. This function can 2970 * be called within a sync pass since it relies only on the metaslab's 2971 * range tree which is always accurate when the metaslab is loaded. 2972 */ 2973 static uint64_t 2974 metaslab_weight_from_range_tree(metaslab_t *msp) 2975 { 2976 uint64_t weight = 0; 2977 uint32_t segments = 0; 2978 2979 ASSERT(msp->ms_loaded); 2980 2981 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 2982 i--) { 2983 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 2984 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2985 2986 segments <<= 1; 2987 segments += msp->ms_allocatable->rt_histogram[i]; 2988 2989 /* 2990 * The range tree provides more precision than the space map 2991 * and must be downgraded so that all values fit within the 2992 * space map's histogram. This allows us to compare loaded 2993 * vs. unloaded metaslabs to determine which metaslab is 2994 * considered "best". 2995 */ 2996 if (i > max_idx) 2997 continue; 2998 2999 if (segments != 0) { 3000 WEIGHT_SET_COUNT(weight, segments); 3001 WEIGHT_SET_INDEX(weight, i); 3002 WEIGHT_SET_ACTIVE(weight, 0); 3003 break; 3004 } 3005 } 3006 return (weight); 3007 } 3008 3009 /* 3010 * Calculate the weight based on the on-disk histogram. Should be applied 3011 * only to unloaded metaslabs (i.e no incoming allocations) in-order to 3012 * give results consistent with the on-disk state 3013 */ 3014 static uint64_t 3015 metaslab_weight_from_spacemap(metaslab_t *msp) 3016 { 3017 space_map_t *sm = msp->ms_sm; 3018 ASSERT(!msp->ms_loaded); 3019 ASSERT(sm != NULL); 3020 ASSERT3U(space_map_object(sm), !=, 0); 3021 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 3022 3023 /* 3024 * Create a joint histogram from all the segments that have made 3025 * it to the metaslab's space map histogram, that are not yet 3026 * available for allocation because they are still in the freeing 3027 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 3028 * these segments from the space map's histogram to get a more 3029 * accurate weight. 3030 */ 3031 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 3032 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 3033 deferspace_histogram[i] += msp->ms_synchist[i]; 3034 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3035 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 3036 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 3037 } 3038 } 3039 3040 uint64_t weight = 0; 3041 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 3042 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 3043 deferspace_histogram[i]); 3044 uint64_t count = 3045 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 3046 if (count != 0) { 3047 WEIGHT_SET_COUNT(weight, count); 3048 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 3049 WEIGHT_SET_ACTIVE(weight, 0); 3050 break; 3051 } 3052 } 3053 return (weight); 3054 } 3055 3056 /* 3057 * Compute a segment-based weight for the specified metaslab. The weight 3058 * is determined by highest bucket in the histogram. The information 3059 * for the highest bucket is encoded into the weight value. 
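 *
 * For instance, if the largest free segments of a metaslab fall in
 * the [512K, 1M) bucket (index 19) and there are 1000 of them, the
 * weight encodes index 19 and a count of 1000, with the active bits
 * clear. Because the index occupies higher-order bits than the count
 * [see the weight layout in metaslab_impl.h], a higher index always
 * dominates a larger count at a lower index.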
3060 */ 3061 static uint64_t 3062 metaslab_segment_weight(metaslab_t *msp) 3063 { 3064 metaslab_group_t *mg = msp->ms_group; 3065 uint64_t weight = 0; 3066 uint8_t shift = mg->mg_vd->vdev_ashift; 3067 3068 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3069 3070 /* 3071 * The metaslab is completely free. 3072 */ 3073 if (metaslab_allocated_space(msp) == 0) { 3074 int idx = highbit64(msp->ms_size) - 1; 3075 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 3076 3077 if (idx < max_idx) { 3078 WEIGHT_SET_COUNT(weight, 1ULL); 3079 WEIGHT_SET_INDEX(weight, idx); 3080 } else { 3081 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 3082 WEIGHT_SET_INDEX(weight, max_idx); 3083 } 3084 WEIGHT_SET_ACTIVE(weight, 0); 3085 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 3086 return (weight); 3087 } 3088 3089 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 3090 3091 /* 3092 * If the metaslab is fully allocated then just make the weight 0. 3093 */ 3094 if (metaslab_allocated_space(msp) == msp->ms_size) 3095 return (0); 3096 /* 3097 * If the metaslab is already loaded, then use the range tree to 3098 * determine the weight. Otherwise, we rely on the space map information 3099 * to generate the weight. 3100 */ 3101 if (msp->ms_loaded) { 3102 weight = metaslab_weight_from_range_tree(msp); 3103 } else { 3104 weight = metaslab_weight_from_spacemap(msp); 3105 } 3106 3107 /* 3108 * If the metaslab was active the last time we calculated its weight 3109 * then keep it active. We want to consume the entire region that 3110 * is associated with this weight. 3111 */ 3112 if (msp->ms_activation_weight != 0 && weight != 0) 3113 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 3114 return (weight); 3115 } 3116 3117 /* 3118 * Determine if we should attempt to allocate from this metaslab. If the 3119 * metaslab is loaded, then we can determine if the desired allocation 3120 * can be satisfied by looking at the size of the maximum free segment 3121 * on that metaslab. Otherwise, we make our decision based on the metaslab's 3122 * weight. For segment-based weighting we can determine the maximum 3123 * allocation based on the index encoded in its value. For space-based 3124 * weights we rely on the entire weight (excluding the weight-type bit). 3125 */ 3126 static boolean_t 3127 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) 3128 { 3129 /* 3130 * If the metaslab is loaded, ms_max_size is definitive and we can use 3131 * the fast check. If it's not, the ms_max_size is a lower bound (once 3132 * set), and we should use the fast check as long as we're not in 3133 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec 3134 * seconds since the metaslab was unloaded. 3135 */ 3136 if (msp->ms_loaded || 3137 (msp->ms_max_size != 0 && !try_hard && gethrtime() < 3138 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) 3139 return (msp->ms_max_size >= asize); 3140 3141 boolean_t should_allocate; 3142 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3143 /* 3144 * The metaslab segment weight indicates segments in the 3145 * range [2^i, 2^(i+1)), where i is the index in the weight. 3146 * Since the asize might be in the middle of the range, we 3147 * should attempt the allocation if asize < 2^(i+1). 
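 * For example, with a weight index of 19 (free segments somewhere in
 * the [512K, 1M) range), a 768K allocation is attempted since
 * 768K < 2^20, while a 2M allocation is not.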
3148 */ 3149 should_allocate = (asize < 3150 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 3151 } else { 3152 should_allocate = (asize <= 3153 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 3154 } 3155 3156 return (should_allocate); 3157 } 3158 3159 static uint64_t 3160 metaslab_weight(metaslab_t *msp, boolean_t nodirty) 3161 { 3162 vdev_t *vd = msp->ms_group->mg_vd; 3163 spa_t *spa = vd->vdev_spa; 3164 uint64_t weight; 3165 3166 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3167 3168 metaslab_set_fragmentation(msp, nodirty); 3169 3170 /* 3171 * Update the maximum size. If the metaslab is loaded, this will 3172 * ensure that we get an accurate maximum size if newly freed space 3173 * has been added back into the free tree. If the metaslab is 3174 * unloaded, we check if there's a larger free segment in the 3175 * unflushed frees. This is a lower bound on the largest allocatable 3176 * segment size. Coalescing of adjacent entries may reveal larger 3177 * allocatable segments, but we aren't aware of those until loading 3178 * the space map into a range tree. 3179 */ 3180 if (msp->ms_loaded) { 3181 msp->ms_max_size = metaslab_largest_allocatable(msp); 3182 } else { 3183 msp->ms_max_size = MAX(msp->ms_max_size, 3184 metaslab_largest_unflushed_free(msp)); 3185 } 3186 3187 /* 3188 * Segment-based weighting requires space map histogram support. 3189 */ 3190 if (zfs_metaslab_segment_weight_enabled && 3191 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 3192 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 3193 sizeof (space_map_phys_t))) { 3194 weight = metaslab_segment_weight(msp); 3195 } else { 3196 weight = metaslab_space_weight(msp); 3197 } 3198 return (weight); 3199 } 3200 3201 void 3202 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 3203 { 3204 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3205 3206 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 3207 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 3208 metaslab_group_sort(msp->ms_group, msp, 3209 metaslab_weight(msp, B_FALSE) | was_active); 3210 } 3211 3212 static int 3213 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 3214 int allocator, uint64_t activation_weight) 3215 { 3216 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 3217 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3218 3219 /* 3220 * If we're activating for the claim code, we don't want to actually 3221 * set the metaslab up for a specific allocator. 3222 */ 3223 if (activation_weight == METASLAB_WEIGHT_CLAIM) { 3224 ASSERT0(msp->ms_activation_weight); 3225 msp->ms_activation_weight = msp->ms_weight; 3226 metaslab_group_sort(mg, msp, msp->ms_weight | 3227 activation_weight); 3228 return (0); 3229 } 3230 3231 metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 
3232 &mga->mga_primary : &mga->mga_secondary); 3233 3234 mutex_enter(&mg->mg_lock); 3235 if (*mspp != NULL) { 3236 mutex_exit(&mg->mg_lock); 3237 return (EEXIST); 3238 } 3239 3240 *mspp = msp; 3241 ASSERT3S(msp->ms_allocator, ==, -1); 3242 msp->ms_allocator = allocator; 3243 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 3244 3245 ASSERT0(msp->ms_activation_weight); 3246 msp->ms_activation_weight = msp->ms_weight; 3247 metaslab_group_sort_impl(mg, msp, 3248 msp->ms_weight | activation_weight); 3249 mutex_exit(&mg->mg_lock); 3250 3251 return (0); 3252 } 3253 3254 static int 3255 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 3256 { 3257 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3258 3259 /* 3260 * The current metaslab is already activated for us so there 3261 * is nothing to do. Already activated though, doesn't mean 3262 * that this metaslab is activated for our allocator nor our 3263 * requested activation weight. The metaslab could have started 3264 * as an active one for our allocator but changed allocators 3265 * while we were waiting to grab its ms_lock or we stole it 3266 * [see find_valid_metaslab()]. This means that there is a 3267 * possibility of passivating a metaslab of another allocator 3268 * or from a different activation mask, from this thread. 3269 */ 3270 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 3271 ASSERT(msp->ms_loaded); 3272 return (0); 3273 } 3274 3275 int error = metaslab_load(msp); 3276 if (error != 0) { 3277 metaslab_group_sort(msp->ms_group, msp, 0); 3278 return (error); 3279 } 3280 3281 /* 3282 * When entering metaslab_load() we may have dropped the 3283 * ms_lock because we were loading this metaslab, or we 3284 * were waiting for another thread to load it for us. In 3285 * that scenario, we recheck the weight of the metaslab 3286 * to see if it was activated by another thread. 3287 * 3288 * If the metaslab was activated for another allocator or 3289 * it was activated with a different activation weight (e.g. 3290 * we wanted to make it a primary but it was activated as 3291 * secondary) we return error (EBUSY). 3292 * 3293 * If the metaslab was activated for the same allocator 3294 * and requested activation mask, skip activating it. 3295 */ 3296 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 3297 if (msp->ms_allocator != allocator) 3298 return (EBUSY); 3299 3300 if ((msp->ms_weight & activation_weight) == 0) 3301 return (SET_ERROR(EBUSY)); 3302 3303 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), 3304 msp->ms_primary); 3305 return (0); 3306 } 3307 3308 /* 3309 * If the metaslab has literally 0 space, it will have weight 0. In 3310 * that case, don't bother activating it. This can happen if the 3311 * metaslab had space during find_valid_metaslab, but another thread 3312 * loaded it and used all that space while we were waiting to grab the 3313 * lock. 
3314 */ 3315 if (msp->ms_weight == 0) { 3316 ASSERT0(range_tree_space(msp->ms_allocatable)); 3317 return (SET_ERROR(ENOSPC)); 3318 } 3319 3320 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 3321 allocator, activation_weight)) != 0) { 3322 return (error); 3323 } 3324 3325 ASSERT(msp->ms_loaded); 3326 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 3327 3328 return (0); 3329 } 3330 3331 static void 3332 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 3333 uint64_t weight) 3334 { 3335 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3336 ASSERT(msp->ms_loaded); 3337 3338 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 3339 metaslab_group_sort(mg, msp, weight); 3340 return; 3341 } 3342 3343 mutex_enter(&mg->mg_lock); 3344 ASSERT3P(msp->ms_group, ==, mg); 3345 ASSERT3S(0, <=, msp->ms_allocator); 3346 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 3347 3348 metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; 3349 if (msp->ms_primary) { 3350 ASSERT3P(mga->mga_primary, ==, msp); 3351 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 3352 mga->mga_primary = NULL; 3353 } else { 3354 ASSERT3P(mga->mga_secondary, ==, msp); 3355 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 3356 mga->mga_secondary = NULL; 3357 } 3358 msp->ms_allocator = -1; 3359 metaslab_group_sort_impl(mg, msp, weight); 3360 mutex_exit(&mg->mg_lock); 3361 } 3362 3363 static void 3364 metaslab_passivate(metaslab_t *msp, uint64_t weight) 3365 { 3366 uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; 3367 3368 /* 3369 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 3370 * this metaslab again. In that case, it had better be empty, 3371 * or we would be leaving space on the table. 3372 */ 3373 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || 3374 size >= SPA_MINBLOCKSIZE || 3375 range_tree_space(msp->ms_allocatable) == 0); 3376 ASSERT0(weight & METASLAB_ACTIVE_MASK); 3377 3378 ASSERT(msp->ms_activation_weight != 0); 3379 msp->ms_activation_weight = 0; 3380 metaslab_passivate_allocator(msp->ms_group, msp, weight); 3381 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); 3382 } 3383 3384 /* 3385 * Segment-based metaslabs are activated once and remain active until 3386 * we either fail an allocation attempt (similar to space-based metaslabs) 3387 * or have exhausted the free space in zfs_metaslab_switch_threshold 3388 * buckets since the metaslab was activated. This function checks to see 3389 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the 3390 * metaslab and passivates it proactively. This will allow us to select a 3391 * metaslab with a larger contiguous region, if any, remaining within this 3392 * metaslab group. If we're in sync pass > 1, then we continue using this 3393 * metaslab so that we don't dirty more block and cause more sync passes. 3394 */ 3395 static void 3396 metaslab_segment_may_passivate(metaslab_t *msp) 3397 { 3398 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3399 3400 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 3401 return; 3402 3403 /* 3404 * Since we are in the middle of a sync pass, the most accurate 3405 * information that is accessible to us is the in-core range tree 3406 * histogram; calculate the new weight based on that information. 
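 * For example, with zfs_metaslab_switch_threshold = 2 (an illustrative
 * value), a metaslab activated with its largest free segments in
 * bucket index 21 (2M) is passivated once its largest remaining
 * bucket drops to index 19 (512K) or lower.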
3407 */ 3408 uint64_t weight = metaslab_weight_from_range_tree(msp); 3409 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 3410 int current_idx = WEIGHT_GET_INDEX(weight); 3411 3412 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 3413 metaslab_passivate(msp, weight); 3414 } 3415 3416 static void 3417 metaslab_preload(void *arg) 3418 { 3419 metaslab_t *msp = arg; 3420 metaslab_class_t *mc = msp->ms_group->mg_class; 3421 spa_t *spa = mc->mc_spa; 3422 fstrans_cookie_t cookie = spl_fstrans_mark(); 3423 3424 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 3425 3426 mutex_enter(&msp->ms_lock); 3427 (void) metaslab_load(msp); 3428 metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); 3429 mutex_exit(&msp->ms_lock); 3430 spl_fstrans_unmark(cookie); 3431 } 3432 3433 static void 3434 metaslab_group_preload(metaslab_group_t *mg) 3435 { 3436 spa_t *spa = mg->mg_vd->vdev_spa; 3437 metaslab_t *msp; 3438 avl_tree_t *t = &mg->mg_metaslab_tree; 3439 int m = 0; 3440 3441 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 3442 taskq_wait_outstanding(mg->mg_taskq, 0); 3443 return; 3444 } 3445 3446 mutex_enter(&mg->mg_lock); 3447 3448 /* 3449 * Load the next potential metaslabs 3450 */ 3451 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 3452 ASSERT3P(msp->ms_group, ==, mg); 3453 3454 /* 3455 * We preload only the maximum number of metaslabs specified 3456 * by metaslab_preload_limit. If a metaslab is being forced 3457 * to condense then we preload it too. This will ensure 3458 * that force condensing happens in the next txg. 3459 */ 3460 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 3461 continue; 3462 } 3463 3464 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 3465 msp, TQ_SLEEP) != TASKQID_INVALID); 3466 } 3467 mutex_exit(&mg->mg_lock); 3468 } 3469 3470 /* 3471 * Determine if the space map's on-disk footprint is past our tolerance for 3472 * inefficiency. We would like to use the following criteria to make our 3473 * decision: 3474 * 3475 * 1. Do not condense if the size of the space map object would dramatically 3476 * increase as a result of writing out the free space range tree. 3477 * 3478 * 2. Condense if the on-disk space map representation is at least 3479 * zfs_condense_pct/100 times the size of the optimal representation 3480 * (e.g. with zfs_condense_pct = 110 and an optimal in-core representation of 1MB, we condense once the on-disk size reaches 1.1MB). 3481 * 3482 * 3. Do not condense if the on-disk size of the space map does not actually 3483 * decrease. 3484 * 3485 * Unfortunately, we cannot compute the on-disk size of the space map in this 3486 * context because we cannot accurately compute the effects of compression, etc. 3487 * Instead, we apply the heuristic described in the block comment for 3488 * zfs_metaslab_condense_block_threshold - we only condense if the space used 3489 * is greater than a threshold number of blocks. 3490 */ 3491 static boolean_t 3492 metaslab_should_condense(metaslab_t *msp) 3493 { 3494 space_map_t *sm = msp->ms_sm; 3495 vdev_t *vd = msp->ms_group->mg_vd; 3496 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 3497 3498 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3499 ASSERT(msp->ms_loaded); 3500 ASSERT(sm != NULL); 3501 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); 3502 3503 /* 3504 * We always condense metaslabs that are empty and metaslabs for 3505 * which a condense request has been made.
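 *
 * Otherwise the thresholds below apply. As a rough worked example
 * (assuming the defaults zfs_condense_pct = 200 and
 * zfs_metaslab_condense_block_threshold = 4, a 128K space map block
 * size and an ashift below 17): the space map is condensed only when
 * its on-disk length is at least twice the estimated optimal size and
 * larger than 4 * 128K = 512K.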
3506 */ 3507 if (range_tree_numsegs(msp->ms_allocatable) == 0 || 3508 msp->ms_condense_wanted) 3509 return (B_TRUE); 3510 3511 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); 3512 uint64_t object_size = space_map_length(sm); 3513 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 3514 msp->ms_allocatable, SM_NO_VDEVID); 3515 3516 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 3517 object_size > zfs_metaslab_condense_block_threshold * record_size); 3518 } 3519 3520 /* 3521 * Condense the on-disk space map representation to its minimized form. 3522 * The minimized form consists of a small number of allocations followed 3523 * by the entries of the free range tree (ms_allocatable). The condensed 3524 * spacemap contains all the entries of previous TXGs (including those in 3525 * the pool-wide log spacemaps; thus this is effectively a superset of 3526 * metaslab_flush()), but this TXG's entries still need to be written. 3527 */ 3528 static void 3529 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) 3530 { 3531 range_tree_t *condense_tree; 3532 space_map_t *sm = msp->ms_sm; 3533 uint64_t txg = dmu_tx_get_txg(tx); 3534 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3535 3536 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3537 ASSERT(msp->ms_loaded); 3538 ASSERT(msp->ms_sm != NULL); 3539 3540 /* 3541 * In order to condense the space map, we need to change it so it 3542 * only describes which segments are currently allocated and free. 3543 * 3544 * All the current free space resides in the ms_allocatable, all 3545 * the ms_defer trees, and all the ms_allocating trees. We ignore 3546 * ms_freed because it is empty because we're in sync pass 1. We 3547 * ignore ms_freeing because these changes are not yet reflected 3548 * in the spacemap (they will be written later this txg). 3549 * 3550 * So to truncate the space map to represent all the entries of 3551 * previous TXGs we do the following: 3552 * 3553 * 1] We create a range tree (condense tree) that is 100% empty. 3554 * 2] We add to it all segments found in the ms_defer trees 3555 * as those segments are marked as free in the original space 3556 * map. We do the same with the ms_allocating trees for the same 3557 * reason. Adding these segments should be a relatively 3558 * inexpensive operation since we expect these trees to have a 3559 * small number of nodes. 3560 * 3] We vacate any unflushed allocs, since they are not frees we 3561 * need to add to the condense tree. Then we vacate any 3562 * unflushed frees as they should already be part of ms_allocatable. 3563 * 4] At this point, we would ideally like to add all segments 3564 * in the ms_allocatable tree from the condense tree. This way 3565 * we would write all the entries of the condense tree as the 3566 * condensed space map, which would only contain freed 3567 * segments with everything else assumed to be allocated. 3568 * 3569 * Doing so can be prohibitively expensive as ms_allocatable can 3570 * be large, and therefore computationally expensive to add to 3571 * the condense_tree. Instead we first sync out an entry marking 3572 * everything as allocated, then the condense_tree and then the 3573 * ms_allocatable, in the condensed space map. While this is not 3574 * optimal, it is typically close to optimal and more importantly 3575 * much cheaper to compute. 
3576 * 3577 * 5] Finally, as both of the unflushed trees were written to our 3578 * new and condensed metaslab space map, we basically flushed 3579 * all the unflushed changes to disk, thus we call 3580 * metaslab_flush_update(). 3581 */ 3582 ASSERT3U(spa_sync_pass(spa), ==, 1); 3583 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ 3584 3585 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " 3586 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 3587 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 3588 spa->spa_name, space_map_length(msp->ms_sm), 3589 range_tree_numsegs(msp->ms_allocatable), 3590 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 3591 3592 msp->ms_condense_wanted = B_FALSE; 3593 3594 range_seg_type_t type; 3595 uint64_t shift, start; 3596 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, 3597 &start, &shift); 3598 3599 condense_tree = range_tree_create(NULL, type, NULL, start, shift); 3600 3601 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3602 range_tree_walk(msp->ms_defer[t], 3603 range_tree_add, condense_tree); 3604 } 3605 3606 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 3607 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 3608 range_tree_add, condense_tree); 3609 } 3610 3611 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3612 metaslab_unflushed_changes_memused(msp)); 3613 spa->spa_unflushed_stats.sus_memused -= 3614 metaslab_unflushed_changes_memused(msp); 3615 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3616 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3617 3618 /* 3619 * We're about to drop the metaslab's lock thus allowing other 3620 * consumers to change its content. Set the metaslab's ms_condensing 3621 * flag to ensure that allocations on this metaslab do not occur 3622 * while we're in the middle of committing it to disk. This is only 3623 * critical for ms_allocatable as all other range trees use per TXG 3624 * views of their content. 3625 */ 3626 msp->ms_condensing = B_TRUE; 3627 3628 mutex_exit(&msp->ms_lock); 3629 uint64_t object = space_map_object(msp->ms_sm); 3630 space_map_truncate(sm, 3631 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 3632 zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); 3633 3634 /* 3635 * space_map_truncate() may have reallocated the spacemap object. 3636 * If so, update the vdev_ms_array. 3637 */ 3638 if (space_map_object(msp->ms_sm) != object) { 3639 object = space_map_object(msp->ms_sm); 3640 dmu_write(spa->spa_meta_objset, 3641 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * 3642 msp->ms_id, sizeof (uint64_t), &object, tx); 3643 } 3644 3645 /* 3646 * Note: 3647 * When the log space map feature is enabled, each space map will 3648 * always have ALLOCS followed by FREES for each sync pass. This is 3649 * typically true even when the log space map feature is disabled, 3650 * except for the case where a metaslab goes through metaslab_sync() 3651 * and gets condensed. In that case the metaslab's space map will have 3652 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS 3653 * followed by FREES (due to space_map_write() in metaslab_sync()) for 3654 * sync pass 1.
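 *
 * As an illustration (a sketch of the entry ordering produced by the
 * writes below together with metaslab_sync(), not an exact on-disk
 * layout), a space map condensed in sync pass 1 ends up with:
 *
 *	ALLOC	[ms_start, ms_size]	(everything marked allocated)
 *	FREE	ms_allocatable segments
 *	FREE	condense_tree segments
 *	ALLOC	this txg's allocations	(appended by metaslab_sync())
 *	FREE	this txg's frees	(appended by metaslab_sync())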
3655 */ 3656 range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, 3657 shift); 3658 range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); 3659 space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); 3660 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 3661 space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); 3662 3663 range_tree_vacate(condense_tree, NULL, NULL); 3664 range_tree_destroy(condense_tree); 3665 range_tree_vacate(tmp_tree, NULL, NULL); 3666 range_tree_destroy(tmp_tree); 3667 mutex_enter(&msp->ms_lock); 3668 3669 msp->ms_condensing = B_FALSE; 3670 metaslab_flush_update(msp, tx); 3671 } 3672 3673 /* 3674 * Called when the metaslab has been flushed (its own spacemap now reflects 3675 * all the contents of the pool-wide spacemap log). Updates the metaslab's 3676 * metadata and any pool-wide related log space map data (e.g. summary, 3677 * obsolete logs, etc..) to reflect that. 3678 */ 3679 static void 3680 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) 3681 { 3682 metaslab_group_t *mg = msp->ms_group; 3683 spa_t *spa = mg->mg_vd->vdev_spa; 3684 3685 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3686 3687 ASSERT3U(spa_sync_pass(spa), ==, 1); 3688 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3689 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3690 3691 /* 3692 * Just because a metaslab got flushed, that doesn't mean that 3693 * it will pass through metaslab_sync_done(). Thus, make sure to 3694 * update ms_synced_length here in case it doesn't. 3695 */ 3696 msp->ms_synced_length = space_map_length(msp->ms_sm); 3697 3698 /* 3699 * We may end up here from metaslab_condense() without the 3700 * feature being active. In that case this is a no-op. 3701 */ 3702 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 3703 return; 3704 3705 ASSERT(spa_syncing_log_sm(spa) != NULL); 3706 ASSERT(msp->ms_sm != NULL); 3707 ASSERT(metaslab_unflushed_txg(msp) != 0); 3708 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); 3709 3710 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); 3711 3712 /* update metaslab's position in our flushing tree */ 3713 uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); 3714 mutex_enter(&spa->spa_flushed_ms_lock); 3715 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 3716 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3717 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3718 mutex_exit(&spa->spa_flushed_ms_lock); 3719 3720 /* update metaslab counts of spa_log_sm_t nodes */ 3721 spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); 3722 spa_log_sm_increment_current_mscount(spa); 3723 3724 /* cleanup obsolete logs if any */ 3725 uint64_t log_blocks_before = spa_log_sm_nblocks(spa); 3726 spa_cleanup_old_sm_logs(spa, tx); 3727 uint64_t log_blocks_after = spa_log_sm_nblocks(spa); 3728 VERIFY3U(log_blocks_after, <=, log_blocks_before); 3729 3730 /* update log space map summary */ 3731 uint64_t blocks_gone = log_blocks_before - log_blocks_after; 3732 spa_log_summary_add_flushed_metaslab(spa); 3733 spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); 3734 spa_log_summary_decrement_blkcount(spa, blocks_gone); 3735 } 3736 3737 boolean_t 3738 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) 3739 { 3740 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3741 3742 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3743 ASSERT3U(spa_sync_pass(spa), ==, 1); 3744 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 3745 3746 ASSERT(msp->ms_sm != NULL); 3747 
ASSERT(metaslab_unflushed_txg(msp) != 0); 3748 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); 3749 3750 /* 3751 * There is nothing wrong with flushing the same metaslab twice, as 3752 * this codepath should work on that case. However, the current 3753 * flushing scheme makes sure to avoid this situation as we would be 3754 * making all these calls without having anything meaningful to write 3755 * to disk. We assert this behavior here. 3756 */ 3757 ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); 3758 3759 /* 3760 * We can not flush while loading, because then we would 3761 * not load the ms_unflushed_{allocs,frees}. 3762 */ 3763 if (msp->ms_loading) 3764 return (B_FALSE); 3765 3766 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3767 metaslab_verify_weight_and_frag(msp); 3768 3769 /* 3770 * Metaslab condensing is effectively flushing. Therefore if the 3771 * metaslab can be condensed we can just condense it instead of 3772 * flushing it. 3773 * 3774 * Note that metaslab_condense() does call metaslab_flush_update() 3775 * so we can just return immediately after condensing. We also 3776 * don't need to care about setting ms_flushing or broadcasting 3777 * ms_flush_cv, even if we temporarily drop the ms_lock in 3778 * metaslab_condense(), as the metaslab is already loaded. 3779 */ 3780 if (msp->ms_loaded && metaslab_should_condense(msp)) { 3781 metaslab_group_t *mg = msp->ms_group; 3782 3783 /* 3784 * For all histogram operations below refer to the 3785 * comments of metaslab_sync() where we follow a 3786 * similar procedure. 3787 */ 3788 metaslab_group_histogram_verify(mg); 3789 metaslab_class_histogram_verify(mg->mg_class); 3790 metaslab_group_histogram_remove(mg, msp); 3791 3792 metaslab_condense(msp, tx); 3793 3794 space_map_histogram_clear(msp->ms_sm); 3795 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 3796 ASSERT(range_tree_is_empty(msp->ms_freed)); 3797 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3798 space_map_histogram_add(msp->ms_sm, 3799 msp->ms_defer[t], tx); 3800 } 3801 metaslab_aux_histograms_update(msp); 3802 3803 metaslab_group_histogram_add(mg, msp); 3804 metaslab_group_histogram_verify(mg); 3805 metaslab_class_histogram_verify(mg->mg_class); 3806 3807 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3808 3809 /* 3810 * Since we recreated the histogram (and potentially 3811 * the ms_sm too while condensing) ensure that the 3812 * weight is updated too because we are not guaranteed 3813 * that this metaslab is dirty and will go through 3814 * metaslab_sync_done(). 
3815 */ 3816 metaslab_recalculate_weight_and_sort(msp); 3817 return (B_TRUE); 3818 } 3819 3820 msp->ms_flushing = B_TRUE; 3821 uint64_t sm_len_before = space_map_length(msp->ms_sm); 3822 3823 mutex_exit(&msp->ms_lock); 3824 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, 3825 SM_NO_VDEVID, tx); 3826 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, 3827 SM_NO_VDEVID, tx); 3828 mutex_enter(&msp->ms_lock); 3829 3830 uint64_t sm_len_after = space_map_length(msp->ms_sm); 3831 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 3832 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " 3833 "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " 3834 "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), 3835 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 3836 range_tree_space(msp->ms_unflushed_allocs), 3837 range_tree_space(msp->ms_unflushed_frees), 3838 (sm_len_after - sm_len_before)); 3839 } 3840 3841 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3842 metaslab_unflushed_changes_memused(msp)); 3843 spa->spa_unflushed_stats.sus_memused -= 3844 metaslab_unflushed_changes_memused(msp); 3845 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3846 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3847 3848 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3849 metaslab_verify_weight_and_frag(msp); 3850 3851 metaslab_flush_update(msp, tx); 3852 3853 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3854 metaslab_verify_weight_and_frag(msp); 3855 3856 msp->ms_flushing = B_FALSE; 3857 cv_broadcast(&msp->ms_flush_cv); 3858 return (B_TRUE); 3859 } 3860 3861 /* 3862 * Write a metaslab to disk in the context of the specified transaction group. 3863 */ 3864 void 3865 metaslab_sync(metaslab_t *msp, uint64_t txg) 3866 { 3867 metaslab_group_t *mg = msp->ms_group; 3868 vdev_t *vd = mg->mg_vd; 3869 spa_t *spa = vd->vdev_spa; 3870 objset_t *mos = spa_meta_objset(spa); 3871 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 3872 dmu_tx_t *tx; 3873 3874 ASSERT(!vd->vdev_ishole); 3875 3876 /* 3877 * This metaslab has just been added so there's no work to do now. 3878 */ 3879 if (msp->ms_freeing == NULL) { 3880 ASSERT3P(alloctree, ==, NULL); 3881 return; 3882 } 3883 3884 ASSERT3P(alloctree, !=, NULL); 3885 ASSERT3P(msp->ms_freeing, !=, NULL); 3886 ASSERT3P(msp->ms_freed, !=, NULL); 3887 ASSERT3P(msp->ms_checkpointing, !=, NULL); 3888 ASSERT3P(msp->ms_trim, !=, NULL); 3889 3890 /* 3891 * Normally, we don't want to process a metaslab if there are no 3892 * allocations or frees to perform. However, if the metaslab is being 3893 * forced to condense, it's loaded and we're not beyond the final 3894 * dirty txg, we need to let it through. Not condensing beyond the 3895 * final dirty txg prevents an issue where metaslabs that need to be 3896 * condensed but were loaded for other reasons could cause a panic 3897 * here. By only checking the txg in that branch of the conditional, 3898 * we preserve the utility of the VERIFY statements in all other 3899 * cases. 3900 */ 3901 if (range_tree_is_empty(alloctree) && 3902 range_tree_is_empty(msp->ms_freeing) && 3903 range_tree_is_empty(msp->ms_checkpointing) && 3904 !(msp->ms_loaded && msp->ms_condense_wanted && 3905 txg <= spa_final_dirty_txg(spa))) 3906 return; 3907 3908 3909 VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); 3910 3911 /* 3912 * The only state that can actually be changing concurrently 3913 * with metaslab_sync() is the metaslab's ms_allocatable. 
No 3914 * other thread can be modifying this txg's alloc, freeing, 3915 * freed, or space_map_phys_t. We drop ms_lock whenever we 3916 * could call into the DMU, because the DMU can call down to 3917 * us (e.g. via zio_free()) at any time. 3918 * 3919 * The spa_vdev_remove_thread() can be reading metaslab state 3920 * concurrently, and it is locked out by the ms_sync_lock. 3921 * Note that the ms_lock is insufficient for this, because it 3922 * is dropped by space_map_write(). 3923 */ 3924 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 3925 3926 /* 3927 * Generate a log space map if one doesn't exist already. 3928 */ 3929 spa_generate_syncing_log_sm(spa, tx); 3930 3931 if (msp->ms_sm == NULL) { 3932 uint64_t new_object = space_map_alloc(mos, 3933 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 3934 zfs_metaslab_sm_blksz_with_log : 3935 zfs_metaslab_sm_blksz_no_log, tx); 3936 VERIFY3U(new_object, !=, 0); 3937 3938 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 3939 msp->ms_id, sizeof (uint64_t), &new_object, tx); 3940 3941 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 3942 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 3943 ASSERT(msp->ms_sm != NULL); 3944 3945 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3946 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3947 ASSERT0(metaslab_allocated_space(msp)); 3948 } 3949 3950 if (metaslab_unflushed_txg(msp) == 0 && 3951 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 3952 ASSERT(spa_syncing_log_sm(spa) != NULL); 3953 3954 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3955 spa_log_sm_increment_current_mscount(spa); 3956 spa_log_summary_add_flushed_metaslab(spa); 3957 3958 ASSERT(msp->ms_sm != NULL); 3959 mutex_enter(&spa->spa_flushed_ms_lock); 3960 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3961 mutex_exit(&spa->spa_flushed_ms_lock); 3962 3963 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3964 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3965 } 3966 3967 if (!range_tree_is_empty(msp->ms_checkpointing) && 3968 vd->vdev_checkpoint_sm == NULL) { 3969 ASSERT(spa_has_checkpoint(spa)); 3970 3971 uint64_t new_object = space_map_alloc(mos, 3972 zfs_vdev_standard_sm_blksz, tx); 3973 VERIFY3U(new_object, !=, 0); 3974 3975 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 3976 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 3977 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 3978 3979 /* 3980 * We save the space map object as an entry in vdev_top_zap 3981 * so it can be retrieved when the pool is reopened after an 3982 * export or through zdb. 3983 */ 3984 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 3985 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 3986 sizeof (new_object), 1, &new_object, tx)); 3987 } 3988 3989 mutex_enter(&msp->ms_sync_lock); 3990 mutex_enter(&msp->ms_lock); 3991 3992 /* 3993 * Note: metaslab_condense() clears the space map's histogram. 3994 * Therefore we must verify and remove this histogram before 3995 * condensing. 3996 */ 3997 metaslab_group_histogram_verify(mg); 3998 metaslab_class_histogram_verify(mg->mg_class); 3999 metaslab_group_histogram_remove(mg, msp); 4000 4001 if (spa->spa_sync_pass == 1 && msp->ms_loaded && 4002 metaslab_should_condense(msp)) 4003 metaslab_condense(msp, tx); 4004 4005 /* 4006 * We'll be going to disk to sync our space accounting, thus we 4007 * drop the ms_lock during that time so allocations coming from 4008 * open-context (ZIL) for future TXGs do not block. 
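 *
 * In sketch form, the pattern used below (and again for the
 * checkpoint space map further down) is:
 *
 *	mutex_exit(&msp->ms_lock);
 *	space_map_write(...);		calls into the DMU
 *	mutex_enter(&msp->ms_lock);
 *
 * which is safe because, as noted above, nothing else modifies this
 * txg's trees concurrently with metaslab_sync().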
4009 */ 4010 mutex_exit(&msp->ms_lock); 4011 space_map_t *log_sm = spa_syncing_log_sm(spa); 4012 if (log_sm != NULL) { 4013 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 4014 4015 space_map_write(log_sm, alloctree, SM_ALLOC, 4016 vd->vdev_id, tx); 4017 space_map_write(log_sm, msp->ms_freeing, SM_FREE, 4018 vd->vdev_id, tx); 4019 mutex_enter(&msp->ms_lock); 4020 4021 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 4022 metaslab_unflushed_changes_memused(msp)); 4023 spa->spa_unflushed_stats.sus_memused -= 4024 metaslab_unflushed_changes_memused(msp); 4025 range_tree_remove_xor_add(alloctree, 4026 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); 4027 range_tree_remove_xor_add(msp->ms_freeing, 4028 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); 4029 spa->spa_unflushed_stats.sus_memused += 4030 metaslab_unflushed_changes_memused(msp); 4031 } else { 4032 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 4033 4034 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 4035 SM_NO_VDEVID, tx); 4036 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 4037 SM_NO_VDEVID, tx); 4038 mutex_enter(&msp->ms_lock); 4039 } 4040 4041 msp->ms_allocated_space += range_tree_space(alloctree); 4042 ASSERT3U(msp->ms_allocated_space, >=, 4043 range_tree_space(msp->ms_freeing)); 4044 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 4045 4046 if (!range_tree_is_empty(msp->ms_checkpointing)) { 4047 ASSERT(spa_has_checkpoint(spa)); 4048 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 4049 4050 /* 4051 * Since we are doing writes to disk and the ms_checkpointing 4052 * tree won't be changing during that time, we drop the 4053 * ms_lock while writing to the checkpoint space map, for the 4054 * same reason mentioned above. 4055 */ 4056 mutex_exit(&msp->ms_lock); 4057 space_map_write(vd->vdev_checkpoint_sm, 4058 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 4059 mutex_enter(&msp->ms_lock); 4060 4061 spa->spa_checkpoint_info.sci_dspace += 4062 range_tree_space(msp->ms_checkpointing); 4063 vd->vdev_stat.vs_checkpoint_space += 4064 range_tree_space(msp->ms_checkpointing); 4065 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 4066 -space_map_allocated(vd->vdev_checkpoint_sm)); 4067 4068 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 4069 } 4070 4071 if (msp->ms_loaded) { 4072 /* 4073 * When the space map is loaded, we have an accurate 4074 * histogram in the range tree. This gives us an opportunity 4075 * to bring the space map's histogram up-to-date so we clear 4076 * it first before updating it. 4077 */ 4078 space_map_histogram_clear(msp->ms_sm); 4079 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 4080 4081 /* 4082 * Since we've cleared the histogram we need to add back 4083 * any free space that has already been processed, plus 4084 * any deferred space. This allows the on-disk histogram 4085 * to accurately reflect all free space even if some space 4086 * is not yet available for allocation (i.e. deferred). 4087 */ 4088 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 4089 4090 /* 4091 * Add back any deferred free space that has not been 4092 * added back into the in-core free tree yet. This will 4093 * ensure that we don't end up with a space map histogram 4094 * that is completely empty unless the metaslab is fully 4095 * allocated. 
4096 */ 4097 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 4098 space_map_histogram_add(msp->ms_sm, 4099 msp->ms_defer[t], tx); 4100 } 4101 } 4102 4103 /* 4104 * Always add the free space from this sync pass to the space 4105 * map histogram. We want to make sure that the on-disk histogram 4106 * accounts for all free space. If the space map is not loaded, 4107 * then we will lose some accuracy but will correct it the next 4108 * time we load the space map. 4109 */ 4110 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 4111 metaslab_aux_histograms_update(msp); 4112 4113 metaslab_group_histogram_add(mg, msp); 4114 metaslab_group_histogram_verify(mg); 4115 metaslab_class_histogram_verify(mg->mg_class); 4116 4117 /* 4118 * For sync pass 1, we avoid traversing this txg's free range tree 4119 * and instead will just swap the pointers for freeing and freed. 4120 * We can safely do this since the freed_tree is guaranteed to be 4121 * empty on the initial pass. 4122 * 4123 * Keep in mind that even if we are currently using a log spacemap 4124 * we want current frees to end up in the ms_allocatable (but not 4125 * get appended to the ms_sm) so their ranges can be reused as usual. 4126 */ 4127 if (spa_sync_pass(spa) == 1) { 4128 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 4129 ASSERT0(msp->ms_allocated_this_txg); 4130 } else { 4131 range_tree_vacate(msp->ms_freeing, 4132 range_tree_add, msp->ms_freed); 4133 } 4134 msp->ms_allocated_this_txg += range_tree_space(alloctree); 4135 range_tree_vacate(alloctree, NULL, NULL); 4136 4137 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 4138 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 4139 & TXG_MASK])); 4140 ASSERT0(range_tree_space(msp->ms_freeing)); 4141 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4142 4143 mutex_exit(&msp->ms_lock); 4144 4145 /* 4146 * Verify that the space map object ID has been recorded in the 4147 * vdev_ms_array. 4148 */ 4149 uint64_t object; 4150 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 4151 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); 4152 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); 4153 4154 mutex_exit(&msp->ms_sync_lock); 4155 dmu_tx_commit(tx); 4156 } 4157 4158 static void 4159 metaslab_evict(metaslab_t *msp, uint64_t txg) 4160 { 4161 if (!msp->ms_loaded || msp->ms_disabled != 0) 4162 return; 4163 4164 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 4165 VERIFY0(range_tree_space( 4166 msp->ms_allocating[(txg + t) & TXG_MASK])); 4167 } 4168 if (msp->ms_allocator != -1) 4169 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); 4170 4171 if (!metaslab_debug_unload) 4172 metaslab_unload(msp); 4173 } 4174 4175 /* 4176 * Called after a transaction group has completely synced to mark 4177 * all of the metaslab's free space as usable. 4178 */ 4179 void 4180 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 4181 { 4182 metaslab_group_t *mg = msp->ms_group; 4183 vdev_t *vd = mg->mg_vd; 4184 spa_t *spa = vd->vdev_spa; 4185 range_tree_t **defer_tree; 4186 int64_t alloc_delta, defer_delta; 4187 boolean_t defer_allowed = B_TRUE; 4188 4189 ASSERT(!vd->vdev_ishole); 4190 4191 mutex_enter(&msp->ms_lock); 4192 4193 /* 4194 * If this metaslab is just becoming available, initialize its 4195 * range trees and add its capacity to the vdev. 
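 *
 * In sketch form, the trees created below are:
 *
 *	ms_allocating[TXG_SIZE]		per-txg in-flight allocations
 *	ms_freeing / ms_freed		frees being processed this txg
 *	ms_defer[TXG_DEFER_SIZE]	deferred frees
 *	ms_checkpointing		frees referenced by the checkpoint
 *	ms_unflushed_allocs/frees	log spacemap deltas (the frees are
 *					also indexed by size)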
4196 */ 4197 if (msp->ms_freed == NULL) { 4198 range_seg_type_t type; 4199 uint64_t shift, start; 4200 type = metaslab_calculate_range_tree_type(vd, msp, &start, 4201 &shift); 4202 4203 for (int t = 0; t < TXG_SIZE; t++) { 4204 ASSERT(msp->ms_allocating[t] == NULL); 4205 4206 msp->ms_allocating[t] = range_tree_create(NULL, type, 4207 NULL, start, shift); 4208 } 4209 4210 ASSERT3P(msp->ms_freeing, ==, NULL); 4211 msp->ms_freeing = range_tree_create(NULL, type, NULL, start, 4212 shift); 4213 4214 ASSERT3P(msp->ms_freed, ==, NULL); 4215 msp->ms_freed = range_tree_create(NULL, type, NULL, start, 4216 shift); 4217 4218 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 4219 ASSERT3P(msp->ms_defer[t], ==, NULL); 4220 msp->ms_defer[t] = range_tree_create(NULL, type, NULL, 4221 start, shift); 4222 } 4223 4224 ASSERT3P(msp->ms_checkpointing, ==, NULL); 4225 msp->ms_checkpointing = range_tree_create(NULL, type, NULL, 4226 start, shift); 4227 4228 ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); 4229 msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, 4230 start, shift); 4231 4232 metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); 4233 mrap->mra_bt = &msp->ms_unflushed_frees_by_size; 4234 mrap->mra_floor_shift = metaslab_by_size_min_shift; 4235 ASSERT3P(msp->ms_unflushed_frees, ==, NULL); 4236 msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, 4237 type, mrap, start, shift); 4238 4239 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 4240 } 4241 ASSERT0(range_tree_space(msp->ms_freeing)); 4242 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4243 4244 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 4245 4246 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 4247 metaslab_class_get_alloc(spa_normal_class(spa)); 4248 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 4249 defer_allowed = B_FALSE; 4250 } 4251 4252 defer_delta = 0; 4253 alloc_delta = msp->ms_allocated_this_txg - 4254 range_tree_space(msp->ms_freed); 4255 4256 if (defer_allowed) { 4257 defer_delta = range_tree_space(msp->ms_freed) - 4258 range_tree_space(*defer_tree); 4259 } else { 4260 defer_delta -= range_tree_space(*defer_tree); 4261 } 4262 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 4263 defer_delta, 0); 4264 4265 if (spa_syncing_log_sm(spa) == NULL) { 4266 /* 4267 * If there's a metaslab_load() in progress and we don't have 4268 * a log space map, it means that we probably wrote to the 4269 * metaslab's space map. If this is the case, we need to 4270 * make sure that we wait for the load to complete so that we 4271 * have a consistent view of the in-core side of the metaslab. 4272 */ 4273 metaslab_load_wait(msp); 4274 } else { 4275 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 4276 } 4277 4278 /* 4279 * When auto-trimming is enabled, free ranges which are added to 4280 * ms_allocatable are also added to ms_trim. The ms_trim tree is 4281 * periodically consumed by the vdev_autotrim_thread() which issues 4282 * trims for all ranges and then vacates the tree. The ms_trim tree 4283 * can be discarded at any time with the sole consequence of recent 4284 * frees not being trimmed.
4285 */ 4286 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { 4287 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); 4288 if (!defer_allowed) { 4289 range_tree_walk(msp->ms_freed, range_tree_add, 4290 msp->ms_trim); 4291 } 4292 } else { 4293 range_tree_vacate(msp->ms_trim, NULL, NULL); 4294 } 4295 4296 /* 4297 * Move the frees from the defer_tree back to the free 4298 * range tree (if it's loaded). Swap the freed_tree and 4299 * the defer_tree -- this is safe to do because we've 4300 * just emptied out the defer_tree. 4301 */ 4302 range_tree_vacate(*defer_tree, 4303 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 4304 if (defer_allowed) { 4305 range_tree_swap(&msp->ms_freed, defer_tree); 4306 } else { 4307 range_tree_vacate(msp->ms_freed, 4308 msp->ms_loaded ? range_tree_add : NULL, 4309 msp->ms_allocatable); 4310 } 4311 4312 msp->ms_synced_length = space_map_length(msp->ms_sm); 4313 4314 msp->ms_deferspace += defer_delta; 4315 ASSERT3S(msp->ms_deferspace, >=, 0); 4316 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 4317 if (msp->ms_deferspace != 0) { 4318 /* 4319 * Keep syncing this metaslab until all deferred frees 4320 * are back in circulation. 4321 */ 4322 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 4323 } 4324 metaslab_aux_histograms_update_done(msp, defer_allowed); 4325 4326 if (msp->ms_new) { 4327 msp->ms_new = B_FALSE; 4328 mutex_enter(&mg->mg_lock); 4329 mg->mg_ms_ready++; 4330 mutex_exit(&mg->mg_lock); 4331 } 4332 4333 /* 4334 * Re-sort metaslab within its group now that we've adjusted 4335 * its allocatable space. 4336 */ 4337 metaslab_recalculate_weight_and_sort(msp); 4338 4339 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 4340 ASSERT0(range_tree_space(msp->ms_freeing)); 4341 ASSERT0(range_tree_space(msp->ms_freed)); 4342 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4343 msp->ms_allocating_total -= msp->ms_allocated_this_txg; 4344 msp->ms_allocated_this_txg = 0; 4345 mutex_exit(&msp->ms_lock); 4346 } 4347 4348 void 4349 metaslab_sync_reassess(metaslab_group_t *mg) 4350 { 4351 spa_t *spa = mg->mg_class->mc_spa; 4352 4353 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4354 metaslab_group_alloc_update(mg); 4355 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 4356 4357 /* 4358 * Preload the next potential metaslabs but only on active 4359 * metaslab groups. We can get into a state where the metaslab 4360 * is no longer active since we dirty metaslabs as we remove a 4361 * device, thus potentially making the metaslab group eligible 4362 * for preloading. 4363 */ 4364 if (mg->mg_activation_count > 0) { 4365 metaslab_group_preload(mg); 4366 } 4367 spa_config_exit(spa, SCL_ALLOC, FTAG); 4368 } 4369 4370 /* 4371 * When writing a ditto block (i.e. more than one DVA for a given BP) on 4372 * the same vdev as an existing DVA of this BP, try to allocate it 4373 * on a different metaslab than existing DVAs (i.e. a unique metaslab).
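 *
 * For example (hypothetical numbers): if an existing DVA lives on
 * vdev 3 at offset 0x900000000 and this metaslab also belongs to
 * vdev 3 with vdev_ms_shift = 34, the DVA maps to metaslab id
 * 0x900000000 >> 34 = 2, so the function below reports the metaslab
 * as unique only if its ms_id differs from 2.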
4374 */ 4375 static boolean_t 4376 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 4377 { 4378 uint64_t dva_ms_id; 4379 4380 if (DVA_GET_ASIZE(dva) == 0) 4381 return (B_TRUE); 4382 4383 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 4384 return (B_TRUE); 4385 4386 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 4387 4388 return (msp->ms_id != dva_ms_id); 4389 } 4390 4391 /* 4392 * ========================================================================== 4393 * Metaslab allocation tracing facility 4394 * ========================================================================== 4395 */ 4396 #ifdef _METASLAB_TRACING 4397 4398 /* 4399 * Add an allocation trace element to the allocation tracing list. 4400 */ 4401 static void 4402 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 4403 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 4404 int allocator) 4405 { 4406 metaslab_alloc_trace_t *mat; 4407 4408 if (!metaslab_trace_enabled) 4409 return; 4410 4411 /* 4412 * When the tracing list reaches its maximum we remove 4413 * the second element in the list before adding a new one. 4414 * By removing the second element we preserve the original 4415 * entry as a clue to what allocation steps have already been 4416 * performed. 4417 */ 4418 if (zal->zal_size == metaslab_trace_max_entries) { 4419 metaslab_alloc_trace_t *mat_next; 4420 #ifdef ZFS_DEBUG 4421 panic("too many entries in allocation list"); 4422 #endif 4423 METASLABSTAT_BUMP(metaslabstat_trace_over_limit); 4424 zal->zal_size--; 4425 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 4426 list_remove(&zal->zal_list, mat_next); 4427 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 4428 } 4429 4430 mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 4431 list_link_init(&mat->mat_list_node); 4432 mat->mat_mg = mg; 4433 mat->mat_msp = msp; 4434 mat->mat_size = psize; 4435 mat->mat_dva_id = dva_id; 4436 mat->mat_offset = offset; 4437 mat->mat_weight = 0; 4438 mat->mat_allocator = allocator; 4439 4440 if (msp != NULL) 4441 mat->mat_weight = msp->ms_weight; 4442 4443 /* 4444 * The list is part of the zio so locking is not required. Only 4445 * a single thread will perform allocations for a given zio.
4446 */ 4447 list_insert_tail(&zal->zal_list, mat); 4448 zal->zal_size++; 4449 4450 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 4451 } 4452 4453 void 4454 metaslab_trace_init(zio_alloc_list_t *zal) 4455 { 4456 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 4457 offsetof(metaslab_alloc_trace_t, mat_list_node)); 4458 zal->zal_size = 0; 4459 } 4460 4461 void 4462 metaslab_trace_fini(zio_alloc_list_t *zal) 4463 { 4464 metaslab_alloc_trace_t *mat; 4465 4466 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 4467 kmem_cache_free(metaslab_alloc_trace_cache, mat); 4468 list_destroy(&zal->zal_list); 4469 zal->zal_size = 0; 4470 } 4471 #else 4472 4473 #define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) 4474 4475 void 4476 metaslab_trace_init(zio_alloc_list_t *zal) 4477 { 4478 } 4479 4480 void 4481 metaslab_trace_fini(zio_alloc_list_t *zal) 4482 { 4483 } 4484 4485 #endif /* _METASLAB_TRACING */ 4486 4487 /* 4488 * ========================================================================== 4489 * Metaslab block operations 4490 * ========================================================================== 4491 */ 4492 4493 static void 4494 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 4495 int allocator) 4496 { 4497 if (!(flags & METASLAB_ASYNC_ALLOC) || 4498 (flags & METASLAB_DONT_THROTTLE)) 4499 return; 4500 4501 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4502 if (!mg->mg_class->mc_alloc_throttle_enabled) 4503 return; 4504 4505 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4506 (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); 4507 } 4508 4509 static void 4510 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 4511 { 4512 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4513 uint64_t max = mg->mg_max_alloc_queue_depth; 4514 uint64_t cur = mga->mga_cur_max_alloc_queue_depth; 4515 while (cur < max) { 4516 if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, 4517 cur, cur + 1) == cur) { 4518 atomic_inc_64( 4519 &mg->mg_class->mc_alloc_max_slots[allocator]); 4520 return; 4521 } 4522 cur = mga->mga_cur_max_alloc_queue_depth; 4523 } 4524 } 4525 4526 void 4527 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 4528 int allocator, boolean_t io_complete) 4529 { 4530 if (!(flags & METASLAB_ASYNC_ALLOC) || 4531 (flags & METASLAB_DONT_THROTTLE)) 4532 return; 4533 4534 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4535 if (!mg->mg_class->mc_alloc_throttle_enabled) 4536 return; 4537 4538 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4539 (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); 4540 if (io_complete) 4541 metaslab_group_increment_qdepth(mg, allocator); 4542 } 4543 4544 void 4545 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 4546 int allocator) 4547 { 4548 #ifdef ZFS_DEBUG 4549 const dva_t *dva = bp->blk_dva; 4550 int ndvas = BP_GET_NDVAS(bp); 4551 4552 for (int d = 0; d < ndvas; d++) { 4553 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 4554 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4555 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4556 VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); 4557 } 4558 #endif 4559 } 4560 4561 static uint64_t 4562 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 4563 { 4564 uint64_t start; 4565 range_tree_t *rt = msp->ms_allocatable; 4566 metaslab_class_t *mc = 
msp->ms_group->mg_class; 4567 4568 ASSERT(MUTEX_HELD(&msp->ms_lock)); 4569 VERIFY(!msp->ms_condensing); 4570 VERIFY0(msp->ms_disabled); 4571 4572 start = mc->mc_ops->msop_alloc(msp, size); 4573 if (start != -1ULL) { 4574 metaslab_group_t *mg = msp->ms_group; 4575 vdev_t *vd = mg->mg_vd; 4576 4577 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 4578 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4579 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 4580 range_tree_remove(rt, start, size); 4581 range_tree_clear(msp->ms_trim, start, size); 4582 4583 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4584 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 4585 4586 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 4587 msp->ms_allocating_total += size; 4588 4589 /* Track the last successful allocation */ 4590 msp->ms_alloc_txg = txg; 4591 metaslab_verify_space(msp, txg); 4592 } 4593 4594 /* 4595 * Now that we've attempted the allocation we need to update the 4596 * metaslab's maximum block size since it may have changed. 4597 */ 4598 msp->ms_max_size = metaslab_largest_allocatable(msp); 4599 return (start); 4600 } 4601 4602 /* 4603 * Find the metaslab with the highest weight that is less than what we've 4604 * already tried. In the common case, this means that we will examine each 4605 * metaslab at most once. Note that concurrent callers could reorder metaslabs 4606 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 4607 * activated by another thread, and we fail to allocate from the metaslab we 4608 * have selected, we may not try the newly-activated metaslab, and instead 4609 * activate another metaslab. This is not optimal, but generally does not cause 4610 * any problems (a possible exception being if every metaslab is completely full 4611 * except for the newly-activated metaslab which we fail to examine). 4612 */ 4613 static metaslab_t * 4614 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 4615 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 4616 boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, 4617 boolean_t *was_active) 4618 { 4619 avl_index_t idx; 4620 avl_tree_t *t = &mg->mg_metaslab_tree; 4621 metaslab_t *msp = avl_find(t, search, &idx); 4622 if (msp == NULL) 4623 msp = avl_nearest(t, idx, AVL_AFTER); 4624 4625 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 4626 int i; 4627 if (!metaslab_should_allocate(msp, asize, try_hard)) { 4628 metaslab_trace_add(zal, mg, msp, asize, d, 4629 TRACE_TOO_SMALL, allocator); 4630 continue; 4631 } 4632 4633 /* 4634 * If the selected metaslab is condensing or disabled, 4635 * skip it. 4636 */ 4637 if (msp->ms_condensing || msp->ms_disabled > 0) 4638 continue; 4639 4640 *was_active = msp->ms_allocator != -1; 4641 /* 4642 * If we're activating as primary, this is our first allocation 4643 * from this disk, so we don't need to check how close we are. 4644 * If the metaslab under consideration was already active, 4645 * we're getting desperate enough to steal another allocator's 4646 * metaslab, so we still don't care about distances. 
4647 */ 4648 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 4649 break; 4650 4651 for (i = 0; i < d; i++) { 4652 if (want_unique && 4653 !metaslab_is_unique(msp, &dva[i])) 4654 break; /* try another metaslab */ 4655 } 4656 if (i == d) 4657 break; 4658 } 4659 4660 if (msp != NULL) { 4661 search->ms_weight = msp->ms_weight; 4662 search->ms_start = msp->ms_start + 1; 4663 search->ms_allocator = msp->ms_allocator; 4664 search->ms_primary = msp->ms_primary; 4665 } 4666 return (msp); 4667 } 4668 4669 static void 4670 metaslab_active_mask_verify(metaslab_t *msp) 4671 { 4672 ASSERT(MUTEX_HELD(&msp->ms_lock)); 4673 4674 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 4675 return; 4676 4677 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) 4678 return; 4679 4680 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { 4681 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 4682 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); 4683 VERIFY3S(msp->ms_allocator, !=, -1); 4684 VERIFY(msp->ms_primary); 4685 return; 4686 } 4687 4688 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { 4689 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 4690 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); 4691 VERIFY3S(msp->ms_allocator, !=, -1); 4692 VERIFY(!msp->ms_primary); 4693 return; 4694 } 4695 4696 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 4697 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 4698 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 4699 VERIFY3S(msp->ms_allocator, ==, -1); 4700 return; 4701 } 4702 } 4703 4704 /* ARGSUSED */ 4705 static uint64_t 4706 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 4707 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, 4708 int allocator, boolean_t try_hard) 4709 { 4710 metaslab_t *msp = NULL; 4711 uint64_t offset = -1ULL; 4712 4713 uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; 4714 for (int i = 0; i < d; i++) { 4715 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 4716 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 4717 activation_weight = METASLAB_WEIGHT_SECONDARY; 4718 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 4719 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 4720 activation_weight = METASLAB_WEIGHT_CLAIM; 4721 break; 4722 } 4723 } 4724 4725 /* 4726 * If we don't have enough metaslabs active to fill the entire array, we 4727 * just use the 0th slot. 4728 */ 4729 if (mg->mg_ms_ready < mg->mg_allocators * 3) 4730 allocator = 0; 4731 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4732 4733 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 4734 4735 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 4736 search->ms_weight = UINT64_MAX; 4737 search->ms_start = 0; 4738 /* 4739 * At the end of the metaslab tree are the already-active metaslabs, 4740 * first the primaries, then the secondaries. When we resume searching 4741 * through the tree, we need to consider ms_allocator and ms_primary so 4742 * we start in the location right after where we left off, and don't 4743 * accidentally loop forever considering the same metaslabs. 
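 *
 * A brief sketch of how the search cursor is used: the dummy 'search'
 * metaslab carries the sort key of the last metaslab examined
 * (ms_weight, ms_start, ms_allocator, ms_primary), and
 * find_valid_metaslab() uses avl_find()/avl_nearest(..., AVL_AFTER)
 * on that key, so each iteration of the loop below resumes strictly
 * after the previous position in the tree.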
4744 */ 4745 search->ms_allocator = -1; 4746 search->ms_primary = B_TRUE; 4747 for (;;) { 4748 boolean_t was_active = B_FALSE; 4749 4750 mutex_enter(&mg->mg_lock); 4751 4752 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 4753 mga->mga_primary != NULL) { 4754 msp = mga->mga_primary; 4755 4756 /* 4757 * Even though we don't hold the ms_lock for the 4758 * primary metaslab, those fields should not 4759 * change while we hold the mg_lock. Thus it is 4760 * safe to make assertions on them. 4761 */ 4762 ASSERT(msp->ms_primary); 4763 ASSERT3S(msp->ms_allocator, ==, allocator); 4764 ASSERT(msp->ms_loaded); 4765 4766 was_active = B_TRUE; 4767 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 4768 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 4769 mga->mga_secondary != NULL) { 4770 msp = mga->mga_secondary; 4771 4772 /* 4773 * See comment above about the similar assertions 4774 * for the primary metaslab. 4775 */ 4776 ASSERT(!msp->ms_primary); 4777 ASSERT3S(msp->ms_allocator, ==, allocator); 4778 ASSERT(msp->ms_loaded); 4779 4780 was_active = B_TRUE; 4781 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 4782 } else { 4783 msp = find_valid_metaslab(mg, activation_weight, dva, d, 4784 want_unique, asize, allocator, try_hard, zal, 4785 search, &was_active); 4786 } 4787 4788 mutex_exit(&mg->mg_lock); 4789 if (msp == NULL) { 4790 kmem_free(search, sizeof (*search)); 4791 return (-1ULL); 4792 } 4793 mutex_enter(&msp->ms_lock); 4794 4795 metaslab_active_mask_verify(msp); 4796 4797 /* 4798 * This code is disabled out because of issues with 4799 * tracepoints in non-gpl kernel modules. 4800 */ 4801 #if 0 4802 DTRACE_PROBE3(ms__activation__attempt, 4803 metaslab_t *, msp, uint64_t, activation_weight, 4804 boolean_t, was_active); 4805 #endif 4806 4807 /* 4808 * Ensure that the metaslab we have selected is still 4809 * capable of handling our request. It's possible that 4810 * another thread may have changed the weight while we 4811 * were blocked on the metaslab lock. We check the 4812 * active status first to see if we need to set_selected_txg 4813 * a new metaslab. 4814 */ 4815 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 4816 ASSERT3S(msp->ms_allocator, ==, -1); 4817 mutex_exit(&msp->ms_lock); 4818 continue; 4819 } 4820 4821 /* 4822 * If the metaslab was activated for another allocator 4823 * while we were waiting in the ms_lock above, or it's 4824 * a primary and we're seeking a secondary (or vice versa), 4825 * we go back and select a new metaslab. 4826 */ 4827 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 4828 (msp->ms_allocator != -1) && 4829 (msp->ms_allocator != allocator || ((activation_weight == 4830 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 4831 ASSERT(msp->ms_loaded); 4832 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || 4833 msp->ms_allocator != -1); 4834 mutex_exit(&msp->ms_lock); 4835 continue; 4836 } 4837 4838 /* 4839 * This metaslab was used for claiming regions allocated 4840 * by the ZIL during pool import. Once these regions are 4841 * claimed we don't need to keep the CLAIM bit set 4842 * anymore. Passivate this metaslab to zero its activation 4843 * mask. 
4844 */ 4845 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 4846 activation_weight != METASLAB_WEIGHT_CLAIM) { 4847 ASSERT(msp->ms_loaded); 4848 ASSERT3S(msp->ms_allocator, ==, -1); 4849 metaslab_passivate(msp, msp->ms_weight & 4850 ~METASLAB_WEIGHT_CLAIM); 4851 mutex_exit(&msp->ms_lock); 4852 continue; 4853 } 4854 4855 metaslab_set_selected_txg(msp, txg); 4856 4857 int activation_error = 4858 metaslab_activate(msp, allocator, activation_weight); 4859 metaslab_active_mask_verify(msp); 4860 4861 /* 4862 * If the metaslab was activated by another thread for 4863 * another allocator or activation_weight (EBUSY), or it 4864 * failed because another metaslab was assigned as primary 4865 * for this allocator (EEXIST) we continue using this 4866 * metaslab for our allocation, rather than going on to a 4867 * worse metaslab (we waited for that metaslab to be loaded 4868 * after all). 4869 * 4870 * If the activation failed due to an I/O error or ENOSPC we 4871 * skip to the next metaslab. 4872 */ 4873 boolean_t activated; 4874 if (activation_error == 0) { 4875 activated = B_TRUE; 4876 } else if (activation_error == EBUSY || 4877 activation_error == EEXIST) { 4878 activated = B_FALSE; 4879 } else { 4880 mutex_exit(&msp->ms_lock); 4881 continue; 4882 } 4883 ASSERT(msp->ms_loaded); 4884 4885 /* 4886 * Now that we have the lock, recheck to see if we should 4887 * continue to use this metaslab for this allocation. The 4888 * metaslab is now loaded so metaslab_should_allocate() 4889 * can accurately determine if the allocation attempt should 4890 * proceed. 4891 */ 4892 if (!metaslab_should_allocate(msp, asize, try_hard)) { 4893 /* Passivate this metaslab and select a new one. */ 4894 metaslab_trace_add(zal, mg, msp, asize, d, 4895 TRACE_TOO_SMALL, allocator); 4896 goto next; 4897 } 4898 4899 /* 4900 * If this metaslab is currently condensing then pick again 4901 * as we can't manipulate this metaslab until it's committed 4902 * to disk. If this metaslab is being initialized, we shouldn't 4903 * allocate from it since the allocated region might be 4904 * overwritten after allocation. 4905 */ 4906 if (msp->ms_condensing) { 4907 metaslab_trace_add(zal, mg, msp, asize, d, 4908 TRACE_CONDENSING, allocator); 4909 if (activated) { 4910 metaslab_passivate(msp, msp->ms_weight & 4911 ~METASLAB_ACTIVE_MASK); 4912 } 4913 mutex_exit(&msp->ms_lock); 4914 continue; 4915 } else if (msp->ms_disabled > 0) { 4916 metaslab_trace_add(zal, mg, msp, asize, d, 4917 TRACE_DISABLED, allocator); 4918 if (activated) { 4919 metaslab_passivate(msp, msp->ms_weight & 4920 ~METASLAB_ACTIVE_MASK); 4921 } 4922 mutex_exit(&msp->ms_lock); 4923 continue; 4924 } 4925 4926 offset = metaslab_block_alloc(msp, asize, txg); 4927 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 4928 4929 if (offset != -1ULL) { 4930 /* Proactively passivate the metaslab, if needed */ 4931 if (activated) 4932 metaslab_segment_may_passivate(msp); 4933 break; 4934 } 4935 next: 4936 ASSERT(msp->ms_loaded); 4937 4938 /* 4939 * This code is disabled out because of issues with 4940 * tracepoints in non-gpl kernel modules. 4941 */ 4942 #if 0 4943 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, 4944 uint64_t, asize); 4945 #endif 4946 4947 /* 4948 * We were unable to allocate from this metaslab so determine 4949 * a new weight for this metaslab. Now that we have loaded 4950 * the metaslab we can provide a better hint to the metaslab 4951 * selector. 4952 * 4953 * For space-based metaslabs, we use the maximum block size.
4954 * This information is only available when the metaslab 4955 * is loaded and is more accurate than the generic free 4956 * space weight that was calculated by metaslab_weight(). 4957 * This information allows us to quickly compare the maximum 4958 * available allocation in the metaslab to the allocation 4959 * size being requested. 4960 * 4961 * For segment-based metaslabs, determine the new weight 4962 * based on the highest bucket in the range tree. We 4963 * explicitly use the loaded segment weight (i.e. the range 4964 * tree histogram) since it contains the space that is 4965 * currently available for allocation and is accurate 4966 * even within a sync pass. 4967 */ 4968 uint64_t weight; 4969 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 4970 weight = metaslab_largest_allocatable(msp); 4971 WEIGHT_SET_SPACEBASED(weight); 4972 } else { 4973 weight = metaslab_weight_from_range_tree(msp); 4974 } 4975 4976 if (activated) { 4977 metaslab_passivate(msp, weight); 4978 } else { 4979 /* 4980 * For the case where we use the metaslab that is 4981 * active for another allocator we want to make 4982 * sure that we retain the activation mask. 4983 * 4984 * Note that we could attempt to use something like 4985 * metaslab_recalculate_weight_and_sort() that 4986 * retains the activation mask here. That function 4987 * uses metaslab_weight() to set the weight though 4988 * which is not as accurate as the calculations 4989 * above. 4990 */ 4991 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; 4992 metaslab_group_sort(mg, msp, weight); 4993 } 4994 metaslab_active_mask_verify(msp); 4995 4996 /* 4997 * We have just failed an allocation attempt, check 4998 * that metaslab_should_allocate() agrees. Otherwise, 4999 * we may end up in an infinite loop retrying the same 5000 * metaslab. 5001 */ 5002 ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); 5003 5004 mutex_exit(&msp->ms_lock); 5005 } 5006 mutex_exit(&msp->ms_lock); 5007 kmem_free(search, sizeof (*search)); 5008 return (offset); 5009 } 5010 5011 static uint64_t 5012 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 5013 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, 5014 int allocator, boolean_t try_hard) 5015 { 5016 uint64_t offset; 5017 ASSERT(mg->mg_initialized); 5018 5019 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 5020 dva, d, allocator, try_hard); 5021 5022 mutex_enter(&mg->mg_lock); 5023 if (offset == -1ULL) { 5024 mg->mg_failed_allocations++; 5025 metaslab_trace_add(zal, mg, NULL, asize, d, 5026 TRACE_GROUP_FAILURE, allocator); 5027 if (asize == SPA_GANGBLOCKSIZE) { 5028 /* 5029 * This metaslab group was unable to allocate 5030 * the minimum gang block size so it must be out of 5031 * space. We must notify the allocation throttle 5032 * to start skipping allocation attempts to this 5033 * metaslab group until more space becomes available. 5034 * Note: this failure cannot be caused by the 5035 * allocation throttle since the allocation throttle 5036 * is only responsible for skipping devices and 5037 * not failing block allocations. 5038 */ 5039 mg->mg_no_free_space = B_TRUE; 5040 } 5041 } 5042 mg->mg_allocations++; 5043 mutex_exit(&mg->mg_lock); 5044 return (offset); 5045 } 5046 5047 /* 5048 * Allocate a block for the specified i/o. 
5049 */ 5050 int 5051 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 5052 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 5053 zio_alloc_list_t *zal, int allocator) 5054 { 5055 metaslab_group_t *mg, *fast_mg, *rotor; 5056 vdev_t *vd; 5057 boolean_t try_hard = B_FALSE; 5058 5059 ASSERT(!DVA_IS_VALID(&dva[d])); 5060 5061 /* 5062 * For testing, make some blocks above a certain size be gang blocks. 5063 * This will result in more split blocks when using device removal, 5064 * and a large number of split blocks coupled with ztest-induced 5065 * damage can result in extremely long reconstruction times. This 5066 * will also test spilling from special to normal. 5067 */ 5068 if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) { 5069 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 5070 allocator); 5071 return (SET_ERROR(ENOSPC)); 5072 } 5073 5074 /* 5075 * Start at the rotor and loop through all mgs until we find something. 5076 * Note that there's no locking on mc_rotor or mc_aliquot because 5077 * nothing actually breaks if we miss a few updates -- we just won't 5078 * allocate quite as evenly. It all balances out over time. 5079 * 5080 * If we are doing ditto or log blocks, try to spread them across 5081 * consecutive vdevs. If we're forced to reuse a vdev before we've 5082 * allocated all of our ditto blocks, then try and spread them out on 5083 * that vdev as much as possible. If it turns out to not be possible, 5084 * gradually lower our standards until anything becomes acceptable. 5085 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 5086 * gives us hope of containing our fault domains to something we're 5087 * able to reason about. Otherwise, any two top-level vdev failures 5088 * will guarantee the loss of data. With consecutive allocation, 5089 * only two adjacent top-level vdev failures will result in data loss. 5090 * 5091 * If we are doing gang blocks (hintdva is non-NULL), try to keep 5092 * ourselves on the same vdev as our gang block header. That 5093 * way, we can hope for locality in vdev_cache, plus it makes our 5094 * fault domains something tractable. 5095 */ 5096 if (hintdva) { 5097 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 5098 5099 /* 5100 * It's possible the vdev we're using as the hint no 5101 * longer exists or its mg has been closed (e.g. by 5102 * device removal). Consult the rotor when 5103 * all else fails. 5104 */ 5105 if (vd != NULL && vd->vdev_mg != NULL) { 5106 mg = vd->vdev_mg; 5107 5108 if (flags & METASLAB_HINTBP_AVOID && 5109 mg->mg_next != NULL) 5110 mg = mg->mg_next; 5111 } else { 5112 mg = mc->mc_rotor; 5113 } 5114 } else if (d != 0) { 5115 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 5116 mg = vd->vdev_mg->mg_next; 5117 } else if (flags & METASLAB_FASTWRITE) { 5118 mg = fast_mg = mc->mc_rotor; 5119 5120 do { 5121 if (fast_mg->mg_vd->vdev_pending_fastwrite < 5122 mg->mg_vd->vdev_pending_fastwrite) 5123 mg = fast_mg; 5124 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); 5125 5126 } else { 5127 ASSERT(mc->mc_rotor != NULL); 5128 mg = mc->mc_rotor; 5129 } 5130 5131 /* 5132 * If the hint put us into the wrong metaslab class, or into a 5133 * metaslab group that has been passivated, just follow the rotor. 
5134 */ 5135 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 5136 mg = mc->mc_rotor; 5137 5138 rotor = mg; 5139 top: 5140 do { 5141 boolean_t allocatable; 5142 5143 ASSERT(mg->mg_activation_count == 1); 5144 vd = mg->mg_vd; 5145 5146 /* 5147 * Don't allocate from faulted devices. 5148 */ 5149 if (try_hard) { 5150 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 5151 allocatable = vdev_allocatable(vd); 5152 spa_config_exit(spa, SCL_ZIO, FTAG); 5153 } else { 5154 allocatable = vdev_allocatable(vd); 5155 } 5156 5157 /* 5158 * Determine if the selected metaslab group is eligible 5159 * for allocations. If we're ganging then don't allow 5160 * this metaslab group to skip allocations since that would 5161 * inadvertently return ENOSPC and suspend the pool 5162 * even though space is still available. 5163 */ 5164 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 5165 allocatable = metaslab_group_allocatable(mg, rotor, 5166 psize, allocator, d); 5167 } 5168 5169 if (!allocatable) { 5170 metaslab_trace_add(zal, mg, NULL, psize, d, 5171 TRACE_NOT_ALLOCATABLE, allocator); 5172 goto next; 5173 } 5174 5175 ASSERT(mg->mg_initialized); 5176 5177 /* 5178 * Avoid writing single-copy data to a failing, 5179 * non-redundant vdev, unless we've already tried all 5180 * other vdevs. 5181 */ 5182 if ((vd->vdev_stat.vs_write_errors > 0 || 5183 vd->vdev_state < VDEV_STATE_HEALTHY) && 5184 d == 0 && !try_hard && vd->vdev_children == 0) { 5185 metaslab_trace_add(zal, mg, NULL, psize, d, 5186 TRACE_VDEV_ERROR, allocator); 5187 goto next; 5188 } 5189 5190 ASSERT(mg->mg_class == mc); 5191 5192 uint64_t asize = vdev_psize_to_asize(vd, psize); 5193 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 5194 5195 /* 5196 * If we don't need to try hard, then require that the 5197 * block be on a different metaslab from any other DVAs 5198 * in this BP (unique=true). If we are trying hard, then 5199 * allow any metaslab to be used (unique=false). 5200 */ 5201 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 5202 !try_hard, dva, d, allocator, try_hard); 5203 5204 if (offset != -1ULL) { 5205 /* 5206 * If we've just selected this metaslab group, 5207 * figure out whether the corresponding vdev is 5208 * over- or under-used relative to the pool, 5209 * and set an allocation bias to even it out. 5210 * 5211 * Bias is also used to compensate for unequally 5212 * sized vdevs so that space is allocated fairly. 5213 */ 5214 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 5215 vdev_stat_t *vs = &vd->vdev_stat; 5216 int64_t vs_free = vs->vs_space - vs->vs_alloc; 5217 int64_t mc_free = mc->mc_space - mc->mc_alloc; 5218 int64_t ratio; 5219 5220 /* 5221 * Calculate how much more or less we should 5222 * try to allocate from this device during 5223 * this iteration around the rotor. 5224 * 5225 * This basically introduces a zero-centered 5226 * bias towards the devices with the most 5227 * free space, while compensating for vdev 5228 * size differences. 
5229 * 5230 * Examples: 5231 * vdev V1 = 16M/128M 5232 * vdev V2 = 16M/128M 5233 * ratio(V1) = 100% ratio(V2) = 100% 5234 * 5235 * vdev V1 = 16M/128M 5236 * vdev V2 = 64M/128M 5237 * ratio(V1) = 127% ratio(V2) = 72% 5238 * 5239 * vdev V1 = 16M/128M 5240 * vdev V2 = 64M/512M 5241 * ratio(V1) = 40% ratio(V2) = 160% 5242 */ 5243 ratio = (vs_free * mc->mc_alloc_groups * 100) / 5244 (mc_free + 1); 5245 mg->mg_bias = ((ratio - 100) * 5246 (int64_t)mg->mg_aliquot) / 100; 5247 } else if (!metaslab_bias_enabled) { 5248 mg->mg_bias = 0; 5249 } 5250 5251 if ((flags & METASLAB_FASTWRITE) || 5252 atomic_add_64_nv(&mc->mc_aliquot, asize) >= 5253 mg->mg_aliquot + mg->mg_bias) { 5254 mc->mc_rotor = mg->mg_next; 5255 mc->mc_aliquot = 0; 5256 } 5257 5258 DVA_SET_VDEV(&dva[d], vd->vdev_id); 5259 DVA_SET_OFFSET(&dva[d], offset); 5260 DVA_SET_GANG(&dva[d], 5261 ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); 5262 DVA_SET_ASIZE(&dva[d], asize); 5263 5264 if (flags & METASLAB_FASTWRITE) { 5265 atomic_add_64(&vd->vdev_pending_fastwrite, 5266 psize); 5267 } 5268 5269 return (0); 5270 } 5271 next: 5272 mc->mc_rotor = mg->mg_next; 5273 mc->mc_aliquot = 0; 5274 } while ((mg = mg->mg_next) != rotor); 5275 5276 /* 5277 * If we haven't tried hard, do so now. 5278 */ 5279 if (!try_hard) { 5280 try_hard = B_TRUE; 5281 goto top; 5282 } 5283 5284 bzero(&dva[d], sizeof (dva_t)); 5285 5286 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 5287 return (SET_ERROR(ENOSPC)); 5288 } 5289 5290 void 5291 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 5292 boolean_t checkpoint) 5293 { 5294 metaslab_t *msp; 5295 spa_t *spa = vd->vdev_spa; 5296 5297 ASSERT(vdev_is_concrete(vd)); 5298 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5299 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 5300 5301 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5302 5303 VERIFY(!msp->ms_condensing); 5304 VERIFY3U(offset, >=, msp->ms_start); 5305 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 5306 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5307 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 5308 5309 metaslab_check_free_impl(vd, offset, asize); 5310 5311 mutex_enter(&msp->ms_lock); 5312 if (range_tree_is_empty(msp->ms_freeing) && 5313 range_tree_is_empty(msp->ms_checkpointing)) { 5314 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 5315 } 5316 5317 if (checkpoint) { 5318 ASSERT(spa_has_checkpoint(spa)); 5319 range_tree_add(msp->ms_checkpointing, offset, asize); 5320 } else { 5321 range_tree_add(msp->ms_freeing, offset, asize); 5322 } 5323 mutex_exit(&msp->ms_lock); 5324 } 5325 5326 /* ARGSUSED */ 5327 void 5328 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5329 uint64_t size, void *arg) 5330 { 5331 boolean_t *checkpoint = arg; 5332 5333 ASSERT3P(checkpoint, !=, NULL); 5334 5335 if (vd->vdev_ops->vdev_op_remap != NULL) 5336 vdev_indirect_mark_obsolete(vd, offset, size); 5337 else 5338 metaslab_free_impl(vd, offset, size, *checkpoint); 5339 } 5340 5341 static void 5342 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 5343 boolean_t checkpoint) 5344 { 5345 spa_t *spa = vd->vdev_spa; 5346 5347 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5348 5349 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 5350 return; 5351 5352 if (spa->spa_vdev_removal != NULL && 5353 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 5354 vdev_is_concrete(vd)) { 5355 /* 5356 * Note: we check if the vdev is concrete because when 5357 * 
we complete the removal, we first change the vdev to be 5358 * an indirect vdev (in open context), and then (in syncing 5359 * context) clear spa_vdev_removal. 5360 */ 5361 free_from_removing_vdev(vd, offset, size); 5362 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 5363 vdev_indirect_mark_obsolete(vd, offset, size); 5364 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5365 metaslab_free_impl_cb, &checkpoint); 5366 } else { 5367 metaslab_free_concrete(vd, offset, size, checkpoint); 5368 } 5369 } 5370 5371 typedef struct remap_blkptr_cb_arg { 5372 blkptr_t *rbca_bp; 5373 spa_remap_cb_t rbca_cb; 5374 vdev_t *rbca_remap_vd; 5375 uint64_t rbca_remap_offset; 5376 void *rbca_cb_arg; 5377 } remap_blkptr_cb_arg_t; 5378 5379 static void 5380 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5381 uint64_t size, void *arg) 5382 { 5383 remap_blkptr_cb_arg_t *rbca = arg; 5384 blkptr_t *bp = rbca->rbca_bp; 5385 5386 /* We can not remap split blocks. */ 5387 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 5388 return; 5389 ASSERT0(inner_offset); 5390 5391 if (rbca->rbca_cb != NULL) { 5392 /* 5393 * At this point we know that we are not handling split 5394 * blocks and we invoke the callback on the previous 5395 * vdev which must be indirect. 5396 */ 5397 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 5398 5399 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 5400 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 5401 5402 /* set up remap_blkptr_cb_arg for the next call */ 5403 rbca->rbca_remap_vd = vd; 5404 rbca->rbca_remap_offset = offset; 5405 } 5406 5407 /* 5408 * The phys birth time is that of dva[0]. This ensures that we know 5409 * when each dva was written, so that resilver can determine which 5410 * blocks need to be scrubbed (i.e. those written during the time 5411 * the vdev was offline). It also ensures that the key used in 5412 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 5413 * we didn't change the phys_birth, a lookup in the ARC for a 5414 * remapped BP could find the data that was previously stored at 5415 * this vdev + offset. 5416 */ 5417 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 5418 DVA_GET_VDEV(&bp->blk_dva[0])); 5419 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 5420 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 5421 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 5422 5423 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 5424 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 5425 } 5426 5427 /* 5428 * If the block pointer contains any indirect DVAs, modify them to refer to 5429 * concrete DVAs. Note that this will sometimes not be possible, leaving 5430 * the indirect DVA in place. This happens if the indirect DVA spans multiple 5431 * segments in the mapping (i.e. it is a "split block"). 5432 * 5433 * If the BP was remapped, calls the callback on the original dva (note the 5434 * callback can be called multiple times if the original indirect DVA refers 5435 * to another indirect DVA, etc). 5436 * 5437 * Returns TRUE if the BP was remapped. 5438 */ 5439 boolean_t 5440 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 5441 { 5442 remap_blkptr_cb_arg_t rbca; 5443 5444 if (!zfs_remap_blkptr_enable) 5445 return (B_FALSE); 5446 5447 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 5448 return (B_FALSE); 5449 5450 /* 5451 * Dedup BP's can not be remapped, because ddt_phys_select() depends 5452 * on DVA[0] being the same in the BP as in the DDT (dedup table). 
5453 */ 5454 if (BP_GET_DEDUP(bp)) 5455 return (B_FALSE); 5456 5457 /* 5458 * Gang blocks can not be remapped, because 5459 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 5460 * the BP used to read the gang block header (GBH) being the same 5461 * as the DVA[0] that we allocated for the GBH. 5462 */ 5463 if (BP_IS_GANG(bp)) 5464 return (B_FALSE); 5465 5466 /* 5467 * Embedded BP's have no DVA to remap. 5468 */ 5469 if (BP_GET_NDVAS(bp) < 1) 5470 return (B_FALSE); 5471 5472 /* 5473 * Note: we only remap dva[0]. If we remapped other dvas, we 5474 * would no longer know what their phys birth txg is. 5475 */ 5476 dva_t *dva = &bp->blk_dva[0]; 5477 5478 uint64_t offset = DVA_GET_OFFSET(dva); 5479 uint64_t size = DVA_GET_ASIZE(dva); 5480 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 5481 5482 if (vd->vdev_ops->vdev_op_remap == NULL) 5483 return (B_FALSE); 5484 5485 rbca.rbca_bp = bp; 5486 rbca.rbca_cb = callback; 5487 rbca.rbca_remap_vd = vd; 5488 rbca.rbca_remap_offset = offset; 5489 rbca.rbca_cb_arg = arg; 5490 5491 /* 5492 * remap_blkptr_cb() will be called in order for each level of 5493 * indirection, until a concrete vdev is reached or a split block is 5494 * encountered. old_vd and old_offset are updated within the callback 5495 * as we go from the one indirect vdev to the next one (either concrete 5496 * or indirect again) in that order. 5497 */ 5498 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 5499 5500 /* Check if the DVA wasn't remapped because it is a split block */ 5501 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 5502 return (B_FALSE); 5503 5504 return (B_TRUE); 5505 } 5506 5507 /* 5508 * Undo the allocation of a DVA which happened in the given transaction group. 5509 */ 5510 void 5511 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 5512 { 5513 metaslab_t *msp; 5514 vdev_t *vd; 5515 uint64_t vdev = DVA_GET_VDEV(dva); 5516 uint64_t offset = DVA_GET_OFFSET(dva); 5517 uint64_t size = DVA_GET_ASIZE(dva); 5518 5519 ASSERT(DVA_IS_VALID(dva)); 5520 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5521 5522 if (txg > spa_freeze_txg(spa)) 5523 return; 5524 5525 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || 5526 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 5527 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", 5528 (u_longlong_t)vdev, (u_longlong_t)offset, 5529 (u_longlong_t)size); 5530 return; 5531 } 5532 5533 ASSERT(!vd->vdev_removing); 5534 ASSERT(vdev_is_concrete(vd)); 5535 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 5536 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 5537 5538 if (DVA_GET_GANG(dva)) 5539 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5540 5541 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5542 5543 mutex_enter(&msp->ms_lock); 5544 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 5545 offset, size); 5546 msp->ms_allocating_total -= size; 5547 5548 VERIFY(!msp->ms_condensing); 5549 VERIFY3U(offset, >=, msp->ms_start); 5550 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 5551 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 5552 msp->ms_size); 5553 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5554 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 5555 range_tree_add(msp->ms_allocatable, offset, size); 5556 mutex_exit(&msp->ms_lock); 5557 } 5558 5559 /* 5560 * Free the block represented by the given DVA. 
5561 */ 5562 void 5563 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 5564 { 5565 uint64_t vdev = DVA_GET_VDEV(dva); 5566 uint64_t offset = DVA_GET_OFFSET(dva); 5567 uint64_t size = DVA_GET_ASIZE(dva); 5568 vdev_t *vd = vdev_lookup_top(spa, vdev); 5569 5570 ASSERT(DVA_IS_VALID(dva)); 5571 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5572 5573 if (DVA_GET_GANG(dva)) { 5574 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5575 } 5576 5577 metaslab_free_impl(vd, offset, size, checkpoint); 5578 } 5579 5580 /* 5581 * Reserve some allocation slots. The reservation system must be called 5582 * before we call into the allocator. If there aren't any available slots 5583 * then the I/O will be throttled until an I/O completes and its slots are 5584 * freed up. The function returns true if it was successful in placing 5585 * the reservation. 5586 */ 5587 boolean_t 5588 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 5589 zio_t *zio, int flags) 5590 { 5591 uint64_t available_slots = 0; 5592 boolean_t slot_reserved = B_FALSE; 5593 uint64_t max = mc->mc_alloc_max_slots[allocator]; 5594 5595 ASSERT(mc->mc_alloc_throttle_enabled); 5596 mutex_enter(&mc->mc_lock); 5597 5598 uint64_t reserved_slots = 5599 zfs_refcount_count(&mc->mc_alloc_slots[allocator]); 5600 if (reserved_slots < max) 5601 available_slots = max - reserved_slots; 5602 5603 if (slots <= available_slots || GANG_ALLOCATION(flags) || 5604 flags & METASLAB_MUST_RESERVE) { 5605 /* 5606 * We reserve the slots individually so that we can unreserve 5607 * them individually when an I/O completes. 5608 */ 5609 for (int d = 0; d < slots; d++) { 5610 reserved_slots = 5611 zfs_refcount_add(&mc->mc_alloc_slots[allocator], 5612 zio); 5613 } 5614 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 5615 slot_reserved = B_TRUE; 5616 } 5617 5618 mutex_exit(&mc->mc_lock); 5619 return (slot_reserved); 5620 } 5621 5622 void 5623 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 5624 int allocator, zio_t *zio) 5625 { 5626 ASSERT(mc->mc_alloc_throttle_enabled); 5627 mutex_enter(&mc->mc_lock); 5628 for (int d = 0; d < slots; d++) { 5629 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], 5630 zio); 5631 } 5632 mutex_exit(&mc->mc_lock); 5633 } 5634 5635 static int 5636 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 5637 uint64_t txg) 5638 { 5639 metaslab_t *msp; 5640 spa_t *spa = vd->vdev_spa; 5641 int error = 0; 5642 5643 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 5644 return (SET_ERROR(ENXIO)); 5645 5646 ASSERT3P(vd->vdev_ms, !=, NULL); 5647 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5648 5649 mutex_enter(&msp->ms_lock); 5650 5651 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { 5652 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 5653 if (error == EBUSY) { 5654 ASSERT(msp->ms_loaded); 5655 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 5656 error = 0; 5657 } 5658 } 5659 5660 if (error == 0 && 5661 !range_tree_contains(msp->ms_allocatable, offset, size)) 5662 error = SET_ERROR(ENOENT); 5663 5664 if (error || txg == 0) { /* txg == 0 indicates dry run */ 5665 mutex_exit(&msp->ms_lock); 5666 return (error); 5667 } 5668 5669 VERIFY(!msp->ms_condensing); 5670 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5671 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 5672 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 5673 msp->ms_size); 5674 range_tree_remove(msp->ms_allocatable, offset, size); 5675 
range_tree_clear(msp->ms_trim, offset, size); 5676 5677 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 5678 metaslab_class_t *mc = msp->ms_group->mg_class; 5679 multilist_sublist_t *mls = 5680 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 5681 if (!multilist_link_active(&msp->ms_class_txg_node)) { 5682 msp->ms_selected_txg = txg; 5683 multilist_sublist_insert_head(mls, msp); 5684 } 5685 multilist_sublist_unlock(mls); 5686 5687 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 5688 vdev_dirty(vd, VDD_METASLAB, msp, txg); 5689 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 5690 offset, size); 5691 msp->ms_allocating_total += size; 5692 } 5693 5694 mutex_exit(&msp->ms_lock); 5695 5696 return (0); 5697 } 5698 5699 typedef struct metaslab_claim_cb_arg_t { 5700 uint64_t mcca_txg; 5701 int mcca_error; 5702 } metaslab_claim_cb_arg_t; 5703 5704 /* ARGSUSED */ 5705 static void 5706 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5707 uint64_t size, void *arg) 5708 { 5709 metaslab_claim_cb_arg_t *mcca_arg = arg; 5710 5711 if (mcca_arg->mcca_error == 0) { 5712 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 5713 size, mcca_arg->mcca_txg); 5714 } 5715 } 5716 5717 int 5718 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 5719 { 5720 if (vd->vdev_ops->vdev_op_remap != NULL) { 5721 metaslab_claim_cb_arg_t arg; 5722 5723 /* 5724 * Only zdb(1M) can claim on indirect vdevs. This is used 5725 * to detect leaks of mapped space (that are not accounted 5726 * for in the obsolete counts, spacemap, or bpobj). 5727 */ 5728 ASSERT(!spa_writeable(vd->vdev_spa)); 5729 arg.mcca_error = 0; 5730 arg.mcca_txg = txg; 5731 5732 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5733 metaslab_claim_impl_cb, &arg); 5734 5735 if (arg.mcca_error == 0) { 5736 arg.mcca_error = metaslab_claim_concrete(vd, 5737 offset, size, txg); 5738 } 5739 return (arg.mcca_error); 5740 } else { 5741 return (metaslab_claim_concrete(vd, offset, size, txg)); 5742 } 5743 } 5744 5745 /* 5746 * Intent log support: upon opening the pool after a crash, notify the SPA 5747 * of blocks that the intent log has allocated for immediate write, but 5748 * which are still considered free by the SPA because the last transaction 5749 * group didn't commit yet. 5750 */ 5751 static int 5752 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 5753 { 5754 uint64_t vdev = DVA_GET_VDEV(dva); 5755 uint64_t offset = DVA_GET_OFFSET(dva); 5756 uint64_t size = DVA_GET_ASIZE(dva); 5757 vdev_t *vd; 5758 5759 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 5760 return (SET_ERROR(ENXIO)); 5761 } 5762 5763 ASSERT(DVA_IS_VALID(dva)); 5764 5765 if (DVA_GET_GANG(dva)) 5766 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5767 5768 return (metaslab_claim_impl(vd, offset, size, txg)); 5769 } 5770 5771 int 5772 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 5773 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 5774 zio_alloc_list_t *zal, zio_t *zio, int allocator) 5775 { 5776 dva_t *dva = bp->blk_dva; 5777 dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; 5778 int error = 0; 5779 5780 ASSERT(bp->blk_birth == 0); 5781 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 5782 5783 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 5784 5785 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 5786 spa_config_exit(spa, SCL_ALLOC, FTAG); 5787 return (SET_ERROR(ENOSPC)); 5788 } 5789 5790 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 5791 ASSERT(BP_GET_NDVAS(bp) == 0); 5792 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 5793 ASSERT3P(zal, !=, NULL); 5794 5795 for (int d = 0; d < ndvas; d++) { 5796 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 5797 txg, flags, zal, allocator); 5798 if (error != 0) { 5799 for (d--; d >= 0; d--) { 5800 metaslab_unalloc_dva(spa, &dva[d], txg); 5801 metaslab_group_alloc_decrement(spa, 5802 DVA_GET_VDEV(&dva[d]), zio, flags, 5803 allocator, B_FALSE); 5804 bzero(&dva[d], sizeof (dva_t)); 5805 } 5806 spa_config_exit(spa, SCL_ALLOC, FTAG); 5807 return (error); 5808 } else { 5809 /* 5810 * Update the metaslab group's queue depth 5811 * based on the newly allocated dva. 5812 */ 5813 metaslab_group_alloc_increment(spa, 5814 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 5815 } 5816 5817 } 5818 ASSERT(error == 0); 5819 ASSERT(BP_GET_NDVAS(bp) == ndvas); 5820 5821 spa_config_exit(spa, SCL_ALLOC, FTAG); 5822 5823 BP_SET_BIRTH(bp, txg, 0); 5824 5825 return (0); 5826 } 5827 5828 void 5829 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 5830 { 5831 const dva_t *dva = bp->blk_dva; 5832 int ndvas = BP_GET_NDVAS(bp); 5833 5834 ASSERT(!BP_IS_HOLE(bp)); 5835 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 5836 5837 /* 5838 * If we have a checkpoint for the pool we need to make sure that 5839 * the blocks that we free that are part of the checkpoint won't be 5840 * reused until the checkpoint is discarded or we revert to it. 5841 * 5842 * The checkpoint flag is passed down the metaslab_free code path 5843 * and is set whenever we want to add a block to the checkpoint's 5844 * accounting. That is, we "checkpoint" blocks that existed at the 5845 * time the checkpoint was created and are therefore referenced by 5846 * the checkpointed uberblock. 5847 * 5848 * Note that, we don't checkpoint any blocks if the current 5849 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 5850 * normally as they will be referenced by the checkpointed uberblock. 5851 */ 5852 boolean_t checkpoint = B_FALSE; 5853 if (bp->blk_birth <= spa->spa_checkpoint_txg && 5854 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 5855 /* 5856 * At this point, if the block is part of the checkpoint 5857 * there is no way it was created in the current txg. 5858 */ 5859 ASSERT(!now); 5860 ASSERT3U(spa_syncing_txg(spa), ==, txg); 5861 checkpoint = B_TRUE; 5862 } 5863 5864 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 5865 5866 for (int d = 0; d < ndvas; d++) { 5867 if (now) { 5868 metaslab_unalloc_dva(spa, &dva[d], txg); 5869 } else { 5870 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 5871 metaslab_free_dva(spa, &dva[d], checkpoint); 5872 } 5873 } 5874 5875 spa_config_exit(spa, SCL_FREE, FTAG); 5876 } 5877 5878 int 5879 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 5880 { 5881 const dva_t *dva = bp->blk_dva; 5882 int ndvas = BP_GET_NDVAS(bp); 5883 int error = 0; 5884 5885 ASSERT(!BP_IS_HOLE(bp)); 5886 5887 if (txg != 0) { 5888 /* 5889 * First do a dry run to make sure all DVAs are claimable, 5890 * so we don't have to unwind from partial failures below. 
5891 */ 5892 if ((error = metaslab_claim(spa, bp, 0)) != 0) 5893 return (error); 5894 } 5895 5896 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 5897 5898 for (int d = 0; d < ndvas; d++) { 5899 error = metaslab_claim_dva(spa, &dva[d], txg); 5900 if (error != 0) 5901 break; 5902 } 5903 5904 spa_config_exit(spa, SCL_ALLOC, FTAG); 5905 5906 ASSERT(error == 0 || txg == 0); 5907 5908 return (error); 5909 } 5910 5911 void 5912 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) 5913 { 5914 const dva_t *dva = bp->blk_dva; 5915 int ndvas = BP_GET_NDVAS(bp); 5916 uint64_t psize = BP_GET_PSIZE(bp); 5917 int d; 5918 vdev_t *vd; 5919 5920 ASSERT(!BP_IS_HOLE(bp)); 5921 ASSERT(!BP_IS_EMBEDDED(bp)); 5922 ASSERT(psize > 0); 5923 5924 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 5925 5926 for (d = 0; d < ndvas; d++) { 5927 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) 5928 continue; 5929 atomic_add_64(&vd->vdev_pending_fastwrite, psize); 5930 } 5931 5932 spa_config_exit(spa, SCL_VDEV, FTAG); 5933 } 5934 5935 void 5936 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) 5937 { 5938 const dva_t *dva = bp->blk_dva; 5939 int ndvas = BP_GET_NDVAS(bp); 5940 uint64_t psize = BP_GET_PSIZE(bp); 5941 int d; 5942 vdev_t *vd; 5943 5944 ASSERT(!BP_IS_HOLE(bp)); 5945 ASSERT(!BP_IS_EMBEDDED(bp)); 5946 ASSERT(psize > 0); 5947 5948 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 5949 5950 for (d = 0; d < ndvas; d++) { 5951 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) 5952 continue; 5953 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); 5954 atomic_sub_64(&vd->vdev_pending_fastwrite, psize); 5955 } 5956 5957 spa_config_exit(spa, SCL_VDEV, FTAG); 5958 } 5959 5960 /* ARGSUSED */ 5961 static void 5962 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 5963 uint64_t size, void *arg) 5964 { 5965 if (vd->vdev_ops == &vdev_indirect_ops) 5966 return; 5967 5968 metaslab_check_free_impl(vd, offset, size); 5969 } 5970 5971 static void 5972 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 5973 { 5974 metaslab_t *msp; 5975 spa_t *spa __maybe_unused = vd->vdev_spa; 5976 5977 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 5978 return; 5979 5980 if (vd->vdev_ops->vdev_op_remap != NULL) { 5981 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5982 metaslab_check_free_impl_cb, NULL); 5983 return; 5984 } 5985 5986 ASSERT(vdev_is_concrete(vd)); 5987 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 5988 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5989 5990 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5991 5992 mutex_enter(&msp->ms_lock); 5993 if (msp->ms_loaded) { 5994 range_tree_verify_not_present(msp->ms_allocatable, 5995 offset, size); 5996 } 5997 5998 /* 5999 * Check all segments that currently exist in the freeing pipeline. 6000 * 6001 * It would intuitively make sense to also check the current allocating 6002 * tree since metaslab_unalloc_dva() exists for extents that are 6003 * allocated and freed in the same sync pass within the same txg. 6004 * Unfortunately there are places (e.g. the ZIL) where we allocate a 6005 * segment but then we free part of it within the same txg 6006 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the 6007 * current allocating tree. 
6008 */ 6009 range_tree_verify_not_present(msp->ms_freeing, offset, size); 6010 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 6011 range_tree_verify_not_present(msp->ms_freed, offset, size); 6012 for (int j = 0; j < TXG_DEFER_SIZE; j++) 6013 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 6014 range_tree_verify_not_present(msp->ms_trim, offset, size); 6015 mutex_exit(&msp->ms_lock); 6016 } 6017 6018 void 6019 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 6020 { 6021 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 6022 return; 6023 6024 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6025 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 6026 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 6027 vdev_t *vd = vdev_lookup_top(spa, vdev); 6028 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 6029 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 6030 6031 if (DVA_GET_GANG(&bp->blk_dva[i])) 6032 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 6033 6034 ASSERT3P(vd, !=, NULL); 6035 6036 metaslab_check_free_impl(vd, offset, size); 6037 } 6038 spa_config_exit(spa, SCL_VDEV, FTAG); 6039 } 6040 6041 static void 6042 metaslab_group_disable_wait(metaslab_group_t *mg) 6043 { 6044 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 6045 while (mg->mg_disabled_updating) { 6046 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 6047 } 6048 } 6049 6050 static void 6051 metaslab_group_disabled_increment(metaslab_group_t *mg) 6052 { 6053 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 6054 ASSERT(mg->mg_disabled_updating); 6055 6056 while (mg->mg_ms_disabled >= max_disabled_ms) { 6057 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 6058 } 6059 mg->mg_ms_disabled++; 6060 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); 6061 } 6062 6063 /* 6064 * Mark the metaslab as disabled to prevent any allocations on this metaslab. 6065 * We must also track how many metaslabs are currently disabled within a 6066 * metaslab group and limit them to prevent allocation failures from 6067 * occurring because all metaslabs are disabled. 6068 */ 6069 void 6070 metaslab_disable(metaslab_t *msp) 6071 { 6072 ASSERT(!MUTEX_HELD(&msp->ms_lock)); 6073 metaslab_group_t *mg = msp->ms_group; 6074 6075 mutex_enter(&mg->mg_ms_disabled_lock); 6076 6077 /* 6078 * To keep an accurate count of how many threads have disabled 6079 * a specific metaslab group, we only allow one thread to mark 6080 * the metaslab group at a time. This ensures that the value of 6081 * ms_disabled will be accurate when we decide to mark a metaslab 6082 * group as disabled. To do this we force all other threads 6083 * to wait until the metaslab group's mg_disabled_updating flag is no 6084 * longer set. 6085 */ 6086 metaslab_group_disable_wait(mg); 6087 mg->mg_disabled_updating = B_TRUE; 6088 if (msp->ms_disabled == 0) { 6089 metaslab_group_disabled_increment(mg); 6090 } 6091 mutex_enter(&msp->ms_lock); 6092 msp->ms_disabled++; 6093 mutex_exit(&msp->ms_lock); 6094 6095 mg->mg_disabled_updating = B_FALSE; 6096 cv_broadcast(&mg->mg_ms_disabled_cv); 6097 mutex_exit(&mg->mg_ms_disabled_lock); 6098 } 6099 6100 void 6101 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) 6102 { 6103 metaslab_group_t *mg = msp->ms_group; 6104 spa_t *spa = mg->mg_vd->vdev_spa; 6105 6106 /* 6107 * Wait for the outstanding IO to be synced to prevent newly 6108 * allocated blocks from being overwritten. This is used by 6109 * initialize and TRIM, which are modifying unallocated space.
6110 */ 6111 if (sync) 6112 txg_wait_synced(spa_get_dsl(spa), 0); 6113 6114 mutex_enter(&mg->mg_ms_disabled_lock); 6115 mutex_enter(&msp->ms_lock); 6116 if (--msp->ms_disabled == 0) { 6117 mg->mg_ms_disabled--; 6118 cv_broadcast(&mg->mg_ms_disabled_cv); 6119 if (unload) 6120 metaslab_unload(msp); 6121 } 6122 mutex_exit(&msp->ms_lock); 6123 mutex_exit(&mg->mg_ms_disabled_lock); 6124 } 6125 6126 static void 6127 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) 6128 { 6129 vdev_t *vd = ms->ms_group->mg_vd; 6130 spa_t *spa = vd->vdev_spa; 6131 objset_t *mos = spa_meta_objset(spa); 6132 6133 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 6134 6135 metaslab_unflushed_phys_t entry = { 6136 .msp_unflushed_txg = metaslab_unflushed_txg(ms), 6137 }; 6138 uint64_t entry_size = sizeof (entry); 6139 uint64_t entry_offset = ms->ms_id * entry_size; 6140 6141 uint64_t object = 0; 6142 int err = zap_lookup(mos, vd->vdev_top_zap, 6143 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 6144 &object); 6145 if (err == ENOENT) { 6146 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, 6147 SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); 6148 VERIFY0(zap_add(mos, vd->vdev_top_zap, 6149 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 6150 &object, tx)); 6151 } else { 6152 VERIFY0(err); 6153 } 6154 6155 dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, 6156 &entry, tx); 6157 } 6158 6159 void 6160 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) 6161 { 6162 spa_t *spa = ms->ms_group->mg_vd->vdev_spa; 6163 6164 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 6165 return; 6166 6167 ms->ms_unflushed_txg = txg; 6168 metaslab_update_ondisk_flush_data(ms, tx); 6169 } 6170 6171 uint64_t 6172 metaslab_unflushed_txg(metaslab_t *ms) 6173 { 6174 return (ms->ms_unflushed_txg); 6175 } 6176 6177 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW, 6178 "Allocation granularity (a.k.a. 
stripe size)"); 6179 6180 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, 6181 "Load all metaslabs when pool is first opened"); 6182 6183 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, 6184 "Prevent metaslabs from being unloaded"); 6185 6186 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, 6187 "Preload potential metaslabs during reassessment"); 6188 6189 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW, 6190 "Delay in txgs after metaslab was last used before unloading"); 6191 6192 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW, 6193 "Delay in milliseconds after metaslab was last used before unloading"); 6194 6195 /* BEGIN CSTYLED */ 6196 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW, 6197 "Percentage of metaslab group size that should be free to make it " 6198 "eligible for allocation"); 6199 6200 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW, 6201 "Percentage of metaslab group size that should be considered eligible " 6202 "for allocations unless all metaslab groups within the metaslab class " 6203 "have also crossed this threshold"); 6204 6205 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT, 6206 ZMOD_RW, "Fragmentation for metaslab to allow allocation"); 6207 6208 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, 6209 "Use the fragmentation metric to prefer less fragmented metaslabs"); 6210 /* END CSTYLED */ 6211 6212 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, 6213 "Prefer metaslabs with lower LBAs"); 6214 6215 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, 6216 "Enable metaslab group biasing"); 6217 6218 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, 6219 ZMOD_RW, "Enable segment-based metaslab selection"); 6220 6221 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, 6222 "Segment-based metaslab selection maximum buckets before switching"); 6223 6224 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW, 6225 "Blocks larger than this size are forced to be gang blocks"); 6226 6227 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW, 6228 "Max distance (bytes) to search forward before using size tree"); 6229 6230 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, 6231 "When looking in size tree, use largest segment instead of exact fit"); 6232 6233 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, 6234 ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); 6235 6236 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, 6237 "Percentage of memory that can be used to store metaslab range trees"); 6238