1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2019 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2017, Intel Corporation. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/space_map.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/vdev_draid.h> 36 #include <sys/zio.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zfeature.h> 39 #include <sys/vdev_indirect_mapping.h> 40 #include <sys/zap.h> 41 #include <sys/btree.h> 42 43 #define GANG_ALLOCATION(flags) \ 44 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 45 46 /* 47 * Metaslab granularity, in bytes. This is roughly similar to what would be 48 * referred to as the "stripe size" in traditional RAID arrays. In normal 49 * operation, we will try to write this amount of data to each disk before 50 * moving on to the next top-level vdev. 
51 */ 52 static uint64_t metaslab_aliquot = 1024 * 1024; 53 54 /* 55 * For testing, make some blocks above a certain size be gang blocks. 56 */ 57 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; 58 59 /* 60 * Of blocks of size >= metaslab_force_ganging, actually gang them this often. 61 */ 62 uint_t metaslab_force_ganging_pct = 3; 63 64 /* 65 * In pools where the log space map feature is not enabled we touch 66 * multiple metaslabs (and their respective space maps) with each 67 * transaction group. Thus, we benefit from having a small space map 68 * block size since it allows us to issue more I/O operations scattered 69 * around the disk. So a sane default for the space map block size 70 * is 8~16K. 71 */ 72 int zfs_metaslab_sm_blksz_no_log = (1 << 14); 73 74 /* 75 * When the log space map feature is enabled, we accumulate a lot of 76 * changes per metaslab that are flushed once in a while so we benefit 77 * from a bigger block size like 128K for the metaslab space maps. 78 */ 79 int zfs_metaslab_sm_blksz_with_log = (1 << 17); 80 81 /* 82 * The in-core space map representation is more compact than its on-disk form. 83 * The zfs_condense_pct determines how much more compact the in-core 84 * space map representation must be before we compact it on-disk. 85 * Values should be greater than or equal to 100. 86 */ 87 uint_t zfs_condense_pct = 200; 88 89 /* 90 * Condensing a metaslab is not guaranteed to actually reduce the amount of 91 * space used on disk. In particular, a space map uses data in increments of 92 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the 93 * same number of blocks after condensing. Since the goal of condensing is to 94 * reduce the number of IOPs required to read the space map, we only want to 95 * condense when we can be sure we will reduce the number of blocks used by the 96 * space map. 
Unfortunately, we cannot precisely compute whether or not this is 97 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 98 * we apply the following heuristic: do not condense a spacemap unless the 99 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 100 * blocks. 101 */ 102 static const int zfs_metaslab_condense_block_threshold = 4; 103 104 /* 105 * The zfs_mg_noalloc_threshold defines which metaslab groups should 106 * be eligible for allocation. The value is defined as a percentage of 107 * free space. Metaslab groups that have more free space than 108 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 109 * a metaslab group's free space is less than or equal to the 110 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 111 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 112 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 113 * groups are allowed to accept allocations. Gang blocks are always 114 * eligible to allocate on any metaslab group. The default value of 0 means 115 * no metaslab group will be excluded based on this criterion. 116 */ 117 static uint_t zfs_mg_noalloc_threshold = 0; 118 119 /* 120 * Metaslab groups are considered eligible for allocations if their 121 * fragmentation metric (measured as a percentage) is less than or 122 * equal to zfs_mg_fragmentation_threshold. If a metaslab group 123 * exceeds this threshold then it will be skipped unless all metaslab 124 * groups within the metaslab class have also crossed this threshold. 125 * 126 * This tunable was introduced to avoid edge cases where we continue 127 * allocating from very fragmented disks in our pool while other, less 128 * fragmented disks, exists. 
On the other hand, if all disks in the 129 * pool are uniformly approaching the threshold, the threshold can 130 * be a speed bump in performance, where we keep switching the disks 131 * that we allocate from (e.g. we allocate some segments from disk A 132 * making it bypassing the threshold while freeing segments from disk 133 * B getting its fragmentation below the threshold). 134 * 135 * Empirically, we've seen that our vdev selection for allocations is 136 * good enough that fragmentation increases uniformly across all vdevs 137 * the majority of the time. Thus we set the threshold percentage high 138 * enough to avoid hitting the speed bump on pools that are being pushed 139 * to the edge. 140 */ 141 static uint_t zfs_mg_fragmentation_threshold = 95; 142 143 /* 144 * Allow metaslabs to keep their active state as long as their fragmentation 145 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 146 * active metaslab that exceeds this threshold will no longer keep its active 147 * status allowing better metaslabs to be selected. 148 */ 149 static uint_t zfs_metaslab_fragmentation_threshold = 70; 150 151 /* 152 * When set will load all metaslabs when pool is first opened. 153 */ 154 int metaslab_debug_load = B_FALSE; 155 156 /* 157 * When set will prevent metaslabs from being unloaded. 158 */ 159 static int metaslab_debug_unload = B_FALSE; 160 161 /* 162 * Minimum size which forces the dynamic allocator to change 163 * it's allocation strategy. Once the space map cannot satisfy 164 * an allocation of this size then it switches to using more 165 * aggressive strategy (i.e search by size rather than offset). 166 */ 167 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 168 169 /* 170 * The minimum free space, in percent, which must be available 171 * in a space map to continue allocations in a first-fit fashion. 
172 * Once the space map's free space drops below this level we dynamically 173 * switch to using best-fit allocations. 174 */ 175 uint_t metaslab_df_free_pct = 4; 176 177 /* 178 * Maximum distance to search forward from the last offset. Without this 179 * limit, fragmented pools can see >100,000 iterations and 180 * metaslab_block_picker() becomes the performance limiting factor on 181 * high-performance storage. 182 * 183 * With the default setting of 16MB, we typically see less than 500 184 * iterations, even with very fragmented, ashift=9 pools. The maximum number 185 * of iterations possible is: 186 * metaslab_df_max_search / (2 * (1<<ashift)) 187 * With the default setting of 16MB this is 16*1024 (with ashift=9) or 188 * 2048 (with ashift=12). 189 */ 190 static uint_t metaslab_df_max_search = 16 * 1024 * 1024; 191 192 /* 193 * Forces the metaslab_block_picker function to search for at least this many 194 * segments forwards until giving up on finding a segment that the allocation 195 * will fit into. 196 */ 197 static const uint32_t metaslab_min_search_count = 100; 198 199 /* 200 * If we are not searching forward (due to metaslab_df_max_search, 201 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable 202 * controls what segment is used. If it is set, we will use the largest free 203 * segment. If it is not set, we will use a segment of exactly the requested 204 * size (or larger). 205 */ 206 static int metaslab_df_use_largest_segment = B_FALSE; 207 208 /* 209 * These tunables control how long a metaslab will remain loaded after the 210 * last allocation from it. A metaslab can't be unloaded until at least 211 * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds 212 * have elapsed. However, zfs_metaslab_mem_limit may cause it to be 213 * unloaded sooner. These settings are intended to be generous -- to keep 214 * metaslabs loaded for a long time, reducing the rate of metaslab loading. 
215 */ 216 static uint_t metaslab_unload_delay = 32; 217 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ 218 219 /* 220 * Max number of metaslabs per group to preload. 221 */ 222 uint_t metaslab_preload_limit = 10; 223 224 /* 225 * Enable/disable preloading of metaslab. 226 */ 227 static int metaslab_preload_enabled = B_TRUE; 228 229 /* 230 * Enable/disable fragmentation weighting on metaslabs. 231 */ 232 static int metaslab_fragmentation_factor_enabled = B_TRUE; 233 234 /* 235 * Enable/disable lba weighting (i.e. outer tracks are given preference). 236 */ 237 static int metaslab_lba_weighting_enabled = B_TRUE; 238 239 /* 240 * Enable/disable metaslab group biasing. 241 */ 242 static int metaslab_bias_enabled = B_TRUE; 243 244 /* 245 * Enable/disable remapping of indirect DVAs to their concrete vdevs. 246 */ 247 static const boolean_t zfs_remap_blkptr_enable = B_TRUE; 248 249 /* 250 * Enable/disable segment-based metaslab selection. 251 */ 252 static int zfs_metaslab_segment_weight_enabled = B_TRUE; 253 254 /* 255 * When using segment-based metaslab selection, we will continue 256 * allocating from the active metaslab until we have exhausted 257 * zfs_metaslab_switch_threshold of its buckets. 258 */ 259 static int zfs_metaslab_switch_threshold = 2; 260 261 /* 262 * Internal switch to enable/disable the metaslab allocation tracing 263 * facility. 264 */ 265 static const boolean_t metaslab_trace_enabled = B_FALSE; 266 267 /* 268 * Maximum entries that the metaslab allocation tracing facility will keep 269 * in a given list when running in non-debug mode. We limit the number 270 * of entries in non-debug mode to prevent us from using up too much memory. 271 * The limit should be sufficiently large that we don't expect any allocation 272 * to every exceed this value. In debug mode, the system will panic if this 273 * limit is ever reached allowing for further investigation. 
274 */ 275 static const uint64_t metaslab_trace_max_entries = 5000; 276 277 /* 278 * Maximum number of metaslabs per group that can be disabled 279 * simultaneously. 280 */ 281 static const int max_disabled_ms = 3; 282 283 /* 284 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. 285 * To avoid 64-bit overflow, don't set above UINT32_MAX. 286 */ 287 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */ 288 289 /* 290 * Maximum percentage of memory to use on storing loaded metaslabs. If loading 291 * a metaslab would take it over this percentage, the oldest selected metaslab 292 * is automatically unloaded. 293 */ 294 static uint_t zfs_metaslab_mem_limit = 25; 295 296 /* 297 * Force the per-metaslab range trees to use 64-bit integers to store 298 * segments. Used for debugging purposes. 299 */ 300 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE; 301 302 /* 303 * By default we only store segments over a certain size in the size-sorted 304 * metaslab trees (ms_allocatable_by_size and 305 * ms_unflushed_frees_by_size). This dramatically reduces memory usage and 306 * improves load and unload times at the cost of causing us to use slightly 307 * larger segments than we would otherwise in some cases. 308 */ 309 static const uint32_t metaslab_by_size_min_shift = 14; 310 311 /* 312 * If not set, we will first try normal allocation. If that fails then 313 * we will do a gang allocation. If that fails then we will do a "try hard" 314 * gang allocation. If that fails then we will have a multi-layer gang 315 * block. 316 * 317 * If set, we will first try normal allocation. If that fails then 318 * we will do a "try hard" allocation. If that fails we will do a gang 319 * allocation. If that fails we will do a "try hard" gang allocation. If 320 * that fails then we will have a multi-layer gang block. 
321 */ 322 static int zfs_metaslab_try_hard_before_gang = B_FALSE; 323 324 /* 325 * When not trying hard, we only consider the best zfs_metaslab_find_max_tries 326 * metaslabs. This improves performance, especially when there are many 327 * metaslabs per vdev and the allocation can't actually be satisfied (so we 328 * would otherwise iterate all the metaslabs). If there is a metaslab with a 329 * worse weight but it can actually satisfy the allocation, we won't find it 330 * until trying hard. This may happen if the worse metaslab is not loaded 331 * (and the true weight is better than we have calculated), or due to weight 332 * bucketization. E.g. we are looking for a 60K segment, and the best 333 * metaslabs all have free segments in the 32-63K bucket, but the best 334 * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a 335 * subsequent metaslab has ms_max_size >60KB (but fewer segments in this 336 * bucket, and therefore a lower weight). 337 */ 338 static uint_t zfs_metaslab_find_max_tries = 100; 339 340 static uint64_t metaslab_weight(metaslab_t *, boolean_t); 341 static void metaslab_set_fragmentation(metaslab_t *, boolean_t); 342 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 343 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 344 345 static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 346 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); 347 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); 348 static unsigned int metaslab_idx_func(multilist_t *, void *); 349 static void metaslab_evict(metaslab_t *, uint64_t); 350 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); 351 kmem_cache_t *metaslab_alloc_trace_cache; 352 353 typedef struct metaslab_stats { 354 kstat_named_t metaslabstat_trace_over_limit; 355 kstat_named_t metaslabstat_reload_tree; 356 kstat_named_t metaslabstat_too_many_tries; 357 kstat_named_t metaslabstat_try_hard; 358 } 
metaslab_stats_t; 359 360 static metaslab_stats_t metaslab_stats = { 361 { "trace_over_limit", KSTAT_DATA_UINT64 }, 362 { "reload_tree", KSTAT_DATA_UINT64 }, 363 { "too_many_tries", KSTAT_DATA_UINT64 }, 364 { "try_hard", KSTAT_DATA_UINT64 }, 365 }; 366 367 #define METASLABSTAT_BUMP(stat) \ 368 atomic_inc_64(&metaslab_stats.stat.value.ui64); 369 370 371 static kstat_t *metaslab_ksp; 372 373 void 374 metaslab_stat_init(void) 375 { 376 ASSERT(metaslab_alloc_trace_cache == NULL); 377 metaslab_alloc_trace_cache = kmem_cache_create( 378 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 379 0, NULL, NULL, NULL, NULL, NULL, 0); 380 metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", 381 "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / 382 sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 383 if (metaslab_ksp != NULL) { 384 metaslab_ksp->ks_data = &metaslab_stats; 385 kstat_install(metaslab_ksp); 386 } 387 } 388 389 void 390 metaslab_stat_fini(void) 391 { 392 if (metaslab_ksp != NULL) { 393 kstat_delete(metaslab_ksp); 394 metaslab_ksp = NULL; 395 } 396 397 kmem_cache_destroy(metaslab_alloc_trace_cache); 398 metaslab_alloc_trace_cache = NULL; 399 } 400 401 /* 402 * ========================================================================== 403 * Metaslab classes 404 * ========================================================================== 405 */ 406 metaslab_class_t * 407 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) 408 { 409 metaslab_class_t *mc; 410 411 mc = kmem_zalloc(offsetof(metaslab_class_t, 412 mc_allocator[spa->spa_alloc_count]), KM_SLEEP); 413 414 mc->mc_spa = spa; 415 mc->mc_ops = ops; 416 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 417 multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), 418 offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); 419 for (int i = 0; i < spa->spa_alloc_count; i++) { 420 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; 421 mca->mca_rotor = NULL; 422 
zfs_refcount_create_tracked(&mca->mca_alloc_slots); 423 } 424 425 return (mc); 426 } 427 428 void 429 metaslab_class_destroy(metaslab_class_t *mc) 430 { 431 spa_t *spa = mc->mc_spa; 432 433 ASSERT(mc->mc_alloc == 0); 434 ASSERT(mc->mc_deferred == 0); 435 ASSERT(mc->mc_space == 0); 436 ASSERT(mc->mc_dspace == 0); 437 438 for (int i = 0; i < spa->spa_alloc_count; i++) { 439 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; 440 ASSERT(mca->mca_rotor == NULL); 441 zfs_refcount_destroy(&mca->mca_alloc_slots); 442 } 443 mutex_destroy(&mc->mc_lock); 444 multilist_destroy(&mc->mc_metaslab_txg_list); 445 kmem_free(mc, offsetof(metaslab_class_t, 446 mc_allocator[spa->spa_alloc_count])); 447 } 448 449 int 450 metaslab_class_validate(metaslab_class_t *mc) 451 { 452 metaslab_group_t *mg; 453 vdev_t *vd; 454 455 /* 456 * Must hold one of the spa_config locks. 457 */ 458 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 459 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 460 461 if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) 462 return (0); 463 464 do { 465 vd = mg->mg_vd; 466 ASSERT(vd->vdev_mg != NULL); 467 ASSERT3P(vd->vdev_top, ==, vd); 468 ASSERT3P(mg->mg_class, ==, mc); 469 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 470 } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); 471 472 return (0); 473 } 474 475 static void 476 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 477 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 478 { 479 atomic_add_64(&mc->mc_alloc, alloc_delta); 480 atomic_add_64(&mc->mc_deferred, defer_delta); 481 atomic_add_64(&mc->mc_space, space_delta); 482 atomic_add_64(&mc->mc_dspace, dspace_delta); 483 } 484 485 uint64_t 486 metaslab_class_get_alloc(metaslab_class_t *mc) 487 { 488 return (mc->mc_alloc); 489 } 490 491 uint64_t 492 metaslab_class_get_deferred(metaslab_class_t *mc) 493 { 494 return (mc->mc_deferred); 495 } 496 497 uint64_t 498 metaslab_class_get_space(metaslab_class_t *mc) 499 
{ 500 return (mc->mc_space); 501 } 502 503 uint64_t 504 metaslab_class_get_dspace(metaslab_class_t *mc) 505 { 506 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 507 } 508 509 void 510 metaslab_class_histogram_verify(metaslab_class_t *mc) 511 { 512 spa_t *spa = mc->mc_spa; 513 vdev_t *rvd = spa->spa_root_vdev; 514 uint64_t *mc_hist; 515 int i; 516 517 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 518 return; 519 520 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 521 KM_SLEEP); 522 523 mutex_enter(&mc->mc_lock); 524 for (int c = 0; c < rvd->vdev_children; c++) { 525 vdev_t *tvd = rvd->vdev_child[c]; 526 metaslab_group_t *mg = vdev_get_mg(tvd, mc); 527 528 /* 529 * Skip any holes, uninitialized top-levels, or 530 * vdevs that are not in this metalab class. 531 */ 532 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 533 mg->mg_class != mc) { 534 continue; 535 } 536 537 IMPLY(mg == mg->mg_vd->vdev_log_mg, 538 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); 539 540 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 541 mc_hist[i] += mg->mg_histogram[i]; 542 } 543 544 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 545 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 546 } 547 548 mutex_exit(&mc->mc_lock); 549 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 550 } 551 552 /* 553 * Calculate the metaslab class's fragmentation metric. The metric 554 * is weighted based on the space contribution of each metaslab group. 555 * The return value will be a number between 0 and 100 (inclusive), or 556 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 557 * zfs_frag_table for more information about the metric. 
558 */ 559 uint64_t 560 metaslab_class_fragmentation(metaslab_class_t *mc) 561 { 562 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 563 uint64_t fragmentation = 0; 564 565 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 566 567 for (int c = 0; c < rvd->vdev_children; c++) { 568 vdev_t *tvd = rvd->vdev_child[c]; 569 metaslab_group_t *mg = tvd->vdev_mg; 570 571 /* 572 * Skip any holes, uninitialized top-levels, 573 * or vdevs that are not in this metalab class. 574 */ 575 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 576 mg->mg_class != mc) { 577 continue; 578 } 579 580 /* 581 * If a metaslab group does not contain a fragmentation 582 * metric then just bail out. 583 */ 584 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 585 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 586 return (ZFS_FRAG_INVALID); 587 } 588 589 /* 590 * Determine how much this metaslab_group is contributing 591 * to the overall pool fragmentation metric. 592 */ 593 fragmentation += mg->mg_fragmentation * 594 metaslab_group_get_space(mg); 595 } 596 fragmentation /= metaslab_class_get_space(mc); 597 598 ASSERT3U(fragmentation, <=, 100); 599 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 600 return (fragmentation); 601 } 602 603 /* 604 * Calculate the amount of expandable space that is available in 605 * this metaslab class. If a device is expanded then its expandable 606 * space will be the amount of allocatable space that is currently not 607 * part of this metaslab class. 
608 */ 609 uint64_t 610 metaslab_class_expandable_space(metaslab_class_t *mc) 611 { 612 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 613 uint64_t space = 0; 614 615 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 616 for (int c = 0; c < rvd->vdev_children; c++) { 617 vdev_t *tvd = rvd->vdev_child[c]; 618 metaslab_group_t *mg = tvd->vdev_mg; 619 620 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 621 mg->mg_class != mc) { 622 continue; 623 } 624 625 /* 626 * Calculate if we have enough space to add additional 627 * metaslabs. We report the expandable space in terms 628 * of the metaslab size since that's the unit of expansion. 629 */ 630 space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, 631 1ULL << tvd->vdev_ms_shift, uint64_t); 632 } 633 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 634 return (space); 635 } 636 637 void 638 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) 639 { 640 multilist_t *ml = &mc->mc_metaslab_txg_list; 641 hrtime_t now = gethrtime(); 642 for (int i = 0; i < multilist_get_num_sublists(ml); i++) { 643 multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); 644 metaslab_t *msp = multilist_sublist_head(mls); 645 multilist_sublist_unlock(mls); 646 while (msp != NULL) { 647 mutex_enter(&msp->ms_lock); 648 649 /* 650 * If the metaslab has been removed from the list 651 * (which could happen if we were at the memory limit 652 * and it was evicted during this loop), then we can't 653 * proceed and we should restart the sublist. 
654 */ 655 if (!multilist_link_active(&msp->ms_class_txg_node)) { 656 mutex_exit(&msp->ms_lock); 657 i--; 658 break; 659 } 660 mls = multilist_sublist_lock_idx(ml, i); 661 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 662 multilist_sublist_unlock(mls); 663 if (txg > 664 msp->ms_selected_txg + metaslab_unload_delay && 665 now > msp->ms_selected_time + 666 MSEC2NSEC(metaslab_unload_delay_ms) && 667 (msp->ms_allocator == -1 || 668 !metaslab_preload_enabled)) { 669 metaslab_evict(msp, txg); 670 } else { 671 /* 672 * Once we've hit a metaslab selected too 673 * recently to evict, we're done evicting for 674 * now. 675 */ 676 mutex_exit(&msp->ms_lock); 677 break; 678 } 679 mutex_exit(&msp->ms_lock); 680 msp = next_msp; 681 } 682 } 683 } 684 685 static int 686 metaslab_compare(const void *x1, const void *x2) 687 { 688 const metaslab_t *m1 = (const metaslab_t *)x1; 689 const metaslab_t *m2 = (const metaslab_t *)x2; 690 691 int sort1 = 0; 692 int sort2 = 0; 693 if (m1->ms_allocator != -1 && m1->ms_primary) 694 sort1 = 1; 695 else if (m1->ms_allocator != -1 && !m1->ms_primary) 696 sort1 = 2; 697 if (m2->ms_allocator != -1 && m2->ms_primary) 698 sort2 = 1; 699 else if (m2->ms_allocator != -1 && !m2->ms_primary) 700 sort2 = 2; 701 702 /* 703 * Sort inactive metaslabs first, then primaries, then secondaries. When 704 * selecting a metaslab to allocate from, an allocator first tries its 705 * primary, then secondary active metaslab. If it doesn't have active 706 * metaslabs, or can't allocate from them, it searches for an inactive 707 * metaslab to activate. If it can't find a suitable one, it will steal 708 * a primary or secondary metaslab from another allocator. 
709 */ 710 if (sort1 < sort2) 711 return (-1); 712 if (sort1 > sort2) 713 return (1); 714 715 int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); 716 if (likely(cmp)) 717 return (cmp); 718 719 IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); 720 721 return (TREE_CMP(m1->ms_start, m2->ms_start)); 722 } 723 724 /* 725 * ========================================================================== 726 * Metaslab groups 727 * ========================================================================== 728 */ 729 /* 730 * Update the allocatable flag and the metaslab group's capacity. 731 * The allocatable flag is set to true if the capacity is below 732 * the zfs_mg_noalloc_threshold or has a fragmentation value that is 733 * greater than zfs_mg_fragmentation_threshold. If a metaslab group 734 * transitions from allocatable to non-allocatable or vice versa then the 735 * metaslab group's class is updated to reflect the transition. 736 */ 737 static void 738 metaslab_group_alloc_update(metaslab_group_t *mg) 739 { 740 vdev_t *vd = mg->mg_vd; 741 metaslab_class_t *mc = mg->mg_class; 742 vdev_stat_t *vs = &vd->vdev_stat; 743 boolean_t was_allocatable; 744 boolean_t was_initialized; 745 746 ASSERT(vd == vd->vdev_top); 747 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, 748 SCL_ALLOC); 749 750 mutex_enter(&mg->mg_lock); 751 was_allocatable = mg->mg_allocatable; 752 was_initialized = mg->mg_initialized; 753 754 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 755 (vs->vs_space + 1); 756 757 mutex_enter(&mc->mc_lock); 758 759 /* 760 * If the metaslab group was just added then it won't 761 * have any space until we finish syncing out this txg. 762 * At that point we will consider it initialized and available 763 * for allocations. We also don't consider non-activated 764 * metaslab groups (e.g. vdevs that are in the middle of being removed) 765 * to be initialized, because they can't be used for allocation. 
766 */ 767 mg->mg_initialized = metaslab_group_initialized(mg); 768 if (!was_initialized && mg->mg_initialized) { 769 mc->mc_groups++; 770 } else if (was_initialized && !mg->mg_initialized) { 771 ASSERT3U(mc->mc_groups, >, 0); 772 mc->mc_groups--; 773 } 774 if (mg->mg_initialized) 775 mg->mg_no_free_space = B_FALSE; 776 777 /* 778 * A metaslab group is considered allocatable if it has plenty 779 * of free space or is not heavily fragmented. We only take 780 * fragmentation into account if the metaslab group has a valid 781 * fragmentation metric (i.e. a value between 0 and 100). 782 */ 783 mg->mg_allocatable = (mg->mg_activation_count > 0 && 784 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 785 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 786 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 787 788 /* 789 * The mc_alloc_groups maintains a count of the number of 790 * groups in this metaslab class that are still above the 791 * zfs_mg_noalloc_threshold. This is used by the allocating 792 * threads to determine if they should avoid allocations to 793 * a given group. The allocator will avoid allocations to a group 794 * if that group has reached or is below the zfs_mg_noalloc_threshold 795 * and there are still other groups that are above the threshold. 796 * When a group transitions from allocatable to non-allocatable or 797 * vice versa we update the metaslab class to reflect that change. 798 * When the mc_alloc_groups value drops to 0 that means that all 799 * groups have reached the zfs_mg_noalloc_threshold making all groups 800 * eligible for allocations. This effectively means that all devices 801 * are balanced again. 
802 */ 803 if (was_allocatable && !mg->mg_allocatable) 804 mc->mc_alloc_groups--; 805 else if (!was_allocatable && mg->mg_allocatable) 806 mc->mc_alloc_groups++; 807 mutex_exit(&mc->mc_lock); 808 809 mutex_exit(&mg->mg_lock); 810 } 811 812 int 813 metaslab_sort_by_flushed(const void *va, const void *vb) 814 { 815 const metaslab_t *a = va; 816 const metaslab_t *b = vb; 817 818 int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); 819 if (likely(cmp)) 820 return (cmp); 821 822 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; 823 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; 824 cmp = TREE_CMP(a_vdev_id, b_vdev_id); 825 if (cmp) 826 return (cmp); 827 828 return (TREE_CMP(a->ms_id, b->ms_id)); 829 } 830 831 metaslab_group_t * 832 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 833 { 834 metaslab_group_t *mg; 835 836 mg = kmem_zalloc(offsetof(metaslab_group_t, 837 mg_allocator[allocators]), KM_SLEEP); 838 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 839 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); 840 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); 841 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 842 sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); 843 mg->mg_vd = vd; 844 mg->mg_class = mc; 845 mg->mg_activation_count = 0; 846 mg->mg_initialized = B_FALSE; 847 mg->mg_no_free_space = B_TRUE; 848 mg->mg_allocators = allocators; 849 850 for (int i = 0; i < allocators; i++) { 851 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; 852 zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); 853 } 854 855 return (mg); 856 } 857 858 void 859 metaslab_group_destroy(metaslab_group_t *mg) 860 { 861 ASSERT(mg->mg_prev == NULL); 862 ASSERT(mg->mg_next == NULL); 863 /* 864 * We may have gone below zero with the activation count 865 * either because we never activated in the first place or 866 * because we're done, and possibly removing the vdev. 
867 */ 868 ASSERT(mg->mg_activation_count <= 0); 869 870 avl_destroy(&mg->mg_metaslab_tree); 871 mutex_destroy(&mg->mg_lock); 872 mutex_destroy(&mg->mg_ms_disabled_lock); 873 cv_destroy(&mg->mg_ms_disabled_cv); 874 875 for (int i = 0; i < mg->mg_allocators; i++) { 876 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; 877 zfs_refcount_destroy(&mga->mga_alloc_queue_depth); 878 } 879 kmem_free(mg, offsetof(metaslab_group_t, 880 mg_allocator[mg->mg_allocators])); 881 } 882 883 void 884 metaslab_group_activate(metaslab_group_t *mg) 885 { 886 metaslab_class_t *mc = mg->mg_class; 887 spa_t *spa = mc->mc_spa; 888 metaslab_group_t *mgprev, *mgnext; 889 890 ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); 891 892 ASSERT(mg->mg_prev == NULL); 893 ASSERT(mg->mg_next == NULL); 894 ASSERT(mg->mg_activation_count <= 0); 895 896 if (++mg->mg_activation_count <= 0) 897 return; 898 899 mg->mg_aliquot = metaslab_aliquot * MAX(1, 900 vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); 901 metaslab_group_alloc_update(mg); 902 903 if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { 904 mg->mg_prev = mg; 905 mg->mg_next = mg; 906 } else { 907 mgnext = mgprev->mg_next; 908 mg->mg_prev = mgprev; 909 mg->mg_next = mgnext; 910 mgprev->mg_next = mg; 911 mgnext->mg_prev = mg; 912 } 913 for (int i = 0; i < spa->spa_alloc_count; i++) { 914 mc->mc_allocator[i].mca_rotor = mg; 915 mg = mg->mg_next; 916 } 917 } 918 919 /* 920 * Passivate a metaslab group and remove it from the allocation rotor. 921 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 922 * a metaslab group. This function will momentarily drop spa_config_locks 923 * that are lower than the SCL_ALLOC lock (see comment below). 
924 */ 925 void 926 metaslab_group_passivate(metaslab_group_t *mg) 927 { 928 metaslab_class_t *mc = mg->mg_class; 929 spa_t *spa = mc->mc_spa; 930 metaslab_group_t *mgprev, *mgnext; 931 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 932 933 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 934 (SCL_ALLOC | SCL_ZIO)); 935 936 if (--mg->mg_activation_count != 0) { 937 for (int i = 0; i < spa->spa_alloc_count; i++) 938 ASSERT(mc->mc_allocator[i].mca_rotor != mg); 939 ASSERT(mg->mg_prev == NULL); 940 ASSERT(mg->mg_next == NULL); 941 ASSERT(mg->mg_activation_count < 0); 942 return; 943 } 944 945 /* 946 * The spa_config_lock is an array of rwlocks, ordered as 947 * follows (from highest to lowest): 948 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 949 * SCL_ZIO > SCL_FREE > SCL_VDEV 950 * (For more information about the spa_config_lock see spa_misc.c) 951 * The higher the lock, the broader its coverage. When we passivate 952 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 953 * config locks. However, the metaslab group's taskq might be trying 954 * to preload metaslabs so we must drop the SCL_ZIO lock and any 955 * lower locks to allow the I/O to complete. At a minimum, 956 * we continue to hold the SCL_ALLOC lock, which prevents any future 957 * allocations from taking place and any changes to the vdev tree. 
958 */ 959 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 960 taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); 961 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 962 metaslab_group_alloc_update(mg); 963 for (int i = 0; i < mg->mg_allocators; i++) { 964 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; 965 metaslab_t *msp = mga->mga_primary; 966 if (msp != NULL) { 967 mutex_enter(&msp->ms_lock); 968 metaslab_passivate(msp, 969 metaslab_weight_from_range_tree(msp)); 970 mutex_exit(&msp->ms_lock); 971 } 972 msp = mga->mga_secondary; 973 if (msp != NULL) { 974 mutex_enter(&msp->ms_lock); 975 metaslab_passivate(msp, 976 metaslab_weight_from_range_tree(msp)); 977 mutex_exit(&msp->ms_lock); 978 } 979 } 980 981 mgprev = mg->mg_prev; 982 mgnext = mg->mg_next; 983 984 if (mg == mgnext) { 985 mgnext = NULL; 986 } else { 987 mgprev->mg_next = mgnext; 988 mgnext->mg_prev = mgprev; 989 } 990 for (int i = 0; i < spa->spa_alloc_count; i++) { 991 if (mc->mc_allocator[i].mca_rotor == mg) 992 mc->mc_allocator[i].mca_rotor = mgnext; 993 } 994 995 mg->mg_prev = NULL; 996 mg->mg_next = NULL; 997 } 998 999 boolean_t 1000 metaslab_group_initialized(metaslab_group_t *mg) 1001 { 1002 vdev_t *vd = mg->mg_vd; 1003 vdev_stat_t *vs = &vd->vdev_stat; 1004 1005 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 1006 } 1007 1008 uint64_t 1009 metaslab_group_get_space(metaslab_group_t *mg) 1010 { 1011 /* 1012 * Note that the number of nodes in mg_metaslab_tree may be one less 1013 * than vdev_ms_count, due to the embedded log metaslab. 
1014 */ 1015 mutex_enter(&mg->mg_lock); 1016 uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); 1017 mutex_exit(&mg->mg_lock); 1018 return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); 1019 } 1020 1021 void 1022 metaslab_group_histogram_verify(metaslab_group_t *mg) 1023 { 1024 uint64_t *mg_hist; 1025 avl_tree_t *t = &mg->mg_metaslab_tree; 1026 uint64_t ashift = mg->mg_vd->vdev_ashift; 1027 1028 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 1029 return; 1030 1031 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 1032 KM_SLEEP); 1033 1034 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 1035 SPACE_MAP_HISTOGRAM_SIZE + ashift); 1036 1037 mutex_enter(&mg->mg_lock); 1038 for (metaslab_t *msp = avl_first(t); 1039 msp != NULL; msp = AVL_NEXT(t, msp)) { 1040 VERIFY3P(msp->ms_group, ==, mg); 1041 /* skip if not active */ 1042 if (msp->ms_sm == NULL) 1043 continue; 1044 1045 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1046 mg_hist[i + ashift] += 1047 msp->ms_sm->sm_phys->smp_histogram[i]; 1048 } 1049 } 1050 1051 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 1052 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 1053 1054 mutex_exit(&mg->mg_lock); 1055 1056 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 1057 } 1058 1059 static void 1060 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 1061 { 1062 metaslab_class_t *mc = mg->mg_class; 1063 uint64_t ashift = mg->mg_vd->vdev_ashift; 1064 1065 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1066 if (msp->ms_sm == NULL) 1067 return; 1068 1069 mutex_enter(&mg->mg_lock); 1070 mutex_enter(&mc->mc_lock); 1071 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1072 IMPLY(mg == mg->mg_vd->vdev_log_mg, 1073 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); 1074 mg->mg_histogram[i + ashift] += 1075 msp->ms_sm->sm_phys->smp_histogram[i]; 1076 mc->mc_histogram[i + ashift] += 1077 msp->ms_sm->sm_phys->smp_histogram[i]; 1078 } 1079 mutex_exit(&mc->mc_lock); 1080 
mutex_exit(&mg->mg_lock); 1081 } 1082 1083 void 1084 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 1085 { 1086 metaslab_class_t *mc = mg->mg_class; 1087 uint64_t ashift = mg->mg_vd->vdev_ashift; 1088 1089 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1090 if (msp->ms_sm == NULL) 1091 return; 1092 1093 mutex_enter(&mg->mg_lock); 1094 mutex_enter(&mc->mc_lock); 1095 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1096 ASSERT3U(mg->mg_histogram[i + ashift], >=, 1097 msp->ms_sm->sm_phys->smp_histogram[i]); 1098 ASSERT3U(mc->mc_histogram[i + ashift], >=, 1099 msp->ms_sm->sm_phys->smp_histogram[i]); 1100 IMPLY(mg == mg->mg_vd->vdev_log_mg, 1101 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); 1102 1103 mg->mg_histogram[i + ashift] -= 1104 msp->ms_sm->sm_phys->smp_histogram[i]; 1105 mc->mc_histogram[i + ashift] -= 1106 msp->ms_sm->sm_phys->smp_histogram[i]; 1107 } 1108 mutex_exit(&mc->mc_lock); 1109 mutex_exit(&mg->mg_lock); 1110 } 1111 1112 static void 1113 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 1114 { 1115 ASSERT(msp->ms_group == NULL); 1116 mutex_enter(&mg->mg_lock); 1117 msp->ms_group = mg; 1118 msp->ms_weight = 0; 1119 avl_add(&mg->mg_metaslab_tree, msp); 1120 mutex_exit(&mg->mg_lock); 1121 1122 mutex_enter(&msp->ms_lock); 1123 metaslab_group_histogram_add(mg, msp); 1124 mutex_exit(&msp->ms_lock); 1125 } 1126 1127 static void 1128 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 1129 { 1130 mutex_enter(&msp->ms_lock); 1131 metaslab_group_histogram_remove(mg, msp); 1132 mutex_exit(&msp->ms_lock); 1133 1134 mutex_enter(&mg->mg_lock); 1135 ASSERT(msp->ms_group == mg); 1136 avl_remove(&mg->mg_metaslab_tree, msp); 1137 1138 metaslab_class_t *mc = msp->ms_group->mg_class; 1139 multilist_sublist_t *mls = 1140 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); 1141 if (multilist_link_active(&msp->ms_class_txg_node)) 1142 multilist_sublist_remove(mls, msp); 1143 multilist_sublist_unlock(mls); 1144 1145 
msp->ms_group = NULL; 1146 mutex_exit(&mg->mg_lock); 1147 } 1148 1149 static void 1150 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1151 { 1152 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1153 ASSERT(MUTEX_HELD(&mg->mg_lock)); 1154 ASSERT(msp->ms_group == mg); 1155 1156 avl_remove(&mg->mg_metaslab_tree, msp); 1157 msp->ms_weight = weight; 1158 avl_add(&mg->mg_metaslab_tree, msp); 1159 1160 } 1161 1162 static void 1163 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1164 { 1165 /* 1166 * Although in principle the weight can be any value, in 1167 * practice we do not use values in the range [1, 511]. 1168 */ 1169 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 1170 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1171 1172 mutex_enter(&mg->mg_lock); 1173 metaslab_group_sort_impl(mg, msp, weight); 1174 mutex_exit(&mg->mg_lock); 1175 } 1176 1177 /* 1178 * Calculate the fragmentation for a given metaslab group. We can use 1179 * a simple average here since all metaslabs within the group must have 1180 * the same size. The return value will be a value between 0 and 100 1181 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 1182 * group have a fragmentation metric. 1183 */ 1184 uint64_t 1185 metaslab_group_fragmentation(metaslab_group_t *mg) 1186 { 1187 vdev_t *vd = mg->mg_vd; 1188 uint64_t fragmentation = 0; 1189 uint64_t valid_ms = 0; 1190 1191 for (int m = 0; m < vd->vdev_ms_count; m++) { 1192 metaslab_t *msp = vd->vdev_ms[m]; 1193 1194 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 1195 continue; 1196 if (msp->ms_group != mg) 1197 continue; 1198 1199 valid_ms++; 1200 fragmentation += msp->ms_fragmentation; 1201 } 1202 1203 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) 1204 return (ZFS_FRAG_INVALID); 1205 1206 fragmentation /= valid_ms; 1207 ASSERT3U(fragmentation, <=, 100); 1208 return (fragmentation); 1209 } 1210 1211 /* 1212 * Determine if a given metaslab group should skip allocations. 
A metaslab 1213 * group should avoid allocations if its free capacity is less than the 1214 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 1215 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1216 * that can still handle allocations. If the allocation throttle is enabled 1217 * then we skip allocations to devices that have reached their maximum 1218 * allocation queue depth unless the selected metaslab group is the only 1219 * eligible group remaining. 1220 */ 1221 static boolean_t 1222 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 1223 int flags, uint64_t psize, int allocator, int d) 1224 { 1225 spa_t *spa = mg->mg_vd->vdev_spa; 1226 metaslab_class_t *mc = mg->mg_class; 1227 1228 /* 1229 * We can only consider skipping this metaslab group if it's 1230 * in the normal metaslab class and there are other metaslab 1231 * groups to select from. Otherwise, we always consider it eligible 1232 * for allocations. 1233 */ 1234 if ((mc != spa_normal_class(spa) && 1235 mc != spa_special_class(spa) && 1236 mc != spa_dedup_class(spa)) || 1237 mc->mc_groups <= 1) 1238 return (B_TRUE); 1239 1240 /* 1241 * If the metaslab group's mg_allocatable flag is set (see comments 1242 * in metaslab_group_alloc_update() for more information) and 1243 * the allocation throttle is disabled then allow allocations to this 1244 * device. However, if the allocation throttle is enabled then 1245 * check if we have reached our allocation limit (mga_alloc_queue_depth) 1246 * to determine if we should allow allocations to this metaslab group. 1247 * If all metaslab groups are no longer considered allocatable 1248 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 1249 * gang block size then we allow allocations on this metaslab group 1250 * regardless of the mg_allocatable or throttle settings. 
1251 */ 1252 if (mg->mg_allocatable) { 1253 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 1254 int64_t qdepth; 1255 uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; 1256 1257 if (!mc->mc_alloc_throttle_enabled) 1258 return (B_TRUE); 1259 1260 /* 1261 * If this metaslab group does not have any free space, then 1262 * there is no point in looking further. 1263 */ 1264 if (mg->mg_no_free_space) 1265 return (B_FALSE); 1266 1267 /* 1268 * Some allocations (e.g., those coming from device removal 1269 * where the * allocations are not even counted in the 1270 * metaslab * allocation queues) are allowed to bypass 1271 * the throttle. 1272 */ 1273 if (flags & METASLAB_DONT_THROTTLE) 1274 return (B_TRUE); 1275 1276 /* 1277 * Relax allocation throttling for ditto blocks. Due to 1278 * random imbalances in allocation it tends to push copies 1279 * to one vdev, that looks a bit better at the moment. 1280 */ 1281 qmax = qmax * (4 + d) / 4; 1282 1283 qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); 1284 1285 /* 1286 * If this metaslab group is below its qmax or it's 1287 * the only allocatable metaslab group, then attempt 1288 * to allocate from it. 1289 */ 1290 if (qdepth < qmax || mc->mc_alloc_groups == 1) 1291 return (B_TRUE); 1292 ASSERT3U(mc->mc_alloc_groups, >, 1); 1293 1294 /* 1295 * Since this metaslab group is at or over its qmax, we 1296 * need to determine if there are metaslab groups after this 1297 * one that might be able to handle this allocation. This is 1298 * racy since we can't hold the locks for all metaslab 1299 * groups at the same time when we make this check. 
1300 */ 1301 for (metaslab_group_t *mgp = mg->mg_next; 1302 mgp != rotor; mgp = mgp->mg_next) { 1303 metaslab_group_allocator_t *mgap = 1304 &mgp->mg_allocator[allocator]; 1305 qmax = mgap->mga_cur_max_alloc_queue_depth; 1306 qmax = qmax * (4 + d) / 4; 1307 qdepth = 1308 zfs_refcount_count(&mgap->mga_alloc_queue_depth); 1309 1310 /* 1311 * If there is another metaslab group that 1312 * might be able to handle the allocation, then 1313 * we return false so that we skip this group. 1314 */ 1315 if (qdepth < qmax && !mgp->mg_no_free_space) 1316 return (B_FALSE); 1317 } 1318 1319 /* 1320 * We didn't find another group to handle the allocation 1321 * so we can't skip this metaslab group even though 1322 * we are at or over our qmax. 1323 */ 1324 return (B_TRUE); 1325 1326 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 1327 return (B_TRUE); 1328 } 1329 return (B_FALSE); 1330 } 1331 1332 /* 1333 * ========================================================================== 1334 * Range tree callbacks 1335 * ========================================================================== 1336 */ 1337 1338 /* 1339 * Comparison function for the private size-ordered tree using 32-bit 1340 * ranges. Tree is sorted by size, larger sizes at the end of the tree. 1341 */ 1342 __attribute__((always_inline)) inline 1343 static int 1344 metaslab_rangesize32_compare(const void *x1, const void *x2) 1345 { 1346 const range_seg32_t *r1 = x1; 1347 const range_seg32_t *r2 = x2; 1348 1349 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1350 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1351 1352 int cmp = TREE_CMP(rs_size1, rs_size2); 1353 1354 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); 1355 } 1356 1357 /* 1358 * Comparison function for the private size-ordered tree using 64-bit 1359 * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
1360 */ 1361 __attribute__((always_inline)) inline 1362 static int 1363 metaslab_rangesize64_compare(const void *x1, const void *x2) 1364 { 1365 const range_seg64_t *r1 = x1; 1366 const range_seg64_t *r2 = x2; 1367 1368 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1369 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1370 1371 int cmp = TREE_CMP(rs_size1, rs_size2); 1372 1373 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); 1374 } 1375 1376 typedef struct metaslab_rt_arg { 1377 zfs_btree_t *mra_bt; 1378 uint32_t mra_floor_shift; 1379 } metaslab_rt_arg_t; 1380 1381 struct mssa_arg { 1382 range_tree_t *rt; 1383 metaslab_rt_arg_t *mra; 1384 }; 1385 1386 static void 1387 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) 1388 { 1389 struct mssa_arg *mssap = arg; 1390 range_tree_t *rt = mssap->rt; 1391 metaslab_rt_arg_t *mrap = mssap->mra; 1392 range_seg_max_t seg = {0}; 1393 rs_set_start(&seg, rt, start); 1394 rs_set_end(&seg, rt, start + size); 1395 metaslab_rt_add(rt, &seg, mrap); 1396 } 1397 1398 static void 1399 metaslab_size_tree_full_load(range_tree_t *rt) 1400 { 1401 metaslab_rt_arg_t *mrap = rt->rt_arg; 1402 METASLABSTAT_BUMP(metaslabstat_reload_tree); 1403 ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); 1404 mrap->mra_floor_shift = 0; 1405 struct mssa_arg arg = {0}; 1406 arg.rt = rt; 1407 arg.mra = mrap; 1408 range_tree_walk(rt, metaslab_size_sorted_add, &arg); 1409 } 1410 1411 1412 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, 1413 range_seg32_t, metaslab_rangesize32_compare) 1414 1415 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, 1416 range_seg64_t, metaslab_rangesize64_compare) 1417 1418 /* 1419 * Create any block allocator specific components. The current allocators 1420 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
1421 */ 1422 static void 1423 metaslab_rt_create(range_tree_t *rt, void *arg) 1424 { 1425 metaslab_rt_arg_t *mrap = arg; 1426 zfs_btree_t *size_tree = mrap->mra_bt; 1427 1428 size_t size; 1429 int (*compare) (const void *, const void *); 1430 bt_find_in_buf_f bt_find; 1431 switch (rt->rt_type) { 1432 case RANGE_SEG32: 1433 size = sizeof (range_seg32_t); 1434 compare = metaslab_rangesize32_compare; 1435 bt_find = metaslab_rt_find_rangesize32_in_buf; 1436 break; 1437 case RANGE_SEG64: 1438 size = sizeof (range_seg64_t); 1439 compare = metaslab_rangesize64_compare; 1440 bt_find = metaslab_rt_find_rangesize64_in_buf; 1441 break; 1442 default: 1443 panic("Invalid range seg type %d", rt->rt_type); 1444 } 1445 zfs_btree_create(size_tree, compare, bt_find, size); 1446 mrap->mra_floor_shift = metaslab_by_size_min_shift; 1447 } 1448 1449 static void 1450 metaslab_rt_destroy(range_tree_t *rt, void *arg) 1451 { 1452 (void) rt; 1453 metaslab_rt_arg_t *mrap = arg; 1454 zfs_btree_t *size_tree = mrap->mra_bt; 1455 1456 zfs_btree_destroy(size_tree); 1457 kmem_free(mrap, sizeof (*mrap)); 1458 } 1459 1460 static void 1461 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 1462 { 1463 metaslab_rt_arg_t *mrap = arg; 1464 zfs_btree_t *size_tree = mrap->mra_bt; 1465 1466 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < 1467 (1ULL << mrap->mra_floor_shift)) 1468 return; 1469 1470 zfs_btree_add(size_tree, rs); 1471 } 1472 1473 static void 1474 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 1475 { 1476 metaslab_rt_arg_t *mrap = arg; 1477 zfs_btree_t *size_tree = mrap->mra_bt; 1478 1479 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << 1480 mrap->mra_floor_shift)) 1481 return; 1482 1483 zfs_btree_remove(size_tree, rs); 1484 } 1485 1486 static void 1487 metaslab_rt_vacate(range_tree_t *rt, void *arg) 1488 { 1489 metaslab_rt_arg_t *mrap = arg; 1490 zfs_btree_t *size_tree = mrap->mra_bt; 1491 zfs_btree_clear(size_tree); 1492 zfs_btree_destroy(size_tree); 1493 
1494 metaslab_rt_create(rt, arg); 1495 } 1496 1497 static const range_tree_ops_t metaslab_rt_ops = { 1498 .rtop_create = metaslab_rt_create, 1499 .rtop_destroy = metaslab_rt_destroy, 1500 .rtop_add = metaslab_rt_add, 1501 .rtop_remove = metaslab_rt_remove, 1502 .rtop_vacate = metaslab_rt_vacate 1503 }; 1504 1505 /* 1506 * ========================================================================== 1507 * Common allocator routines 1508 * ========================================================================== 1509 */ 1510 1511 /* 1512 * Return the maximum contiguous segment within the metaslab. 1513 */ 1514 uint64_t 1515 metaslab_largest_allocatable(metaslab_t *msp) 1516 { 1517 zfs_btree_t *t = &msp->ms_allocatable_by_size; 1518 range_seg_t *rs; 1519 1520 if (t == NULL) 1521 return (0); 1522 if (zfs_btree_numnodes(t) == 0) 1523 metaslab_size_tree_full_load(msp->ms_allocatable); 1524 1525 rs = zfs_btree_last(t, NULL); 1526 if (rs == NULL) 1527 return (0); 1528 1529 return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, 1530 msp->ms_allocatable)); 1531 } 1532 1533 /* 1534 * Return the maximum contiguous segment within the unflushed frees of this 1535 * metaslab. 1536 */ 1537 static uint64_t 1538 metaslab_largest_unflushed_free(metaslab_t *msp) 1539 { 1540 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1541 1542 if (msp->ms_unflushed_frees == NULL) 1543 return (0); 1544 1545 if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) 1546 metaslab_size_tree_full_load(msp->ms_unflushed_frees); 1547 range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, 1548 NULL); 1549 if (rs == NULL) 1550 return (0); 1551 1552 /* 1553 * When a range is freed from the metaslab, that range is added to 1554 * both the unflushed frees and the deferred frees. While the block 1555 * will eventually be usable, if the metaslab were loaded the range 1556 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE 1557 * txgs had passed. 
As a result, when attempting to estimate an upper 1558 * bound for the largest currently-usable free segment in the 1559 * metaslab, we need to not consider any ranges currently in the defer 1560 * trees. This algorithm approximates the largest available chunk in 1561 * the largest range in the unflushed_frees tree by taking the first 1562 * chunk. While this may be a poor estimate, it should only remain so 1563 * briefly and should eventually self-correct as frees are no longer 1564 * deferred. Similar logic applies to the ms_freed tree. See 1565 * metaslab_load() for more details. 1566 * 1567 * There are two primary sources of inaccuracy in this estimate. Both 1568 * are tolerated for performance reasons. The first source is that we 1569 * only check the largest segment for overlaps. Smaller segments may 1570 * have more favorable overlaps with the other trees, resulting in 1571 * larger usable chunks. Second, we only look at the first chunk in 1572 * the largest segment; there may be other usable chunks in the 1573 * largest segment, but we ignore them. 
1574 */ 1575 uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); 1576 uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; 1577 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1578 uint64_t start = 0; 1579 uint64_t size = 0; 1580 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, 1581 rsize, &start, &size); 1582 if (found) { 1583 if (rstart == start) 1584 return (0); 1585 rsize = start - rstart; 1586 } 1587 } 1588 1589 uint64_t start = 0; 1590 uint64_t size = 0; 1591 boolean_t found = range_tree_find_in(msp->ms_freed, rstart, 1592 rsize, &start, &size); 1593 if (found) 1594 rsize = start - rstart; 1595 1596 return (rsize); 1597 } 1598 1599 static range_seg_t * 1600 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, 1601 uint64_t size, zfs_btree_index_t *where) 1602 { 1603 range_seg_t *rs; 1604 range_seg_max_t rsearch; 1605 1606 rs_set_start(&rsearch, rt, start); 1607 rs_set_end(&rsearch, rt, start + size); 1608 1609 rs = zfs_btree_find(t, &rsearch, where); 1610 if (rs == NULL) { 1611 rs = zfs_btree_next(t, where, where); 1612 } 1613 1614 return (rs); 1615 } 1616 1617 /* 1618 * This is a helper function that can be used by the allocator to find a 1619 * suitable block to allocate. This will search the specified B-tree looking 1620 * for a block that matches the specified criteria. 
1621 */ 1622 static uint64_t 1623 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, 1624 uint64_t max_search) 1625 { 1626 if (*cursor == 0) 1627 *cursor = rt->rt_start; 1628 zfs_btree_t *bt = &rt->rt_root; 1629 zfs_btree_index_t where; 1630 range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); 1631 uint64_t first_found; 1632 int count_searched = 0; 1633 1634 if (rs != NULL) 1635 first_found = rs_get_start(rs, rt); 1636 1637 while (rs != NULL && (rs_get_start(rs, rt) - first_found <= 1638 max_search || count_searched < metaslab_min_search_count)) { 1639 uint64_t offset = rs_get_start(rs, rt); 1640 if (offset + size <= rs_get_end(rs, rt)) { 1641 *cursor = offset + size; 1642 return (offset); 1643 } 1644 rs = zfs_btree_next(bt, &where, &where); 1645 count_searched++; 1646 } 1647 1648 *cursor = 0; 1649 return (-1ULL); 1650 } 1651 1652 static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); 1653 static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); 1654 static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); 1655 metaslab_ops_t *metaslab_allocator(spa_t *spa); 1656 1657 static metaslab_ops_t metaslab_allocators[] = { 1658 { "dynamic", metaslab_df_alloc }, 1659 { "cursor", metaslab_cf_alloc }, 1660 { "new-dynamic", metaslab_ndf_alloc }, 1661 }; 1662 1663 static int 1664 spa_find_allocator_byname(const char *val) 1665 { 1666 int a = ARRAY_SIZE(metaslab_allocators) - 1; 1667 if (strcmp("new-dynamic", val) == 0) 1668 return (-1); /* remove when ndf is working */ 1669 for (; a >= 0; a--) { 1670 if (strcmp(val, metaslab_allocators[a].msop_name) == 0) 1671 return (a); 1672 } 1673 return (-1); 1674 } 1675 1676 void 1677 spa_set_allocator(spa_t *spa, const char *allocator) 1678 { 1679 int a = spa_find_allocator_byname(allocator); 1680 if (a < 0) a = 0; 1681 spa->spa_active_allocator = a; 1682 zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name); 1683 } 1684 1685 int 1686 
spa_get_allocator(spa_t *spa) 1687 { 1688 return (spa->spa_active_allocator); 1689 } 1690 1691 #if defined(_KERNEL) 1692 int 1693 param_set_active_allocator_common(const char *val) 1694 { 1695 char *p; 1696 1697 if (val == NULL) 1698 return (SET_ERROR(EINVAL)); 1699 1700 if ((p = strchr(val, '\n')) != NULL) 1701 *p = '\0'; 1702 1703 int a = spa_find_allocator_byname(val); 1704 if (a < 0) 1705 return (SET_ERROR(EINVAL)); 1706 1707 zfs_active_allocator = metaslab_allocators[a].msop_name; 1708 return (0); 1709 } 1710 #endif 1711 1712 metaslab_ops_t * 1713 metaslab_allocator(spa_t *spa) 1714 { 1715 int allocator = spa_get_allocator(spa); 1716 return (&metaslab_allocators[allocator]); 1717 } 1718 1719 /* 1720 * ========================================================================== 1721 * Dynamic Fit (df) block allocator 1722 * 1723 * Search for a free chunk of at least this size, starting from the last 1724 * offset (for this alignment of block) looking for up to 1725 * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not 1726 * found within 16MB, then return a free chunk of exactly the requested size (or 1727 * larger). 1728 * 1729 * If it seems like searching from the last offset will be unproductive, skip 1730 * that and just return a free chunk of exactly the requested size (or larger). 1731 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This 1732 * mechanism is probably not very useful and may be removed in the future. 1733 * 1734 * The behavior when not searching can be changed to return the largest free 1735 * chunk, instead of a free chunk of exactly the requested size, by setting 1736 * metaslab_df_use_largest_segment. 1737 * ========================================================================== 1738 */ 1739 static uint64_t 1740 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1741 { 1742 /* 1743 * Find the largest power of 2 block size that evenly divides the 1744 * requested size. 
This is used to try to allocate blocks with similar 1745 * alignment from the same area of the metaslab (i.e. same cursor 1746 * bucket) but it does not guarantee that other allocations sizes 1747 * may exist in the same region. 1748 */ 1749 uint64_t align = size & -size; 1750 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1751 range_tree_t *rt = msp->ms_allocatable; 1752 uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1753 uint64_t offset; 1754 1755 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1756 1757 /* 1758 * If we're running low on space, find a segment based on size, 1759 * rather than iterating based on offset. 1760 */ 1761 if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || 1762 free_pct < metaslab_df_free_pct) { 1763 offset = -1; 1764 } else { 1765 offset = metaslab_block_picker(rt, 1766 cursor, size, metaslab_df_max_search); 1767 } 1768 1769 if (offset == -1) { 1770 range_seg_t *rs; 1771 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) 1772 metaslab_size_tree_full_load(msp->ms_allocatable); 1773 1774 if (metaslab_df_use_largest_segment) { 1775 /* use largest free segment */ 1776 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); 1777 } else { 1778 zfs_btree_index_t where; 1779 /* use segment of this size, or next largest */ 1780 rs = metaslab_block_find(&msp->ms_allocatable_by_size, 1781 rt, msp->ms_start, size, &where); 1782 } 1783 if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, 1784 rt)) { 1785 offset = rs_get_start(rs, rt); 1786 *cursor = offset + size; 1787 } 1788 } 1789 1790 return (offset); 1791 } 1792 1793 /* 1794 * ========================================================================== 1795 * Cursor fit block allocator - 1796 * Select the largest region in the metaslab, set the cursor to the beginning 1797 * of the range and the cursor_end to the end of the range. As allocations 1798 * are made advance the cursor. 
Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *size_tree = &msp->ms_allocatable_by_size;
	/* ms_lbas[0] is the cursor; ms_lbas[1] is the end of its range. */
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	ASSERT3U(*cursor_end, >=, *cursor);

	/* The current range cannot fit the request: pick a new one. */
	if ((*cursor + size) > *cursor_end) {
		range_seg_t *largest;

		if (zfs_btree_numnodes(size_tree) == 0)
			metaslab_size_tree_full_load(msp->ms_allocatable);

		largest = zfs_btree_last(size_tree, NULL);
		if (largest == NULL)
			return (-1ULL);
		if ((rs_get_end(largest, rt) - rs_get_start(largest, rt)) <
		    size)
			return (-1ULL);

		*cursor = rs_get_start(largest, rt);
		*cursor_end = rs_get_end(largest, rt);
	}

	/* Hand out the space at the cursor and advance past it. */
	uint64_t offset = *cursor;

	*cursor += size;
	return (offset);
}

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
1847 */ 1848 uint64_t metaslab_ndf_clump_shift = 4; 1849 1850 static uint64_t 1851 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1852 { 1853 zfs_btree_t *t = &msp->ms_allocatable->rt_root; 1854 range_tree_t *rt = msp->ms_allocatable; 1855 zfs_btree_index_t where; 1856 range_seg_t *rs; 1857 range_seg_max_t rsearch; 1858 uint64_t hbit = highbit64(size); 1859 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1860 uint64_t max_size = metaslab_largest_allocatable(msp); 1861 1862 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1863 1864 if (max_size < size) 1865 return (-1ULL); 1866 1867 rs_set_start(&rsearch, rt, *cursor); 1868 rs_set_end(&rsearch, rt, *cursor + size); 1869 1870 rs = zfs_btree_find(t, &rsearch, &where); 1871 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { 1872 t = &msp->ms_allocatable_by_size; 1873 1874 rs_set_start(&rsearch, rt, 0); 1875 rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + 1876 metaslab_ndf_clump_shift))); 1877 1878 rs = zfs_btree_find(t, &rsearch, &where); 1879 if (rs == NULL) 1880 rs = zfs_btree_next(t, &where, &where); 1881 ASSERT(rs != NULL); 1882 } 1883 1884 if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { 1885 *cursor = rs_get_start(rs, rt) + size; 1886 return (rs_get_start(rs, rt)); 1887 } 1888 return (-1ULL); 1889 } 1890 1891 /* 1892 * ========================================================================== 1893 * Metaslabs 1894 * ========================================================================== 1895 */ 1896 1897 /* 1898 * Wait for any in-progress metaslab loads to complete. 1899 */ 1900 static void 1901 metaslab_load_wait(metaslab_t *msp) 1902 { 1903 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1904 1905 while (msp->ms_loading) { 1906 ASSERT(!msp->ms_loaded); 1907 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1908 } 1909 } 1910 1911 /* 1912 * Wait for any in-progress flushing to complete. 
1913 */ 1914 static void 1915 metaslab_flush_wait(metaslab_t *msp) 1916 { 1917 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1918 1919 while (msp->ms_flushing) 1920 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); 1921 } 1922 1923 static unsigned int 1924 metaslab_idx_func(multilist_t *ml, void *arg) 1925 { 1926 metaslab_t *msp = arg; 1927 1928 /* 1929 * ms_id values are allocated sequentially, so full 64bit 1930 * division would be a waste of time, so limit it to 32 bits. 1931 */ 1932 return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); 1933 } 1934 1935 uint64_t 1936 metaslab_allocated_space(metaslab_t *msp) 1937 { 1938 return (msp->ms_allocated_space); 1939 } 1940 1941 /* 1942 * Verify that the space accounting on disk matches the in-core range_trees. 1943 */ 1944 static void 1945 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 1946 { 1947 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1948 uint64_t allocating = 0; 1949 uint64_t sm_free_space, msp_free_space; 1950 1951 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1952 ASSERT(!msp->ms_condensing); 1953 1954 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1955 return; 1956 1957 /* 1958 * We can only verify the metaslab space when we're called 1959 * from syncing context with a loaded metaslab that has an 1960 * allocated space map. Calling this in non-syncing context 1961 * does not provide a consistent view of the metaslab since 1962 * we're performing allocations in the future. 1963 */ 1964 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 1965 !msp->ms_loaded) 1966 return; 1967 1968 /* 1969 * Even though the smp_alloc field can get negative, 1970 * when it comes to a metaslab's space map, that should 1971 * never be the case. 
1972 */ 1973 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); 1974 1975 ASSERT3U(space_map_allocated(msp->ms_sm), >=, 1976 range_tree_space(msp->ms_unflushed_frees)); 1977 1978 ASSERT3U(metaslab_allocated_space(msp), ==, 1979 space_map_allocated(msp->ms_sm) + 1980 range_tree_space(msp->ms_unflushed_allocs) - 1981 range_tree_space(msp->ms_unflushed_frees)); 1982 1983 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); 1984 1985 /* 1986 * Account for future allocations since we would have 1987 * already deducted that space from the ms_allocatable. 1988 */ 1989 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 1990 allocating += 1991 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 1992 } 1993 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, 1994 msp->ms_allocating_total); 1995 1996 ASSERT3U(msp->ms_deferspace, ==, 1997 range_tree_space(msp->ms_defer[0]) + 1998 range_tree_space(msp->ms_defer[1])); 1999 2000 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + 2001 msp->ms_deferspace + range_tree_space(msp->ms_freed); 2002 2003 VERIFY3U(sm_free_space, ==, msp_free_space); 2004 } 2005 2006 static void 2007 metaslab_aux_histograms_clear(metaslab_t *msp) 2008 { 2009 /* 2010 * Auxiliary histograms are only cleared when resetting them, 2011 * which can only happen while the metaslab is loaded. 2012 */ 2013 ASSERT(msp->ms_loaded); 2014 2015 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); 2016 for (int t = 0; t < TXG_DEFER_SIZE; t++) 2017 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); 2018 } 2019 2020 static void 2021 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, 2022 range_tree_t *rt) 2023 { 2024 /* 2025 * This is modeled after space_map_histogram_add(), so refer to that 2026 * function for implementation details. 
We want this to work like 2027 * the space map histogram, and not the range tree histogram, as we 2028 * are essentially constructing a delta that will be later subtracted 2029 * from the space map histogram. 2030 */ 2031 int idx = 0; 2032 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 2033 ASSERT3U(i, >=, idx + shift); 2034 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); 2035 2036 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { 2037 ASSERT3U(idx + shift, ==, i); 2038 idx++; 2039 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); 2040 } 2041 } 2042 } 2043 2044 /* 2045 * Called at every sync pass that the metaslab gets synced. 2046 * 2047 * The reason is that we want our auxiliary histograms to be updated 2048 * wherever the metaslab's space map histogram is updated. This way 2049 * we stay consistent on which parts of the metaslab space map's 2050 * histogram are currently not available for allocations (e.g because 2051 * they are in the defer, freed, and freeing trees). 2052 */ 2053 static void 2054 metaslab_aux_histograms_update(metaslab_t *msp) 2055 { 2056 space_map_t *sm = msp->ms_sm; 2057 ASSERT(sm != NULL); 2058 2059 /* 2060 * This is similar to the metaslab's space map histogram updates 2061 * that take place in metaslab_sync(). The only difference is that 2062 * we only care about segments that haven't made it into the 2063 * ms_allocatable tree yet. 2064 */ 2065 if (msp->ms_loaded) { 2066 metaslab_aux_histograms_clear(msp); 2067 2068 metaslab_aux_histogram_add(msp->ms_synchist, 2069 sm->sm_shift, msp->ms_freed); 2070 2071 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2072 metaslab_aux_histogram_add(msp->ms_deferhist[t], 2073 sm->sm_shift, msp->ms_defer[t]); 2074 } 2075 } 2076 2077 metaslab_aux_histogram_add(msp->ms_synchist, 2078 sm->sm_shift, msp->ms_freeing); 2079 } 2080 2081 /* 2082 * Called every time we are done syncing (writing to) the metaslab, 2083 * i.e. at the end of each sync pass. 
2084 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] 2085 */ 2086 static void 2087 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) 2088 { 2089 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2090 space_map_t *sm = msp->ms_sm; 2091 2092 if (sm == NULL) { 2093 /* 2094 * We came here from metaslab_init() when creating/opening a 2095 * pool, looking at a metaslab that hasn't had any allocations 2096 * yet. 2097 */ 2098 return; 2099 } 2100 2101 /* 2102 * This is similar to the actions that we take for the ms_freed 2103 * and ms_defer trees in metaslab_sync_done(). 2104 */ 2105 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; 2106 if (defer_allowed) { 2107 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, 2108 sizeof (msp->ms_synchist)); 2109 } else { 2110 memset(msp->ms_deferhist[hist_index], 0, 2111 sizeof (msp->ms_deferhist[hist_index])); 2112 } 2113 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); 2114 } 2115 2116 /* 2117 * Ensure that the metaslab's weight and fragmentation are consistent 2118 * with the contents of the histogram (either the range tree's histogram 2119 * or the space map's depending whether the metaslab is loaded). 2120 */ 2121 static void 2122 metaslab_verify_weight_and_frag(metaslab_t *msp) 2123 { 2124 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2125 2126 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 2127 return; 2128 2129 /* 2130 * We can end up here from vdev_remove_complete(), in which case we 2131 * cannot do these assertions because we hold spa config locks and 2132 * thus we are not allowed to read from the DMU. 2133 * 2134 * We check if the metaslab group has been removed and if that's 2135 * the case we return immediately as that would mean that we are 2136 * here from the aforementioned code path. 
2137 */ 2138 if (msp->ms_group == NULL) 2139 return; 2140 2141 /* 2142 * Devices being removed always return a weight of 0 and leave 2143 * fragmentation and ms_max_size as is - there is nothing for 2144 * us to verify here. 2145 */ 2146 vdev_t *vd = msp->ms_group->mg_vd; 2147 if (vd->vdev_removing) 2148 return; 2149 2150 /* 2151 * If the metaslab is dirty it probably means that we've done 2152 * some allocations or frees that have changed our histograms 2153 * and thus the weight. 2154 */ 2155 for (int t = 0; t < TXG_SIZE; t++) { 2156 if (txg_list_member(&vd->vdev_ms_list, msp, t)) 2157 return; 2158 } 2159 2160 /* 2161 * This verification checks that our in-memory state is consistent 2162 * with what's on disk. If the pool is read-only then there aren't 2163 * any changes and we just have the initially-loaded state. 2164 */ 2165 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) 2166 return; 2167 2168 /* some extra verification for in-core tree if you can */ 2169 if (msp->ms_loaded) { 2170 range_tree_stat_verify(msp->ms_allocatable); 2171 VERIFY(space_map_histogram_verify(msp->ms_sm, 2172 msp->ms_allocatable)); 2173 } 2174 2175 uint64_t weight = msp->ms_weight; 2176 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2177 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); 2178 uint64_t frag = msp->ms_fragmentation; 2179 uint64_t max_segsize = msp->ms_max_size; 2180 2181 msp->ms_weight = 0; 2182 msp->ms_fragmentation = 0; 2183 2184 /* 2185 * This function is used for verification purposes and thus should 2186 * not introduce any side-effects/mutations on the system's state. 2187 * 2188 * Regardless of whether metaslab_weight() thinks this metaslab 2189 * should be active or not, we want to ensure that the actual weight 2190 * (and therefore the value of ms_weight) would be the same if it 2191 * was to be recalculated at this point. 
2192 * 2193 * In addition we set the nodirty flag so metaslab_weight() does 2194 * not dirty the metaslab for future TXGs (e.g. when trying to 2195 * force condensing to upgrade the metaslab spacemaps). 2196 */ 2197 msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; 2198 2199 VERIFY3U(max_segsize, ==, msp->ms_max_size); 2200 2201 /* 2202 * If the weight type changed then there is no point in doing 2203 * verification. Revert fields to their original values. 2204 */ 2205 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || 2206 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { 2207 msp->ms_fragmentation = frag; 2208 msp->ms_weight = weight; 2209 return; 2210 } 2211 2212 VERIFY3U(msp->ms_fragmentation, ==, frag); 2213 VERIFY3U(msp->ms_weight, ==, weight); 2214 } 2215 2216 /* 2217 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from 2218 * this class that was used longest ago, and attempt to unload it. We don't 2219 * want to spend too much time in this loop to prevent performance 2220 * degradation, and we expect that most of the time this operation will 2221 * succeed. Between that and the normal unloading processing during txg sync, 2222 * we expect this to keep the metaslab memory usage under control. 
2223 */ 2224 static void 2225 metaslab_potentially_evict(metaslab_class_t *mc) 2226 { 2227 #ifdef _KERNEL 2228 uint64_t allmem = arc_all_memory(); 2229 uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); 2230 uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); 2231 uint_t tries = 0; 2232 for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && 2233 tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; 2234 tries++) { 2235 unsigned int idx = multilist_get_random_index( 2236 &mc->mc_metaslab_txg_list); 2237 multilist_sublist_t *mls = 2238 multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); 2239 metaslab_t *msp = multilist_sublist_head(mls); 2240 multilist_sublist_unlock(mls); 2241 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < 2242 inuse * size) { 2243 VERIFY3P(mls, ==, multilist_sublist_lock_idx( 2244 &mc->mc_metaslab_txg_list, idx)); 2245 ASSERT3U(idx, ==, 2246 metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); 2247 2248 if (!multilist_link_active(&msp->ms_class_txg_node)) { 2249 multilist_sublist_unlock(mls); 2250 break; 2251 } 2252 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 2253 multilist_sublist_unlock(mls); 2254 /* 2255 * If the metaslab is currently loading there are two 2256 * cases. If it's the metaslab we're evicting, we 2257 * can't continue on or we'll panic when we attempt to 2258 * recursively lock the mutex. If it's another 2259 * metaslab that's loading, it can be safely skipped, 2260 * since we know it's very new and therefore not a 2261 * good eviction candidate. We check later once the 2262 * lock is held that the metaslab is fully loaded 2263 * before actually unloading it. 2264 */ 2265 if (msp->ms_loading) { 2266 msp = next_msp; 2267 inuse = 2268 spl_kmem_cache_inuse(zfs_btree_leaf_cache); 2269 continue; 2270 } 2271 /* 2272 * We can't unload metaslabs with no spacemap because 2273 * they're not ready to be unloaded yet. 
We can't 2274 * unload metaslabs with outstanding allocations 2275 * because doing so could cause the metaslab's weight 2276 * to decrease while it's unloaded, which violates an 2277 * invariant that we use to prevent unnecessary 2278 * loading. We also don't unload metaslabs that are 2279 * currently active because they are high-weight 2280 * metaslabs that are likely to be used in the near 2281 * future. 2282 */ 2283 mutex_enter(&msp->ms_lock); 2284 if (msp->ms_allocator == -1 && msp->ms_sm != NULL && 2285 msp->ms_allocating_total == 0) { 2286 metaslab_unload(msp); 2287 } 2288 mutex_exit(&msp->ms_lock); 2289 msp = next_msp; 2290 inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); 2291 } 2292 } 2293 #else 2294 (void) mc, (void) zfs_metaslab_mem_limit; 2295 #endif 2296 } 2297 2298 static int 2299 metaslab_load_impl(metaslab_t *msp) 2300 { 2301 int error = 0; 2302 2303 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2304 ASSERT(msp->ms_loading); 2305 ASSERT(!msp->ms_condensing); 2306 2307 /* 2308 * We temporarily drop the lock to unblock other operations while we 2309 * are reading the space map. Therefore, metaslab_sync() and 2310 * metaslab_sync_done() can run at the same time as we do. 2311 * 2312 * If we are using the log space maps, metaslab_sync() can't write to 2313 * the metaslab's space map while we are loading as we only write to 2314 * it when we are flushing the metaslab, and that can't happen while 2315 * we are loading it. 2316 * 2317 * If we are not using log space maps though, metaslab_sync() can 2318 * append to the space map while we are loading. Therefore we load 2319 * only entries that existed when we started the load. Additionally, 2320 * metaslab_sync_done() has to wait for the load to complete because 2321 * there are potential races like metaslab_load() loading parts of the 2322 * space map that are currently being appended by metaslab_sync(). 
If 2323 * we didn't, the ms_allocatable would have entries that 2324 * metaslab_sync_done() would try to re-add later. 2325 * 2326 * That's why before dropping the lock we remember the synced length 2327 * of the metaslab and read up to that point of the space map, 2328 * ignoring entries appended by metaslab_sync() that happen after we 2329 * drop the lock. 2330 */ 2331 uint64_t length = msp->ms_synced_length; 2332 mutex_exit(&msp->ms_lock); 2333 2334 hrtime_t load_start = gethrtime(); 2335 metaslab_rt_arg_t *mrap; 2336 if (msp->ms_allocatable->rt_arg == NULL) { 2337 mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); 2338 } else { 2339 mrap = msp->ms_allocatable->rt_arg; 2340 msp->ms_allocatable->rt_ops = NULL; 2341 msp->ms_allocatable->rt_arg = NULL; 2342 } 2343 mrap->mra_bt = &msp->ms_allocatable_by_size; 2344 mrap->mra_floor_shift = metaslab_by_size_min_shift; 2345 2346 if (msp->ms_sm != NULL) { 2347 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, 2348 SM_FREE, length); 2349 2350 /* Now, populate the size-sorted tree. */ 2351 metaslab_rt_create(msp->ms_allocatable, mrap); 2352 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; 2353 msp->ms_allocatable->rt_arg = mrap; 2354 2355 struct mssa_arg arg = {0}; 2356 arg.rt = msp->ms_allocatable; 2357 arg.mra = mrap; 2358 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, 2359 &arg); 2360 } else { 2361 /* 2362 * Add the size-sorted tree first, since we don't need to load 2363 * the metaslab from the spacemap. 2364 */ 2365 metaslab_rt_create(msp->ms_allocatable, mrap); 2366 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; 2367 msp->ms_allocatable->rt_arg = mrap; 2368 /* 2369 * The space map has not been allocated yet, so treat 2370 * all the space in the metaslab as free and add it to the 2371 * ms_allocatable tree. 
2372 */ 2373 range_tree_add(msp->ms_allocatable, 2374 msp->ms_start, msp->ms_size); 2375 2376 if (msp->ms_new) { 2377 /* 2378 * If the ms_sm doesn't exist, this means that this 2379 * metaslab hasn't gone through metaslab_sync() and 2380 * thus has never been dirtied. So we shouldn't 2381 * expect any unflushed allocs or frees from previous 2382 * TXGs. 2383 */ 2384 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 2385 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 2386 } 2387 } 2388 2389 /* 2390 * We need to grab the ms_sync_lock to prevent metaslab_sync() from 2391 * changing the ms_sm (or log_sm) and the metaslab's range trees 2392 * while we are about to use them and populate the ms_allocatable. 2393 * The ms_lock is insufficient for this because metaslab_sync() doesn't 2394 * hold the ms_lock while writing the ms_checkpointing tree to disk. 2395 */ 2396 mutex_enter(&msp->ms_sync_lock); 2397 mutex_enter(&msp->ms_lock); 2398 2399 ASSERT(!msp->ms_condensing); 2400 ASSERT(!msp->ms_flushing); 2401 2402 if (error != 0) { 2403 mutex_exit(&msp->ms_sync_lock); 2404 return (error); 2405 } 2406 2407 ASSERT3P(msp->ms_group, !=, NULL); 2408 msp->ms_loaded = B_TRUE; 2409 2410 /* 2411 * Apply all the unflushed changes to ms_allocatable right 2412 * away so any manipulations we do below have a clear view 2413 * of what is allocated and what is free. 2414 */ 2415 range_tree_walk(msp->ms_unflushed_allocs, 2416 range_tree_remove, msp->ms_allocatable); 2417 range_tree_walk(msp->ms_unflushed_frees, 2418 range_tree_add, msp->ms_allocatable); 2419 2420 ASSERT3P(msp->ms_group, !=, NULL); 2421 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2422 if (spa_syncing_log_sm(spa) != NULL) { 2423 ASSERT(spa_feature_is_enabled(spa, 2424 SPA_FEATURE_LOG_SPACEMAP)); 2425 2426 /* 2427 * If we use a log space map we add all the segments 2428 * that are in ms_unflushed_frees so they are available 2429 * for allocation. 
2430 * 2431 * ms_allocatable needs to contain all free segments 2432 * that are ready for allocations (thus not segments 2433 * from ms_freeing, ms_freed, and the ms_defer trees). 2434 * But if we grab the lock in this code path at a sync 2435 * pass later that 1, then it also contains the 2436 * segments of ms_freed (they were added to it earlier 2437 * in this path through ms_unflushed_frees). So we 2438 * need to remove all the segments that exist in 2439 * ms_freed from ms_allocatable as they will be added 2440 * later in metaslab_sync_done(). 2441 * 2442 * When there's no log space map, the ms_allocatable 2443 * correctly doesn't contain any segments that exist 2444 * in ms_freed [see ms_synced_length]. 2445 */ 2446 range_tree_walk(msp->ms_freed, 2447 range_tree_remove, msp->ms_allocatable); 2448 } 2449 2450 /* 2451 * If we are not using the log space map, ms_allocatable 2452 * contains the segments that exist in the ms_defer trees 2453 * [see ms_synced_length]. Thus we need to remove them 2454 * from ms_allocatable as they will be added again in 2455 * metaslab_sync_done(). 2456 * 2457 * If we are using the log space map, ms_allocatable still 2458 * contains the segments that exist in the ms_defer trees. 2459 * Not because it read them through the ms_sm though. But 2460 * because these segments are part of ms_unflushed_frees 2461 * whose segments we add to ms_allocatable earlier in this 2462 * code path. 2463 */ 2464 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2465 range_tree_walk(msp->ms_defer[t], 2466 range_tree_remove, msp->ms_allocatable); 2467 } 2468 2469 /* 2470 * Call metaslab_recalculate_weight_and_sort() now that the 2471 * metaslab is loaded so we get the metaslab's real weight. 2472 * 2473 * Unless this metaslab was created with older software and 2474 * has not yet been converted to use segment-based weight, we 2475 * expect the new weight to be better or equal to the weight 2476 * that the metaslab had while it was not loaded. 
This is 2477 * because the old weight does not take into account the 2478 * consolidation of adjacent segments between TXGs. [see 2479 * comment for ms_synchist and ms_deferhist[] for more info] 2480 */ 2481 uint64_t weight = msp->ms_weight; 2482 uint64_t max_size = msp->ms_max_size; 2483 metaslab_recalculate_weight_and_sort(msp); 2484 if (!WEIGHT_IS_SPACEBASED(weight)) 2485 ASSERT3U(weight, <=, msp->ms_weight); 2486 msp->ms_max_size = metaslab_largest_allocatable(msp); 2487 ASSERT3U(max_size, <=, msp->ms_max_size); 2488 hrtime_t load_end = gethrtime(); 2489 msp->ms_load_time = load_end; 2490 zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, " 2491 "ms_id %llu, smp_length %llu, " 2492 "unflushed_allocs %llu, unflushed_frees %llu, " 2493 "freed %llu, defer %llu + %llu, unloaded time %llu ms, " 2494 "loading_time %lld ms, ms_max_size %llu, " 2495 "max size error %lld, " 2496 "old_weight %llx, new_weight %llx", 2497 (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), 2498 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, 2499 (u_longlong_t)msp->ms_id, 2500 (u_longlong_t)space_map_length(msp->ms_sm), 2501 (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), 2502 (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), 2503 (u_longlong_t)range_tree_space(msp->ms_freed), 2504 (u_longlong_t)range_tree_space(msp->ms_defer[0]), 2505 (u_longlong_t)range_tree_space(msp->ms_defer[1]), 2506 (longlong_t)((load_start - msp->ms_unload_time) / 1000000), 2507 (longlong_t)((load_end - load_start) / 1000000), 2508 (u_longlong_t)msp->ms_max_size, 2509 (u_longlong_t)msp->ms_max_size - max_size, 2510 (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); 2511 2512 metaslab_verify_space(msp, spa_syncing_txg(spa)); 2513 mutex_exit(&msp->ms_sync_lock); 2514 return (0); 2515 } 2516 2517 int 2518 metaslab_load(metaslab_t *msp) 2519 { 2520 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2521 2522 /* 2523 * There may be another thread loading the same metaslab, if that's 2524 * the case just wait 
until the other thread is done and return. 2525 */ 2526 metaslab_load_wait(msp); 2527 if (msp->ms_loaded) 2528 return (0); 2529 VERIFY(!msp->ms_loading); 2530 ASSERT(!msp->ms_condensing); 2531 2532 /* 2533 * We set the loading flag BEFORE potentially dropping the lock to 2534 * wait for an ongoing flush (see ms_flushing below). This way other 2535 * threads know that there is already a thread that is loading this 2536 * metaslab. 2537 */ 2538 msp->ms_loading = B_TRUE; 2539 2540 /* 2541 * Wait for any in-progress flushing to finish as we drop the ms_lock 2542 * both here (during space_map_load()) and in metaslab_flush() (when 2543 * we flush our changes to the ms_sm). 2544 */ 2545 if (msp->ms_flushing) 2546 metaslab_flush_wait(msp); 2547 2548 /* 2549 * In the possibility that we were waiting for the metaslab to be 2550 * flushed (where we temporarily dropped the ms_lock), ensure that 2551 * no one else loaded the metaslab somehow. 2552 */ 2553 ASSERT(!msp->ms_loaded); 2554 2555 /* 2556 * If we're loading a metaslab in the normal class, consider evicting 2557 * another one to keep our memory usage under the limit defined by the 2558 * zfs_metaslab_mem_limit tunable. 2559 */ 2560 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == 2561 msp->ms_group->mg_class) { 2562 metaslab_potentially_evict(msp->ms_group->mg_class); 2563 } 2564 2565 int error = metaslab_load_impl(msp); 2566 2567 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2568 msp->ms_loading = B_FALSE; 2569 cv_broadcast(&msp->ms_load_cv); 2570 2571 return (error); 2572 } 2573 2574 void 2575 metaslab_unload(metaslab_t *msp) 2576 { 2577 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2578 2579 /* 2580 * This can happen if a metaslab is selected for eviction (in 2581 * metaslab_potentially_evict) and then unloaded during spa_sync (via 2582 * metaslab_class_evict_old). 
2583 */ 2584 if (!msp->ms_loaded) 2585 return; 2586 2587 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 2588 msp->ms_loaded = B_FALSE; 2589 msp->ms_unload_time = gethrtime(); 2590 2591 msp->ms_activation_weight = 0; 2592 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 2593 2594 if (msp->ms_group != NULL) { 2595 metaslab_class_t *mc = msp->ms_group->mg_class; 2596 multilist_sublist_t *mls = 2597 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); 2598 if (multilist_link_active(&msp->ms_class_txg_node)) 2599 multilist_sublist_remove(mls, msp); 2600 multilist_sublist_unlock(mls); 2601 2602 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2603 zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " 2604 "ms_id %llu, weight %llx, " 2605 "selected txg %llu (%llu ms ago), alloc_txg %llu, " 2606 "loaded %llu ms ago, max_size %llu", 2607 (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), 2608 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, 2609 (u_longlong_t)msp->ms_id, 2610 (u_longlong_t)msp->ms_weight, 2611 (u_longlong_t)msp->ms_selected_txg, 2612 (u_longlong_t)(msp->ms_unload_time - 2613 msp->ms_selected_time) / 1000 / 1000, 2614 (u_longlong_t)msp->ms_alloc_txg, 2615 (u_longlong_t)(msp->ms_unload_time - 2616 msp->ms_load_time) / 1000 / 1000, 2617 (u_longlong_t)msp->ms_max_size); 2618 } 2619 2620 /* 2621 * We explicitly recalculate the metaslab's weight based on its space 2622 * map (as it is now not loaded). We want unload metaslabs to always 2623 * have their weights calculated from the space map histograms, while 2624 * loaded ones have it calculated from their in-core range tree 2625 * [see metaslab_load()]. This way, the weight reflects the information 2626 * available in-core, whether it is loaded or not. 2627 * 2628 * If ms_group == NULL means that we came here from metaslab_fini(), 2629 * at which point it doesn't make sense for us to do the recalculation 2630 * and the sorting. 
2631 */ 2632 if (msp->ms_group != NULL) 2633 metaslab_recalculate_weight_and_sort(msp); 2634 } 2635 2636 /* 2637 * We want to optimize the memory use of the per-metaslab range 2638 * trees. To do this, we store the segments in the range trees in 2639 * units of sectors, zero-indexing from the start of the metaslab. If 2640 * the vdev_ms_shift - the vdev_ashift is less than 32, we can store 2641 * the ranges using two uint32_ts, rather than two uint64_ts. 2642 */ 2643 range_seg_type_t 2644 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, 2645 uint64_t *start, uint64_t *shift) 2646 { 2647 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && 2648 !zfs_metaslab_force_large_segs) { 2649 *shift = vdev->vdev_ashift; 2650 *start = msp->ms_start; 2651 return (RANGE_SEG32); 2652 } else { 2653 *shift = 0; 2654 *start = 0; 2655 return (RANGE_SEG64); 2656 } 2657 } 2658 2659 void 2660 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) 2661 { 2662 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2663 metaslab_class_t *mc = msp->ms_group->mg_class; 2664 multilist_sublist_t *mls = 2665 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); 2666 if (multilist_link_active(&msp->ms_class_txg_node)) 2667 multilist_sublist_remove(mls, msp); 2668 msp->ms_selected_txg = txg; 2669 msp->ms_selected_time = gethrtime(); 2670 multilist_sublist_insert_tail(mls, msp); 2671 multilist_sublist_unlock(mls); 2672 } 2673 2674 void 2675 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 2676 int64_t defer_delta, int64_t space_delta) 2677 { 2678 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 2679 2680 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 2681 ASSERT(vd->vdev_ms_count != 0); 2682 2683 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 2684 vdev_deflated_space(vd, space_delta)); 2685 } 2686 2687 int 2688 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, 2689 uint64_t txg, metaslab_t **msp) 2690 
{ 2691 vdev_t *vd = mg->mg_vd; 2692 spa_t *spa = vd->vdev_spa; 2693 objset_t *mos = spa->spa_meta_objset; 2694 metaslab_t *ms; 2695 int error; 2696 2697 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 2698 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 2699 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 2700 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 2701 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); 2702 multilist_link_init(&ms->ms_class_txg_node); 2703 2704 ms->ms_id = id; 2705 ms->ms_start = id << vd->vdev_ms_shift; 2706 ms->ms_size = 1ULL << vd->vdev_ms_shift; 2707 ms->ms_allocator = -1; 2708 ms->ms_new = B_TRUE; 2709 2710 vdev_ops_t *ops = vd->vdev_ops; 2711 if (ops->vdev_op_metaslab_init != NULL) 2712 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); 2713 2714 /* 2715 * We only open space map objects that already exist. All others 2716 * will be opened when we finally allocate an object for it. For 2717 * readonly pools there is no need to open the space map object. 2718 * 2719 * Note: 2720 * When called from vdev_expand(), we can't call into the DMU as 2721 * we are holding the spa_config_lock as a writer and we would 2722 * deadlock [see relevant comment in vdev_metaslab_init()]. in 2723 * that case, the object parameter is zero though, so we won't 2724 * call into the DMU. 
2725 */ 2726 if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && 2727 !spa->spa_read_spacemaps)) { 2728 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 2729 ms->ms_size, vd->vdev_ashift); 2730 2731 if (error != 0) { 2732 kmem_free(ms, sizeof (metaslab_t)); 2733 return (error); 2734 } 2735 2736 ASSERT(ms->ms_sm != NULL); 2737 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 2738 } 2739 2740 uint64_t shift, start; 2741 range_seg_type_t type = 2742 metaslab_calculate_range_tree_type(vd, ms, &start, &shift); 2743 2744 ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); 2745 for (int t = 0; t < TXG_SIZE; t++) { 2746 ms->ms_allocating[t] = range_tree_create(NULL, type, 2747 NULL, start, shift); 2748 } 2749 ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); 2750 ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); 2751 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2752 ms->ms_defer[t] = range_tree_create(NULL, type, NULL, 2753 start, shift); 2754 } 2755 ms->ms_checkpointing = 2756 range_tree_create(NULL, type, NULL, start, shift); 2757 ms->ms_unflushed_allocs = 2758 range_tree_create(NULL, type, NULL, start, shift); 2759 2760 metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); 2761 mrap->mra_bt = &ms->ms_unflushed_frees_by_size; 2762 mrap->mra_floor_shift = metaslab_by_size_min_shift; 2763 ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, 2764 type, mrap, start, shift); 2765 2766 ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); 2767 2768 metaslab_group_add(mg, ms); 2769 metaslab_set_fragmentation(ms, B_FALSE); 2770 2771 /* 2772 * If we're opening an existing pool (txg == 0) or creating 2773 * a new one (txg == TXG_INITIAL), all space is available now. 2774 * If we're adding space to an existing pool, the new space 2775 * does not become available until after this txg has synced. 
2776 * The metaslab's weight will also be initialized when we sync 2777 * out this txg. This ensures that we don't attempt to allocate 2778 * from it before we have initialized it completely. 2779 */ 2780 if (txg <= TXG_INITIAL) { 2781 metaslab_sync_done(ms, 0); 2782 metaslab_space_update(vd, mg->mg_class, 2783 metaslab_allocated_space(ms), 0, 0); 2784 } 2785 2786 if (txg != 0) { 2787 vdev_dirty(vd, 0, NULL, txg); 2788 vdev_dirty(vd, VDD_METASLAB, ms, txg); 2789 } 2790 2791 *msp = ms; 2792 2793 return (0); 2794 } 2795 2796 static void 2797 metaslab_fini_flush_data(metaslab_t *msp) 2798 { 2799 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2800 2801 if (metaslab_unflushed_txg(msp) == 0) { 2802 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), 2803 ==, NULL); 2804 return; 2805 } 2806 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 2807 2808 mutex_enter(&spa->spa_flushed_ms_lock); 2809 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 2810 mutex_exit(&spa->spa_flushed_ms_lock); 2811 2812 spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2813 spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), 2814 metaslab_unflushed_dirty(msp)); 2815 } 2816 2817 uint64_t 2818 metaslab_unflushed_changes_memused(metaslab_t *ms) 2819 { 2820 return ((range_tree_numsegs(ms->ms_unflushed_allocs) + 2821 range_tree_numsegs(ms->ms_unflushed_frees)) * 2822 ms->ms_unflushed_allocs->rt_root.bt_elem_size); 2823 } 2824 2825 void 2826 metaslab_fini(metaslab_t *msp) 2827 { 2828 metaslab_group_t *mg = msp->ms_group; 2829 vdev_t *vd = mg->mg_vd; 2830 spa_t *spa = vd->vdev_spa; 2831 2832 metaslab_fini_flush_data(msp); 2833 2834 metaslab_group_remove(mg, msp); 2835 2836 mutex_enter(&msp->ms_lock); 2837 VERIFY(msp->ms_group == NULL); 2838 2839 /* 2840 * If this metaslab hasn't been through metaslab_sync_done() yet its 2841 * space hasn't been accounted for in its vdev and doesn't need to be 2842 * subtracted. 
2843 */ 2844 if (!msp->ms_new) { 2845 metaslab_space_update(vd, mg->mg_class, 2846 -metaslab_allocated_space(msp), 0, -msp->ms_size); 2847 2848 } 2849 space_map_close(msp->ms_sm); 2850 msp->ms_sm = NULL; 2851 2852 metaslab_unload(msp); 2853 2854 range_tree_destroy(msp->ms_allocatable); 2855 range_tree_destroy(msp->ms_freeing); 2856 range_tree_destroy(msp->ms_freed); 2857 2858 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 2859 metaslab_unflushed_changes_memused(msp)); 2860 spa->spa_unflushed_stats.sus_memused -= 2861 metaslab_unflushed_changes_memused(msp); 2862 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 2863 range_tree_destroy(msp->ms_unflushed_allocs); 2864 range_tree_destroy(msp->ms_checkpointing); 2865 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 2866 range_tree_destroy(msp->ms_unflushed_frees); 2867 2868 for (int t = 0; t < TXG_SIZE; t++) { 2869 range_tree_destroy(msp->ms_allocating[t]); 2870 } 2871 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2872 range_tree_destroy(msp->ms_defer[t]); 2873 } 2874 ASSERT0(msp->ms_deferspace); 2875 2876 for (int t = 0; t < TXG_SIZE; t++) 2877 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 2878 2879 range_tree_vacate(msp->ms_trim, NULL, NULL); 2880 range_tree_destroy(msp->ms_trim); 2881 2882 mutex_exit(&msp->ms_lock); 2883 cv_destroy(&msp->ms_load_cv); 2884 cv_destroy(&msp->ms_flush_cv); 2885 mutex_destroy(&msp->ms_lock); 2886 mutex_destroy(&msp->ms_sync_lock); 2887 ASSERT3U(msp->ms_allocator, ==, -1); 2888 2889 kmem_free(msp, sizeof (metaslab_t)); 2890 } 2891 2892 #define FRAGMENTATION_TABLE_SIZE 17 2893 2894 /* 2895 * This table defines a segment size based fragmentation metric that will 2896 * allow each metaslab to derive its own fragmentation value. This is done 2897 * by calculating the space in each bucket of the spacemap histogram and 2898 * multiplying that by the fragmentation metric in this table. 
Doing 2899 * this for all buckets and dividing it by the total amount of free 2900 * space in this metaslab (i.e. the total free space in all buckets) gives 2901 * us the fragmentation metric. This means that a high fragmentation metric 2902 * equates to most of the free space being comprised of small segments. 2903 * Conversely, if the metric is low, then most of the free space is in 2904 * large segments. A 10% change in fragmentation equates to approximately 2905 * double the number of segments. 2906 * 2907 * This table defines 0% fragmented space using 16MB segments. Testing has 2908 * shown that segments that are greater than or equal to 16MB do not suffer 2909 * from drastic performance problems. Using this value, we derive the rest 2910 * of the table. Since the fragmentation value is never stored on disk, it 2911 * is possible to change these calculations in the future. 2912 */ 2913 static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 2914 100, /* 512B */ 2915 100, /* 1K */ 2916 98, /* 2K */ 2917 95, /* 4K */ 2918 90, /* 8K */ 2919 80, /* 16K */ 2920 70, /* 32K */ 2921 60, /* 64K */ 2922 50, /* 128K */ 2923 40, /* 256K */ 2924 30, /* 512K */ 2925 20, /* 1M */ 2926 15, /* 2M */ 2927 10, /* 4M */ 2928 5, /* 8M */ 2929 0 /* 16M */ 2930 }; 2931 2932 /* 2933 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 2934 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 2935 * been upgraded and does not support this metric. Otherwise, the return 2936 * value should be in the range [0, 100]. 
2937 */ 2938 static void 2939 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty) 2940 { 2941 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2942 uint64_t fragmentation = 0; 2943 uint64_t total = 0; 2944 boolean_t feature_enabled = spa_feature_is_enabled(spa, 2945 SPA_FEATURE_SPACEMAP_HISTOGRAM); 2946 2947 if (!feature_enabled) { 2948 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2949 return; 2950 } 2951 2952 /* 2953 * A null space map means that the entire metaslab is free 2954 * and thus is not fragmented. 2955 */ 2956 if (msp->ms_sm == NULL) { 2957 msp->ms_fragmentation = 0; 2958 return; 2959 } 2960 2961 /* 2962 * If this metaslab's space map has not been upgraded, flag it 2963 * so that we upgrade next time we encounter it. 2964 */ 2965 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 2966 uint64_t txg = spa_syncing_txg(spa); 2967 vdev_t *vd = msp->ms_group->mg_vd; 2968 2969 /* 2970 * If we've reached the final dirty txg, then we must 2971 * be shutting down the pool. We don't want to dirty 2972 * any data past this point so skip setting the condense 2973 * flag. We can retry this action the next time the pool 2974 * is imported. We also skip marking this metaslab for 2975 * condensing if the caller has explicitly set nodirty. 
2976 */ 2977 if (!nodirty && 2978 spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2979 msp->ms_condense_wanted = B_TRUE; 2980 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2981 zfs_dbgmsg("txg %llu, requesting force condense: " 2982 "ms_id %llu, vdev_id %llu", (u_longlong_t)txg, 2983 (u_longlong_t)msp->ms_id, 2984 (u_longlong_t)vd->vdev_id); 2985 } 2986 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2987 return; 2988 } 2989 2990 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2991 uint64_t space = 0; 2992 uint8_t shift = msp->ms_sm->sm_shift; 2993 2994 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2995 FRAGMENTATION_TABLE_SIZE - 1); 2996 2997 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2998 continue; 2999 3000 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 3001 total += space; 3002 3003 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 3004 fragmentation += space * zfs_frag_table[idx]; 3005 } 3006 3007 if (total > 0) 3008 fragmentation /= total; 3009 ASSERT3U(fragmentation, <=, 100); 3010 3011 msp->ms_fragmentation = fragmentation; 3012 } 3013 3014 /* 3015 * Compute a weight -- a selection preference value -- for the given metaslab. 3016 * This is based on the amount of free space, the level of fragmentation, 3017 * the LBA range, and whether the metaslab is loaded. 3018 */ 3019 static uint64_t 3020 metaslab_space_weight(metaslab_t *msp) 3021 { 3022 metaslab_group_t *mg = msp->ms_group; 3023 vdev_t *vd = mg->mg_vd; 3024 uint64_t weight, space; 3025 3026 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3027 3028 /* 3029 * The baseline weight is the metaslab's free space. 3030 */ 3031 space = msp->ms_size - metaslab_allocated_space(msp); 3032 3033 if (metaslab_fragmentation_factor_enabled && 3034 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 3035 /* 3036 * Use the fragmentation information to inversely scale 3037 * down the baseline weight. We need to ensure that we 3038 * don't exclude this metaslab completely when it's 100% 3039 * fragmented. 
To avoid this we reduce the fragmented value 3040 * by 1. 3041 */ 3042 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 3043 3044 /* 3045 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 3046 * this metaslab again. The fragmentation metric may have 3047 * decreased the space to something smaller than 3048 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 3049 * so that we can consume any remaining space. 3050 */ 3051 if (space > 0 && space < SPA_MINBLOCKSIZE) 3052 space = SPA_MINBLOCKSIZE; 3053 } 3054 weight = space; 3055 3056 /* 3057 * Modern disks have uniform bit density and constant angular velocity. 3058 * Therefore, the outer recording zones are faster (higher bandwidth) 3059 * than the inner zones by the ratio of outer to inner track diameter, 3060 * which is typically around 2:1. We account for this by assigning 3061 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 3062 * In effect, this means that we'll select the metaslab with the most 3063 * free bandwidth rather than simply the one with the most free space. 3064 */ 3065 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 3066 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 3067 ASSERT(weight >= space && weight <= 2 * space); 3068 } 3069 3070 /* 3071 * If this metaslab is one we're actively using, adjust its 3072 * weight to make it preferable to any inactive metaslab so 3073 * we'll polish it off. If the fragmentation on this metaslab 3074 * has exceed our threshold, then don't mark it active. 3075 */ 3076 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 3077 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 3078 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 3079 } 3080 3081 WEIGHT_SET_SPACEBASED(weight); 3082 return (weight); 3083 } 3084 3085 /* 3086 * Return the weight of the specified metaslab, according to the segment-based 3087 * weighting algorithm. The metaslab must be loaded. 
This function can 3088 * be called within a sync pass since it relies only on the metaslab's 3089 * range tree which is always accurate when the metaslab is loaded. 3090 */ 3091 static uint64_t 3092 metaslab_weight_from_range_tree(metaslab_t *msp) 3093 { 3094 uint64_t weight = 0; 3095 uint32_t segments = 0; 3096 3097 ASSERT(msp->ms_loaded); 3098 3099 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 3100 i--) { 3101 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 3102 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 3103 3104 segments <<= 1; 3105 segments += msp->ms_allocatable->rt_histogram[i]; 3106 3107 /* 3108 * The range tree provides more precision than the space map 3109 * and must be downgraded so that all values fit within the 3110 * space map's histogram. This allows us to compare loaded 3111 * vs. unloaded metaslabs to determine which metaslab is 3112 * considered "best". 3113 */ 3114 if (i > max_idx) 3115 continue; 3116 3117 if (segments != 0) { 3118 WEIGHT_SET_COUNT(weight, segments); 3119 WEIGHT_SET_INDEX(weight, i); 3120 WEIGHT_SET_ACTIVE(weight, 0); 3121 break; 3122 } 3123 } 3124 return (weight); 3125 } 3126 3127 /* 3128 * Calculate the weight based on the on-disk histogram. Should be applied 3129 * only to unloaded metaslabs (i.e no incoming allocations) in-order to 3130 * give results consistent with the on-disk state 3131 */ 3132 static uint64_t 3133 metaslab_weight_from_spacemap(metaslab_t *msp) 3134 { 3135 space_map_t *sm = msp->ms_sm; 3136 ASSERT(!msp->ms_loaded); 3137 ASSERT(sm != NULL); 3138 ASSERT3U(space_map_object(sm), !=, 0); 3139 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 3140 3141 /* 3142 * Create a joint histogram from all the segments that have made 3143 * it to the metaslab's space map histogram, that are not yet 3144 * available for allocation because they are still in the freeing 3145 * pipeline (e.g. freeing, freed, and defer trees). 
Then subtract 3146 * these segments from the space map's histogram to get a more 3147 * accurate weight. 3148 */ 3149 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 3150 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 3151 deferspace_histogram[i] += msp->ms_synchist[i]; 3152 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3153 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 3154 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 3155 } 3156 } 3157 3158 uint64_t weight = 0; 3159 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 3160 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 3161 deferspace_histogram[i]); 3162 uint64_t count = 3163 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 3164 if (count != 0) { 3165 WEIGHT_SET_COUNT(weight, count); 3166 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 3167 WEIGHT_SET_ACTIVE(weight, 0); 3168 break; 3169 } 3170 } 3171 return (weight); 3172 } 3173 3174 /* 3175 * Compute a segment-based weight for the specified metaslab. The weight 3176 * is determined by highest bucket in the histogram. The information 3177 * for the highest bucket is encoded into the weight value. 3178 */ 3179 static uint64_t 3180 metaslab_segment_weight(metaslab_t *msp) 3181 { 3182 metaslab_group_t *mg = msp->ms_group; 3183 uint64_t weight = 0; 3184 uint8_t shift = mg->mg_vd->vdev_ashift; 3185 3186 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3187 3188 /* 3189 * The metaslab is completely free. 
3190 */ 3191 if (metaslab_allocated_space(msp) == 0) { 3192 int idx = highbit64(msp->ms_size) - 1; 3193 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 3194 3195 if (idx < max_idx) { 3196 WEIGHT_SET_COUNT(weight, 1ULL); 3197 WEIGHT_SET_INDEX(weight, idx); 3198 } else { 3199 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 3200 WEIGHT_SET_INDEX(weight, max_idx); 3201 } 3202 WEIGHT_SET_ACTIVE(weight, 0); 3203 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 3204 return (weight); 3205 } 3206 3207 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 3208 3209 /* 3210 * If the metaslab is fully allocated then just make the weight 0. 3211 */ 3212 if (metaslab_allocated_space(msp) == msp->ms_size) 3213 return (0); 3214 /* 3215 * If the metaslab is already loaded, then use the range tree to 3216 * determine the weight. Otherwise, we rely on the space map information 3217 * to generate the weight. 3218 */ 3219 if (msp->ms_loaded) { 3220 weight = metaslab_weight_from_range_tree(msp); 3221 } else { 3222 weight = metaslab_weight_from_spacemap(msp); 3223 } 3224 3225 /* 3226 * If the metaslab was active the last time we calculated its weight 3227 * then keep it active. We want to consume the entire region that 3228 * is associated with this weight. 3229 */ 3230 if (msp->ms_activation_weight != 0 && weight != 0) 3231 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 3232 return (weight); 3233 } 3234 3235 /* 3236 * Determine if we should attempt to allocate from this metaslab. If the 3237 * metaslab is loaded, then we can determine if the desired allocation 3238 * can be satisfied by looking at the size of the maximum free segment 3239 * on that metaslab. Otherwise, we make our decision based on the metaslab's 3240 * weight. For segment-based weighting we can determine the maximum 3241 * allocation based on the index encoded in its value. For space-based 3242 * weights we rely on the entire weight (excluding the weight-type bit). 
3243 */ 3244 static boolean_t 3245 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) 3246 { 3247 /* 3248 * This case will usually but not always get caught by the checks below; 3249 * metaslabs can be loaded by various means, including the trim and 3250 * initialize code. Once that happens, without this check they are 3251 * allocatable even before they finish their first txg sync. 3252 */ 3253 if (unlikely(msp->ms_new)) 3254 return (B_FALSE); 3255 3256 /* 3257 * If the metaslab is loaded, ms_max_size is definitive and we can use 3258 * the fast check. If it's not, the ms_max_size is a lower bound (once 3259 * set), and we should use the fast check as long as we're not in 3260 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec 3261 * seconds since the metaslab was unloaded. 3262 */ 3263 if (msp->ms_loaded || 3264 (msp->ms_max_size != 0 && !try_hard && gethrtime() < 3265 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) 3266 return (msp->ms_max_size >= asize); 3267 3268 boolean_t should_allocate; 3269 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3270 /* 3271 * The metaslab segment weight indicates segments in the 3272 * range [2^i, 2^(i+1)), where i is the index in the weight. 3273 * Since the asize might be in the middle of the range, we 3274 * should attempt the allocation if asize < 2^(i+1). 3275 */ 3276 should_allocate = (asize < 3277 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 3278 } else { 3279 should_allocate = (asize <= 3280 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 3281 } 3282 3283 return (should_allocate); 3284 } 3285 3286 static uint64_t 3287 metaslab_weight(metaslab_t *msp, boolean_t nodirty) 3288 { 3289 vdev_t *vd = msp->ms_group->mg_vd; 3290 spa_t *spa = vd->vdev_spa; 3291 uint64_t weight; 3292 3293 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3294 3295 metaslab_set_fragmentation(msp, nodirty); 3296 3297 /* 3298 * Update the maximum size. 
If the metaslab is loaded, this will 3299 * ensure that we get an accurate maximum size if newly freed space 3300 * has been added back into the free tree. If the metaslab is 3301 * unloaded, we check if there's a larger free segment in the 3302 * unflushed frees. This is a lower bound on the largest allocatable 3303 * segment size. Coalescing of adjacent entries may reveal larger 3304 * allocatable segments, but we aren't aware of those until loading 3305 * the space map into a range tree. 3306 */ 3307 if (msp->ms_loaded) { 3308 msp->ms_max_size = metaslab_largest_allocatable(msp); 3309 } else { 3310 msp->ms_max_size = MAX(msp->ms_max_size, 3311 metaslab_largest_unflushed_free(msp)); 3312 } 3313 3314 /* 3315 * Segment-based weighting requires space map histogram support. 3316 */ 3317 if (zfs_metaslab_segment_weight_enabled && 3318 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 3319 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 3320 sizeof (space_map_phys_t))) { 3321 weight = metaslab_segment_weight(msp); 3322 } else { 3323 weight = metaslab_space_weight(msp); 3324 } 3325 return (weight); 3326 } 3327 3328 void 3329 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 3330 { 3331 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3332 3333 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 3334 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 3335 metaslab_group_sort(msp->ms_group, msp, 3336 metaslab_weight(msp, B_FALSE) | was_active); 3337 } 3338 3339 static int 3340 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 3341 int allocator, uint64_t activation_weight) 3342 { 3343 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 3344 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3345 3346 /* 3347 * If we're activating for the claim code, we don't want to actually 3348 * set the metaslab up for a specific allocator. 
3349 */ 3350 if (activation_weight == METASLAB_WEIGHT_CLAIM) { 3351 ASSERT0(msp->ms_activation_weight); 3352 msp->ms_activation_weight = msp->ms_weight; 3353 metaslab_group_sort(mg, msp, msp->ms_weight | 3354 activation_weight); 3355 return (0); 3356 } 3357 3358 metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 3359 &mga->mga_primary : &mga->mga_secondary); 3360 3361 mutex_enter(&mg->mg_lock); 3362 if (*mspp != NULL) { 3363 mutex_exit(&mg->mg_lock); 3364 return (EEXIST); 3365 } 3366 3367 *mspp = msp; 3368 ASSERT3S(msp->ms_allocator, ==, -1); 3369 msp->ms_allocator = allocator; 3370 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 3371 3372 ASSERT0(msp->ms_activation_weight); 3373 msp->ms_activation_weight = msp->ms_weight; 3374 metaslab_group_sort_impl(mg, msp, 3375 msp->ms_weight | activation_weight); 3376 mutex_exit(&mg->mg_lock); 3377 3378 return (0); 3379 } 3380 3381 static int 3382 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 3383 { 3384 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3385 3386 /* 3387 * The current metaslab is already activated for us so there 3388 * is nothing to do. Already activated though, doesn't mean 3389 * that this metaslab is activated for our allocator nor our 3390 * requested activation weight. The metaslab could have started 3391 * as an active one for our allocator but changed allocators 3392 * while we were waiting to grab its ms_lock or we stole it 3393 * [see find_valid_metaslab()]. This means that there is a 3394 * possibility of passivating a metaslab of another allocator 3395 * or from a different activation mask, from this thread. 
3396 */ 3397 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 3398 ASSERT(msp->ms_loaded); 3399 return (0); 3400 } 3401 3402 int error = metaslab_load(msp); 3403 if (error != 0) { 3404 metaslab_group_sort(msp->ms_group, msp, 0); 3405 return (error); 3406 } 3407 3408 /* 3409 * When entering metaslab_load() we may have dropped the 3410 * ms_lock because we were loading this metaslab, or we 3411 * were waiting for another thread to load it for us. In 3412 * that scenario, we recheck the weight of the metaslab 3413 * to see if it was activated by another thread. 3414 * 3415 * If the metaslab was activated for another allocator or 3416 * it was activated with a different activation weight (e.g. 3417 * we wanted to make it a primary but it was activated as 3418 * secondary) we return error (EBUSY). 3419 * 3420 * If the metaslab was activated for the same allocator 3421 * and requested activation mask, skip activating it. 3422 */ 3423 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 3424 if (msp->ms_allocator != allocator) 3425 return (EBUSY); 3426 3427 if ((msp->ms_weight & activation_weight) == 0) 3428 return (SET_ERROR(EBUSY)); 3429 3430 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), 3431 msp->ms_primary); 3432 return (0); 3433 } 3434 3435 /* 3436 * If the metaslab has literally 0 space, it will have weight 0. In 3437 * that case, don't bother activating it. This can happen if the 3438 * metaslab had space during find_valid_metaslab, but another thread 3439 * loaded it and used all that space while we were waiting to grab the 3440 * lock. 
3441 */ 3442 if (msp->ms_weight == 0) { 3443 ASSERT0(range_tree_space(msp->ms_allocatable)); 3444 return (SET_ERROR(ENOSPC)); 3445 } 3446 3447 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 3448 allocator, activation_weight)) != 0) { 3449 return (error); 3450 } 3451 3452 ASSERT(msp->ms_loaded); 3453 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 3454 3455 return (0); 3456 } 3457 3458 static void 3459 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 3460 uint64_t weight) 3461 { 3462 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3463 ASSERT(msp->ms_loaded); 3464 3465 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 3466 metaslab_group_sort(mg, msp, weight); 3467 return; 3468 } 3469 3470 mutex_enter(&mg->mg_lock); 3471 ASSERT3P(msp->ms_group, ==, mg); 3472 ASSERT3S(0, <=, msp->ms_allocator); 3473 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 3474 3475 metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; 3476 if (msp->ms_primary) { 3477 ASSERT3P(mga->mga_primary, ==, msp); 3478 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 3479 mga->mga_primary = NULL; 3480 } else { 3481 ASSERT3P(mga->mga_secondary, ==, msp); 3482 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 3483 mga->mga_secondary = NULL; 3484 } 3485 msp->ms_allocator = -1; 3486 metaslab_group_sort_impl(mg, msp, weight); 3487 mutex_exit(&mg->mg_lock); 3488 } 3489 3490 static void 3491 metaslab_passivate(metaslab_t *msp, uint64_t weight) 3492 { 3493 uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; 3494 3495 /* 3496 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 3497 * this metaslab again. In that case, it had better be empty, 3498 * or we would be leaving space on the table. 
3499 */ 3500 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || 3501 size >= SPA_MINBLOCKSIZE || 3502 range_tree_space(msp->ms_allocatable) == 0); 3503 ASSERT0(weight & METASLAB_ACTIVE_MASK); 3504 3505 ASSERT(msp->ms_activation_weight != 0); 3506 msp->ms_activation_weight = 0; 3507 metaslab_passivate_allocator(msp->ms_group, msp, weight); 3508 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); 3509 } 3510 3511 /* 3512 * Segment-based metaslabs are activated once and remain active until 3513 * we either fail an allocation attempt (similar to space-based metaslabs) 3514 * or have exhausted the free space in zfs_metaslab_switch_threshold 3515 * buckets since the metaslab was activated. This function checks to see 3516 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the 3517 * metaslab and passivates it proactively. This will allow us to select a 3518 * metaslab with a larger contiguous region, if any, remaining within this 3519 * metaslab group. If we're in sync pass > 1, then we continue using this 3520 * metaslab so that we don't dirty more block and cause more sync passes. 3521 */ 3522 static void 3523 metaslab_segment_may_passivate(metaslab_t *msp) 3524 { 3525 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3526 3527 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 3528 return; 3529 3530 /* 3531 * Since we are in the middle of a sync pass, the most accurate 3532 * information that is accessible to us is the in-core range tree 3533 * histogram; calculate the new weight based on that information. 
3534 */ 3535 uint64_t weight = metaslab_weight_from_range_tree(msp); 3536 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 3537 int current_idx = WEIGHT_GET_INDEX(weight); 3538 3539 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 3540 metaslab_passivate(msp, weight); 3541 } 3542 3543 static void 3544 metaslab_preload(void *arg) 3545 { 3546 metaslab_t *msp = arg; 3547 metaslab_class_t *mc = msp->ms_group->mg_class; 3548 spa_t *spa = mc->mc_spa; 3549 fstrans_cookie_t cookie = spl_fstrans_mark(); 3550 3551 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 3552 3553 mutex_enter(&msp->ms_lock); 3554 (void) metaslab_load(msp); 3555 metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); 3556 mutex_exit(&msp->ms_lock); 3557 spl_fstrans_unmark(cookie); 3558 } 3559 3560 static void 3561 metaslab_group_preload(metaslab_group_t *mg) 3562 { 3563 spa_t *spa = mg->mg_vd->vdev_spa; 3564 metaslab_t *msp; 3565 avl_tree_t *t = &mg->mg_metaslab_tree; 3566 int m = 0; 3567 3568 if (spa_shutting_down(spa) || !metaslab_preload_enabled) 3569 return; 3570 3571 mutex_enter(&mg->mg_lock); 3572 3573 /* 3574 * Load the next potential metaslabs 3575 */ 3576 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 3577 ASSERT3P(msp->ms_group, ==, mg); 3578 3579 /* 3580 * We preload only the maximum number of metaslabs specified 3581 * by metaslab_preload_limit. If a metaslab is being forced 3582 * to condense then we preload it too. This will ensure 3583 * that force condensing happens in the next txg. 3584 */ 3585 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 3586 continue; 3587 } 3588 3589 VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, 3590 msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) 3591 != TASKQID_INVALID); 3592 } 3593 mutex_exit(&mg->mg_lock); 3594 } 3595 3596 /* 3597 * Determine if the space map's on-disk footprint is past our tolerance for 3598 * inefficiency. 
We would like to use the following criteria to make our 3599 * decision: 3600 * 3601 * 1. Do not condense if the size of the space map object would dramatically 3602 * increase as a result of writing out the free space range tree. 3603 * 3604 * 2. Condense if the on on-disk space map representation is at least 3605 * zfs_condense_pct/100 times the size of the optimal representation 3606 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). 3607 * 3608 * 3. Do not condense if the on-disk size of the space map does not actually 3609 * decrease. 3610 * 3611 * Unfortunately, we cannot compute the on-disk size of the space map in this 3612 * context because we cannot accurately compute the effects of compression, etc. 3613 * Instead, we apply the heuristic described in the block comment for 3614 * zfs_metaslab_condense_block_threshold - we only condense if the space used 3615 * is greater than a threshold number of blocks. 3616 */ 3617 static boolean_t 3618 metaslab_should_condense(metaslab_t *msp) 3619 { 3620 space_map_t *sm = msp->ms_sm; 3621 vdev_t *vd = msp->ms_group->mg_vd; 3622 uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; 3623 3624 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3625 ASSERT(msp->ms_loaded); 3626 ASSERT(sm != NULL); 3627 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); 3628 3629 /* 3630 * We always condense metaslabs that are empty and metaslabs for 3631 * which a condense request has been made. 
3632 */ 3633 if (range_tree_numsegs(msp->ms_allocatable) == 0 || 3634 msp->ms_condense_wanted) 3635 return (B_TRUE); 3636 3637 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); 3638 uint64_t object_size = space_map_length(sm); 3639 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 3640 msp->ms_allocatable, SM_NO_VDEVID); 3641 3642 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 3643 object_size > zfs_metaslab_condense_block_threshold * record_size); 3644 } 3645 3646 /* 3647 * Condense the on-disk space map representation to its minimized form. 3648 * The minimized form consists of a small number of allocations followed 3649 * by the entries of the free range tree (ms_allocatable). The condensed 3650 * spacemap contains all the entries of previous TXGs (including those in 3651 * the pool-wide log spacemaps; thus this is effectively a superset of 3652 * metaslab_flush()), but this TXG's entries still need to be written. 3653 */ 3654 static void 3655 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) 3656 { 3657 range_tree_t *condense_tree; 3658 space_map_t *sm = msp->ms_sm; 3659 uint64_t txg = dmu_tx_get_txg(tx); 3660 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3661 3662 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3663 ASSERT(msp->ms_loaded); 3664 ASSERT(msp->ms_sm != NULL); 3665 3666 /* 3667 * In order to condense the space map, we need to change it so it 3668 * only describes which segments are currently allocated and free. 3669 * 3670 * All the current free space resides in the ms_allocatable, all 3671 * the ms_defer trees, and all the ms_allocating trees. We ignore 3672 * ms_freed because it is empty because we're in sync pass 1. We 3673 * ignore ms_freeing because these changes are not yet reflected 3674 * in the spacemap (they will be written later this txg). 
3675 * 3676 * So to truncate the space map to represent all the entries of 3677 * previous TXGs we do the following: 3678 * 3679 * 1] We create a range tree (condense tree) that is 100% empty. 3680 * 2] We add to it all segments found in the ms_defer trees 3681 * as those segments are marked as free in the original space 3682 * map. We do the same with the ms_allocating trees for the same 3683 * reason. Adding these segments should be a relatively 3684 * inexpensive operation since we expect these trees to have a 3685 * small number of nodes. 3686 * 3] We vacate any unflushed allocs, since they are not frees we 3687 * need to add to the condense tree. Then we vacate any 3688 * unflushed frees as they should already be part of ms_allocatable. 3689 * 4] At this point, we would ideally like to add all segments 3690 * in the ms_allocatable tree from the condense tree. This way 3691 * we would write all the entries of the condense tree as the 3692 * condensed space map, which would only contain freed 3693 * segments with everything else assumed to be allocated. 3694 * 3695 * Doing so can be prohibitively expensive as ms_allocatable can 3696 * be large, and therefore computationally expensive to add to 3697 * the condense_tree. Instead we first sync out an entry marking 3698 * everything as allocated, then the condense_tree and then the 3699 * ms_allocatable, in the condensed space map. While this is not 3700 * optimal, it is typically close to optimal and more importantly 3701 * much cheaper to compute. 3702 * 3703 * 5] Finally, as both of the unflushed trees were written to our 3704 * new and condensed metaslab space map, we basically flushed 3705 * all the unflushed changes to disk, thus we call 3706 * metaslab_flush_update(). 
3707 */ 3708 ASSERT3U(spa_sync_pass(spa), ==, 1); 3709 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ 3710 3711 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " 3712 "spa %s, smp size %llu, segments %llu, forcing condense=%s", 3713 (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, 3714 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, 3715 spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), 3716 (u_longlong_t)range_tree_numsegs(msp->ms_allocatable), 3717 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 3718 3719 msp->ms_condense_wanted = B_FALSE; 3720 3721 range_seg_type_t type; 3722 uint64_t shift, start; 3723 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, 3724 &start, &shift); 3725 3726 condense_tree = range_tree_create(NULL, type, NULL, start, shift); 3727 3728 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3729 range_tree_walk(msp->ms_defer[t], 3730 range_tree_add, condense_tree); 3731 } 3732 3733 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 3734 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 3735 range_tree_add, condense_tree); 3736 } 3737 3738 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3739 metaslab_unflushed_changes_memused(msp)); 3740 spa->spa_unflushed_stats.sus_memused -= 3741 metaslab_unflushed_changes_memused(msp); 3742 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3743 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3744 3745 /* 3746 * We're about to drop the metaslab's lock thus allowing other 3747 * consumers to change it's content. Set the metaslab's ms_condensing 3748 * flag to ensure that allocations on this metaslab do not occur 3749 * while we're in the middle of committing it to disk. This is only 3750 * critical for ms_allocatable as all other range trees use per TXG 3751 * views of their content. 
3752 */ 3753 msp->ms_condensing = B_TRUE; 3754 3755 mutex_exit(&msp->ms_lock); 3756 uint64_t object = space_map_object(msp->ms_sm); 3757 space_map_truncate(sm, 3758 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 3759 zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); 3760 3761 /* 3762 * space_map_truncate() may have reallocated the spacemap object. 3763 * If so, update the vdev_ms_array. 3764 */ 3765 if (space_map_object(msp->ms_sm) != object) { 3766 object = space_map_object(msp->ms_sm); 3767 dmu_write(spa->spa_meta_objset, 3768 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * 3769 msp->ms_id, sizeof (uint64_t), &object, tx); 3770 } 3771 3772 /* 3773 * Note: 3774 * When the log space map feature is enabled, each space map will 3775 * always have ALLOCS followed by FREES for each sync pass. This is 3776 * typically true even when the log space map feature is disabled, 3777 * except from the case where a metaslab goes through metaslab_sync() 3778 * and gets condensed. In that case the metaslab's space map will have 3779 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS 3780 * followed by FREES (due to space_map_write() in metaslab_sync()) for 3781 * sync pass 1. 
3782 */ 3783 range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, 3784 shift); 3785 range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); 3786 space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); 3787 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 3788 space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); 3789 3790 range_tree_vacate(condense_tree, NULL, NULL); 3791 range_tree_destroy(condense_tree); 3792 range_tree_vacate(tmp_tree, NULL, NULL); 3793 range_tree_destroy(tmp_tree); 3794 mutex_enter(&msp->ms_lock); 3795 3796 msp->ms_condensing = B_FALSE; 3797 metaslab_flush_update(msp, tx); 3798 } 3799 3800 static void 3801 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) 3802 { 3803 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3804 ASSERT(spa_syncing_log_sm(spa) != NULL); 3805 ASSERT(msp->ms_sm != NULL); 3806 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3807 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3808 3809 mutex_enter(&spa->spa_flushed_ms_lock); 3810 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3811 metaslab_set_unflushed_dirty(msp, B_TRUE); 3812 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3813 mutex_exit(&spa->spa_flushed_ms_lock); 3814 3815 spa_log_sm_increment_current_mscount(spa); 3816 spa_log_summary_add_flushed_metaslab(spa, B_TRUE); 3817 } 3818 3819 void 3820 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) 3821 { 3822 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3823 ASSERT(spa_syncing_log_sm(spa) != NULL); 3824 ASSERT(msp->ms_sm != NULL); 3825 ASSERT(metaslab_unflushed_txg(msp) != 0); 3826 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); 3827 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3828 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3829 3830 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); 3831 3832 /* update metaslab's position in our flushing tree */ 3833 uint64_t ms_prev_flushed_txg = 
metaslab_unflushed_txg(msp); 3834 boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); 3835 mutex_enter(&spa->spa_flushed_ms_lock); 3836 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 3837 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3838 metaslab_set_unflushed_dirty(msp, dirty); 3839 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3840 mutex_exit(&spa->spa_flushed_ms_lock); 3841 3842 /* update metaslab counts of spa_log_sm_t nodes */ 3843 spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); 3844 spa_log_sm_increment_current_mscount(spa); 3845 3846 /* update log space map summary */ 3847 spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, 3848 ms_prev_flushed_dirty); 3849 spa_log_summary_add_flushed_metaslab(spa, dirty); 3850 3851 /* cleanup obsolete logs if any */ 3852 spa_cleanup_old_sm_logs(spa, tx); 3853 } 3854 3855 /* 3856 * Called when the metaslab has been flushed (its own spacemap now reflects 3857 * all the contents of the pool-wide spacemap log). Updates the metaslab's 3858 * metadata and any pool-wide related log space map data (e.g. summary, 3859 * obsolete logs, etc..) to reflect that. 3860 */ 3861 static void 3862 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) 3863 { 3864 metaslab_group_t *mg = msp->ms_group; 3865 spa_t *spa = mg->mg_vd->vdev_spa; 3866 3867 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3868 3869 ASSERT3U(spa_sync_pass(spa), ==, 1); 3870 3871 /* 3872 * Just because a metaslab got flushed, that doesn't mean that 3873 * it will pass through metaslab_sync_done(). Thus, make sure to 3874 * update ms_synced_length here in case it doesn't. 3875 */ 3876 msp->ms_synced_length = space_map_length(msp->ms_sm); 3877 3878 /* 3879 * We may end up here from metaslab_condense() without the 3880 * feature being active. In that case this is a no-op. 
3881 */ 3882 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || 3883 metaslab_unflushed_txg(msp) == 0) 3884 return; 3885 3886 metaslab_unflushed_bump(msp, tx, B_FALSE); 3887 } 3888 3889 boolean_t 3890 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) 3891 { 3892 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3893 3894 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3895 ASSERT3U(spa_sync_pass(spa), ==, 1); 3896 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 3897 3898 ASSERT(msp->ms_sm != NULL); 3899 ASSERT(metaslab_unflushed_txg(msp) != 0); 3900 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); 3901 3902 /* 3903 * There is nothing wrong with flushing the same metaslab twice, as 3904 * this codepath should work on that case. However, the current 3905 * flushing scheme makes sure to avoid this situation as we would be 3906 * making all these calls without having anything meaningful to write 3907 * to disk. We assert this behavior here. 3908 */ 3909 ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); 3910 3911 /* 3912 * We can not flush while loading, because then we would 3913 * not load the ms_unflushed_{allocs,frees}. 3914 */ 3915 if (msp->ms_loading) 3916 return (B_FALSE); 3917 3918 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3919 metaslab_verify_weight_and_frag(msp); 3920 3921 /* 3922 * Metaslab condensing is effectively flushing. Therefore if the 3923 * metaslab can be condensed we can just condense it instead of 3924 * flushing it. 3925 * 3926 * Note that metaslab_condense() does call metaslab_flush_update() 3927 * so we can just return immediately after condensing. We also 3928 * don't need to care about setting ms_flushing or broadcasting 3929 * ms_flush_cv, even if we temporarily drop the ms_lock in 3930 * metaslab_condense(), as the metaslab is already loaded. 
3931 */ 3932 if (msp->ms_loaded && metaslab_should_condense(msp)) { 3933 metaslab_group_t *mg = msp->ms_group; 3934 3935 /* 3936 * For all histogram operations below refer to the 3937 * comments of metaslab_sync() where we follow a 3938 * similar procedure. 3939 */ 3940 metaslab_group_histogram_verify(mg); 3941 metaslab_class_histogram_verify(mg->mg_class); 3942 metaslab_group_histogram_remove(mg, msp); 3943 3944 metaslab_condense(msp, tx); 3945 3946 space_map_histogram_clear(msp->ms_sm); 3947 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 3948 ASSERT(range_tree_is_empty(msp->ms_freed)); 3949 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3950 space_map_histogram_add(msp->ms_sm, 3951 msp->ms_defer[t], tx); 3952 } 3953 metaslab_aux_histograms_update(msp); 3954 3955 metaslab_group_histogram_add(mg, msp); 3956 metaslab_group_histogram_verify(mg); 3957 metaslab_class_histogram_verify(mg->mg_class); 3958 3959 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3960 3961 /* 3962 * Since we recreated the histogram (and potentially 3963 * the ms_sm too while condensing) ensure that the 3964 * weight is updated too because we are not guaranteed 3965 * that this metaslab is dirty and will go through 3966 * metaslab_sync_done(). 
3967 */ 3968 metaslab_recalculate_weight_and_sort(msp); 3969 return (B_TRUE); 3970 } 3971 3972 msp->ms_flushing = B_TRUE; 3973 uint64_t sm_len_before = space_map_length(msp->ms_sm); 3974 3975 mutex_exit(&msp->ms_lock); 3976 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, 3977 SM_NO_VDEVID, tx); 3978 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, 3979 SM_NO_VDEVID, tx); 3980 mutex_enter(&msp->ms_lock); 3981 3982 uint64_t sm_len_after = space_map_length(msp->ms_sm); 3983 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 3984 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " 3985 "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " 3986 "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx), 3987 spa_name(spa), 3988 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, 3989 (u_longlong_t)msp->ms_id, 3990 (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), 3991 (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), 3992 (u_longlong_t)(sm_len_after - sm_len_before)); 3993 } 3994 3995 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3996 metaslab_unflushed_changes_memused(msp)); 3997 spa->spa_unflushed_stats.sus_memused -= 3998 metaslab_unflushed_changes_memused(msp); 3999 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 4000 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 4001 4002 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 4003 metaslab_verify_weight_and_frag(msp); 4004 4005 metaslab_flush_update(msp, tx); 4006 4007 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 4008 metaslab_verify_weight_and_frag(msp); 4009 4010 msp->ms_flushing = B_FALSE; 4011 cv_broadcast(&msp->ms_flush_cv); 4012 return (B_TRUE); 4013 } 4014 4015 /* 4016 * Write a metaslab to disk in the context of the specified transaction group. 
4017 */ 4018 void 4019 metaslab_sync(metaslab_t *msp, uint64_t txg) 4020 { 4021 metaslab_group_t *mg = msp->ms_group; 4022 vdev_t *vd = mg->mg_vd; 4023 spa_t *spa = vd->vdev_spa; 4024 objset_t *mos = spa_meta_objset(spa); 4025 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 4026 dmu_tx_t *tx; 4027 4028 ASSERT(!vd->vdev_ishole); 4029 4030 /* 4031 * This metaslab has just been added so there's no work to do now. 4032 */ 4033 if (msp->ms_new) { 4034 ASSERT0(range_tree_space(alloctree)); 4035 ASSERT0(range_tree_space(msp->ms_freeing)); 4036 ASSERT0(range_tree_space(msp->ms_freed)); 4037 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4038 ASSERT0(range_tree_space(msp->ms_trim)); 4039 return; 4040 } 4041 4042 /* 4043 * Normally, we don't want to process a metaslab if there are no 4044 * allocations or frees to perform. However, if the metaslab is being 4045 * forced to condense, it's loaded and we're not beyond the final 4046 * dirty txg, we need to let it through. Not condensing beyond the 4047 * final dirty txg prevents an issue where metaslabs that need to be 4048 * condensed but were loaded for other reasons could cause a panic 4049 * here. By only checking the txg in that branch of the conditional, 4050 * we preserve the utility of the VERIFY statements in all other 4051 * cases. 4052 */ 4053 if (range_tree_is_empty(alloctree) && 4054 range_tree_is_empty(msp->ms_freeing) && 4055 range_tree_is_empty(msp->ms_checkpointing) && 4056 !(msp->ms_loaded && msp->ms_condense_wanted && 4057 txg <= spa_final_dirty_txg(spa))) 4058 return; 4059 4060 4061 VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); 4062 4063 /* 4064 * The only state that can actually be changing concurrently 4065 * with metaslab_sync() is the metaslab's ms_allocatable. No 4066 * other thread can be modifying this txg's alloc, freeing, 4067 * freed, or space_map_phys_t. We drop ms_lock whenever we 4068 * could call into the DMU, because the DMU can call down to 4069 * us (e.g. 
via zio_free()) at any time. 4070 * 4071 * The spa_vdev_remove_thread() can be reading metaslab state 4072 * concurrently, and it is locked out by the ms_sync_lock. 4073 * Note that the ms_lock is insufficient for this, because it 4074 * is dropped by space_map_write(). 4075 */ 4076 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 4077 4078 /* 4079 * Generate a log space map if one doesn't exist already. 4080 */ 4081 spa_generate_syncing_log_sm(spa, tx); 4082 4083 if (msp->ms_sm == NULL) { 4084 uint64_t new_object = space_map_alloc(mos, 4085 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 4086 zfs_metaslab_sm_blksz_with_log : 4087 zfs_metaslab_sm_blksz_no_log, tx); 4088 VERIFY3U(new_object, !=, 0); 4089 4090 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 4091 msp->ms_id, sizeof (uint64_t), &new_object, tx); 4092 4093 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 4094 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 4095 ASSERT(msp->ms_sm != NULL); 4096 4097 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 4098 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 4099 ASSERT0(metaslab_allocated_space(msp)); 4100 } 4101 4102 if (!range_tree_is_empty(msp->ms_checkpointing) && 4103 vd->vdev_checkpoint_sm == NULL) { 4104 ASSERT(spa_has_checkpoint(spa)); 4105 4106 uint64_t new_object = space_map_alloc(mos, 4107 zfs_vdev_standard_sm_blksz, tx); 4108 VERIFY3U(new_object, !=, 0); 4109 4110 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 4111 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 4112 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 4113 4114 /* 4115 * We save the space map object as an entry in vdev_top_zap 4116 * so it can be retrieved when the pool is reopened after an 4117 * export or through zdb. 
4118 */ 4119 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 4120 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 4121 sizeof (new_object), 1, &new_object, tx)); 4122 } 4123 4124 mutex_enter(&msp->ms_sync_lock); 4125 mutex_enter(&msp->ms_lock); 4126 4127 /* 4128 * Note: metaslab_condense() clears the space map's histogram. 4129 * Therefore we must verify and remove this histogram before 4130 * condensing. 4131 */ 4132 metaslab_group_histogram_verify(mg); 4133 metaslab_class_histogram_verify(mg->mg_class); 4134 metaslab_group_histogram_remove(mg, msp); 4135 4136 if (spa->spa_sync_pass == 1 && msp->ms_loaded && 4137 metaslab_should_condense(msp)) 4138 metaslab_condense(msp, tx); 4139 4140 /* 4141 * We'll be going to disk to sync our space accounting, thus we 4142 * drop the ms_lock during that time so allocations coming from 4143 * open-context (ZIL) for future TXGs do not block. 4144 */ 4145 mutex_exit(&msp->ms_lock); 4146 space_map_t *log_sm = spa_syncing_log_sm(spa); 4147 if (log_sm != NULL) { 4148 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 4149 if (metaslab_unflushed_txg(msp) == 0) 4150 metaslab_unflushed_add(msp, tx); 4151 else if (!metaslab_unflushed_dirty(msp)) 4152 metaslab_unflushed_bump(msp, tx, B_TRUE); 4153 4154 space_map_write(log_sm, alloctree, SM_ALLOC, 4155 vd->vdev_id, tx); 4156 space_map_write(log_sm, msp->ms_freeing, SM_FREE, 4157 vd->vdev_id, tx); 4158 mutex_enter(&msp->ms_lock); 4159 4160 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 4161 metaslab_unflushed_changes_memused(msp)); 4162 spa->spa_unflushed_stats.sus_memused -= 4163 metaslab_unflushed_changes_memused(msp); 4164 range_tree_remove_xor_add(alloctree, 4165 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); 4166 range_tree_remove_xor_add(msp->ms_freeing, 4167 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); 4168 spa->spa_unflushed_stats.sus_memused += 4169 metaslab_unflushed_changes_memused(msp); 4170 } else { 4171 ASSERT(!spa_feature_is_enabled(spa, 
SPA_FEATURE_LOG_SPACEMAP)); 4172 4173 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 4174 SM_NO_VDEVID, tx); 4175 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 4176 SM_NO_VDEVID, tx); 4177 mutex_enter(&msp->ms_lock); 4178 } 4179 4180 msp->ms_allocated_space += range_tree_space(alloctree); 4181 ASSERT3U(msp->ms_allocated_space, >=, 4182 range_tree_space(msp->ms_freeing)); 4183 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 4184 4185 if (!range_tree_is_empty(msp->ms_checkpointing)) { 4186 ASSERT(spa_has_checkpoint(spa)); 4187 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 4188 4189 /* 4190 * Since we are doing writes to disk and the ms_checkpointing 4191 * tree won't be changing during that time, we drop the 4192 * ms_lock while writing to the checkpoint space map, for the 4193 * same reason mentioned above. 4194 */ 4195 mutex_exit(&msp->ms_lock); 4196 space_map_write(vd->vdev_checkpoint_sm, 4197 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 4198 mutex_enter(&msp->ms_lock); 4199 4200 spa->spa_checkpoint_info.sci_dspace += 4201 range_tree_space(msp->ms_checkpointing); 4202 vd->vdev_stat.vs_checkpoint_space += 4203 range_tree_space(msp->ms_checkpointing); 4204 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 4205 -space_map_allocated(vd->vdev_checkpoint_sm)); 4206 4207 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 4208 } 4209 4210 if (msp->ms_loaded) { 4211 /* 4212 * When the space map is loaded, we have an accurate 4213 * histogram in the range tree. This gives us an opportunity 4214 * to bring the space map's histogram up-to-date so we clear 4215 * it first before updating it. 4216 */ 4217 space_map_histogram_clear(msp->ms_sm); 4218 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 4219 4220 /* 4221 * Since we've cleared the histogram we need to add back 4222 * any free space that has already been processed, plus 4223 * any deferred space. 
This allows the on-disk histogram 4224 * to accurately reflect all free space even if some space 4225 * is not yet available for allocation (i.e. deferred). 4226 */ 4227 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 4228 4229 /* 4230 * Add back any deferred free space that has not been 4231 * added back into the in-core free tree yet. This will 4232 * ensure that we don't end up with a space map histogram 4233 * that is completely empty unless the metaslab is fully 4234 * allocated. 4235 */ 4236 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 4237 space_map_histogram_add(msp->ms_sm, 4238 msp->ms_defer[t], tx); 4239 } 4240 } 4241 4242 /* 4243 * Always add the free space from this sync pass to the space 4244 * map histogram. We want to make sure that the on-disk histogram 4245 * accounts for all free space. If the space map is not loaded, 4246 * then we will lose some accuracy but will correct it the next 4247 * time we load the space map. 4248 */ 4249 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 4250 metaslab_aux_histograms_update(msp); 4251 4252 metaslab_group_histogram_add(mg, msp); 4253 metaslab_group_histogram_verify(mg); 4254 metaslab_class_histogram_verify(mg->mg_class); 4255 4256 /* 4257 * For sync pass 1, we avoid traversing this txg's free range tree 4258 * and instead will just swap the pointers for freeing and freed. 4259 * We can safely do this since the freed_tree is guaranteed to be 4260 * empty on the initial pass. 4261 * 4262 * Keep in mind that even if we are currently using a log spacemap 4263 * we want current frees to end up in the ms_allocatable (but not 4264 * get appended to the ms_sm) so their ranges can be reused as usual. 
4265 */ 4266 if (spa_sync_pass(spa) == 1) { 4267 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 4268 ASSERT0(msp->ms_allocated_this_txg); 4269 } else { 4270 range_tree_vacate(msp->ms_freeing, 4271 range_tree_add, msp->ms_freed); 4272 } 4273 msp->ms_allocated_this_txg += range_tree_space(alloctree); 4274 range_tree_vacate(alloctree, NULL, NULL); 4275 4276 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 4277 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 4278 & TXG_MASK])); 4279 ASSERT0(range_tree_space(msp->ms_freeing)); 4280 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4281 4282 mutex_exit(&msp->ms_lock); 4283 4284 /* 4285 * Verify that the space map object ID has been recorded in the 4286 * vdev_ms_array. 4287 */ 4288 uint64_t object; 4289 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 4290 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); 4291 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); 4292 4293 mutex_exit(&msp->ms_sync_lock); 4294 dmu_tx_commit(tx); 4295 } 4296 4297 static void 4298 metaslab_evict(metaslab_t *msp, uint64_t txg) 4299 { 4300 if (!msp->ms_loaded || msp->ms_disabled != 0) 4301 return; 4302 4303 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 4304 VERIFY0(range_tree_space( 4305 msp->ms_allocating[(txg + t) & TXG_MASK])); 4306 } 4307 if (msp->ms_allocator != -1) 4308 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); 4309 4310 if (!metaslab_debug_unload) 4311 metaslab_unload(msp); 4312 } 4313 4314 /* 4315 * Called after a transaction group has completely synced to mark 4316 * all of the metaslab's free space as usable. 
4317 */ 4318 void 4319 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 4320 { 4321 metaslab_group_t *mg = msp->ms_group; 4322 vdev_t *vd = mg->mg_vd; 4323 spa_t *spa = vd->vdev_spa; 4324 range_tree_t **defer_tree; 4325 int64_t alloc_delta, defer_delta; 4326 boolean_t defer_allowed = B_TRUE; 4327 4328 ASSERT(!vd->vdev_ishole); 4329 4330 mutex_enter(&msp->ms_lock); 4331 4332 if (msp->ms_new) { 4333 /* this is a new metaslab, add its capacity to the vdev */ 4334 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 4335 4336 /* there should be no allocations nor frees at this point */ 4337 VERIFY0(msp->ms_allocated_this_txg); 4338 VERIFY0(range_tree_space(msp->ms_freed)); 4339 } 4340 4341 ASSERT0(range_tree_space(msp->ms_freeing)); 4342 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4343 4344 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 4345 4346 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 4347 metaslab_class_get_alloc(spa_normal_class(spa)); 4348 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || 4349 vd->vdev_rz_expanding) { 4350 defer_allowed = B_FALSE; 4351 } 4352 4353 defer_delta = 0; 4354 alloc_delta = msp->ms_allocated_this_txg - 4355 range_tree_space(msp->ms_freed); 4356 4357 if (defer_allowed) { 4358 defer_delta = range_tree_space(msp->ms_freed) - 4359 range_tree_space(*defer_tree); 4360 } else { 4361 defer_delta -= range_tree_space(*defer_tree); 4362 } 4363 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 4364 defer_delta, 0); 4365 4366 if (spa_syncing_log_sm(spa) == NULL) { 4367 /* 4368 * If there's a metaslab_load() in progress and we don't have 4369 * a log space map, it means that we probably wrote to the 4370 * metaslab's space map. If this is the case, we need to 4371 * make sure that we wait for the load to complete so that we 4372 * have a consistent view at the in-core side of the metaslab. 
4373 */ 4374 metaslab_load_wait(msp); 4375 } else { 4376 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 4377 } 4378 4379 /* 4380 * When auto-trimming is enabled, free ranges which are added to 4381 * ms_allocatable are also be added to ms_trim. The ms_trim tree is 4382 * periodically consumed by the vdev_autotrim_thread() which issues 4383 * trims for all ranges and then vacates the tree. The ms_trim tree 4384 * can be discarded at any time with the sole consequence of recent 4385 * frees not being trimmed. 4386 */ 4387 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { 4388 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); 4389 if (!defer_allowed) { 4390 range_tree_walk(msp->ms_freed, range_tree_add, 4391 msp->ms_trim); 4392 } 4393 } else { 4394 range_tree_vacate(msp->ms_trim, NULL, NULL); 4395 } 4396 4397 /* 4398 * Move the frees from the defer_tree back to the free 4399 * range tree (if it's loaded). Swap the freed_tree and 4400 * the defer_tree -- this is safe to do because we've 4401 * just emptied out the defer_tree. 4402 */ 4403 range_tree_vacate(*defer_tree, 4404 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 4405 if (defer_allowed) { 4406 range_tree_swap(&msp->ms_freed, defer_tree); 4407 } else { 4408 range_tree_vacate(msp->ms_freed, 4409 msp->ms_loaded ? range_tree_add : NULL, 4410 msp->ms_allocatable); 4411 } 4412 4413 msp->ms_synced_length = space_map_length(msp->ms_sm); 4414 4415 msp->ms_deferspace += defer_delta; 4416 ASSERT3S(msp->ms_deferspace, >=, 0); 4417 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 4418 if (msp->ms_deferspace != 0) { 4419 /* 4420 * Keep syncing this metaslab until all deferred frees 4421 * are back in circulation. 
4422 */ 4423 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 4424 } 4425 metaslab_aux_histograms_update_done(msp, defer_allowed); 4426 4427 if (msp->ms_new) { 4428 msp->ms_new = B_FALSE; 4429 mutex_enter(&mg->mg_lock); 4430 mg->mg_ms_ready++; 4431 mutex_exit(&mg->mg_lock); 4432 } 4433 4434 /* 4435 * Re-sort metaslab within its group now that we've adjusted 4436 * its allocatable space. 4437 */ 4438 metaslab_recalculate_weight_and_sort(msp); 4439 4440 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 4441 ASSERT0(range_tree_space(msp->ms_freeing)); 4442 ASSERT0(range_tree_space(msp->ms_freed)); 4443 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4444 msp->ms_allocating_total -= msp->ms_allocated_this_txg; 4445 msp->ms_allocated_this_txg = 0; 4446 mutex_exit(&msp->ms_lock); 4447 } 4448 4449 void 4450 metaslab_sync_reassess(metaslab_group_t *mg) 4451 { 4452 spa_t *spa = mg->mg_class->mc_spa; 4453 4454 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4455 metaslab_group_alloc_update(mg); 4456 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 4457 4458 /* 4459 * Preload the next potential metaslabs but only on active 4460 * metaslab groups. We can get into a state where the metaslab 4461 * is no longer active since we dirty metaslabs as we remove a 4462 * a device, thus potentially making the metaslab group eligible 4463 * for preloading. 4464 */ 4465 if (mg->mg_activation_count > 0) { 4466 metaslab_group_preload(mg); 4467 } 4468 spa_config_exit(spa, SCL_ALLOC, FTAG); 4469 } 4470 4471 /* 4472 * When writing a ditto block (i.e. more than one DVA for a given BP) on 4473 * the same vdev as an existing DVA of this BP, then try to allocate it 4474 * on a different metaslab than existing DVAs (i.e. a unique metaslab). 
4475 */ 4476 static boolean_t 4477 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 4478 { 4479 uint64_t dva_ms_id; 4480 4481 if (DVA_GET_ASIZE(dva) == 0) 4482 return (B_TRUE); 4483 4484 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 4485 return (B_TRUE); 4486 4487 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 4488 4489 return (msp->ms_id != dva_ms_id); 4490 } 4491 4492 /* 4493 * ========================================================================== 4494 * Metaslab allocation tracing facility 4495 * ========================================================================== 4496 */ 4497 4498 /* 4499 * Add an allocation trace element to the allocation tracing list. 4500 */ 4501 static void 4502 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 4503 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 4504 int allocator) 4505 { 4506 metaslab_alloc_trace_t *mat; 4507 4508 if (!metaslab_trace_enabled) 4509 return; 4510 4511 /* 4512 * When the tracing list reaches its maximum we remove 4513 * the second element in the list before adding a new one. 4514 * By removing the second element we preserve the original 4515 * entry as a clue to what allocations steps have already been 4516 * performed. 
4517 */ 4518 if (zal->zal_size == metaslab_trace_max_entries) { 4519 metaslab_alloc_trace_t *mat_next; 4520 #ifdef ZFS_DEBUG 4521 panic("too many entries in allocation list"); 4522 #endif 4523 METASLABSTAT_BUMP(metaslabstat_trace_over_limit); 4524 zal->zal_size--; 4525 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 4526 list_remove(&zal->zal_list, mat_next); 4527 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 4528 } 4529 4530 mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 4531 list_link_init(&mat->mat_list_node); 4532 mat->mat_mg = mg; 4533 mat->mat_msp = msp; 4534 mat->mat_size = psize; 4535 mat->mat_dva_id = dva_id; 4536 mat->mat_offset = offset; 4537 mat->mat_weight = 0; 4538 mat->mat_allocator = allocator; 4539 4540 if (msp != NULL) 4541 mat->mat_weight = msp->ms_weight; 4542 4543 /* 4544 * The list is part of the zio so locking is not required. Only 4545 * a single thread will perform allocations for a given zio. 4546 */ 4547 list_insert_tail(&zal->zal_list, mat); 4548 zal->zal_size++; 4549 4550 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 4551 } 4552 4553 void 4554 metaslab_trace_init(zio_alloc_list_t *zal) 4555 { 4556 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 4557 offsetof(metaslab_alloc_trace_t, mat_list_node)); 4558 zal->zal_size = 0; 4559 } 4560 4561 void 4562 metaslab_trace_fini(zio_alloc_list_t *zal) 4563 { 4564 metaslab_alloc_trace_t *mat; 4565 4566 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 4567 kmem_cache_free(metaslab_alloc_trace_cache, mat); 4568 list_destroy(&zal->zal_list); 4569 zal->zal_size = 0; 4570 } 4571 4572 /* 4573 * ========================================================================== 4574 * Metaslab block operations 4575 * ========================================================================== 4576 */ 4577 4578 static void 4579 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, 4580 int flags, int allocator) 4581 { 4582 
if (!(flags & METASLAB_ASYNC_ALLOC) || 4583 (flags & METASLAB_DONT_THROTTLE)) 4584 return; 4585 4586 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4587 if (!mg->mg_class->mc_alloc_throttle_enabled) 4588 return; 4589 4590 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4591 (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); 4592 } 4593 4594 static void 4595 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 4596 { 4597 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4598 metaslab_class_allocator_t *mca = 4599 &mg->mg_class->mc_allocator[allocator]; 4600 uint64_t max = mg->mg_max_alloc_queue_depth; 4601 uint64_t cur = mga->mga_cur_max_alloc_queue_depth; 4602 while (cur < max) { 4603 if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, 4604 cur, cur + 1) == cur) { 4605 atomic_inc_64(&mca->mca_alloc_max_slots); 4606 return; 4607 } 4608 cur = mga->mga_cur_max_alloc_queue_depth; 4609 } 4610 } 4611 4612 void 4613 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, 4614 int flags, int allocator, boolean_t io_complete) 4615 { 4616 if (!(flags & METASLAB_ASYNC_ALLOC) || 4617 (flags & METASLAB_DONT_THROTTLE)) 4618 return; 4619 4620 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4621 if (!mg->mg_class->mc_alloc_throttle_enabled) 4622 return; 4623 4624 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; 4625 (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); 4626 if (io_complete) 4627 metaslab_group_increment_qdepth(mg, allocator); 4628 } 4629 4630 void 4631 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag, 4632 int allocator) 4633 { 4634 #ifdef ZFS_DEBUG 4635 const dva_t *dva = bp->blk_dva; 4636 int ndvas = BP_GET_NDVAS(bp); 4637 4638 for (int d = 0; d < ndvas; d++) { 4639 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 4640 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4641 metaslab_group_allocator_t *mga = 
&mg->mg_allocator[allocator]; 4642 VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); 4643 } 4644 #endif 4645 } 4646 4647 static uint64_t 4648 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 4649 { 4650 uint64_t start; 4651 range_tree_t *rt = msp->ms_allocatable; 4652 metaslab_class_t *mc = msp->ms_group->mg_class; 4653 4654 ASSERT(MUTEX_HELD(&msp->ms_lock)); 4655 VERIFY(!msp->ms_condensing); 4656 VERIFY0(msp->ms_disabled); 4657 VERIFY0(msp->ms_new); 4658 4659 start = mc->mc_ops->msop_alloc(msp, size); 4660 if (start != -1ULL) { 4661 metaslab_group_t *mg = msp->ms_group; 4662 vdev_t *vd = mg->mg_vd; 4663 4664 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 4665 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4666 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 4667 range_tree_remove(rt, start, size); 4668 range_tree_clear(msp->ms_trim, start, size); 4669 4670 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4671 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 4672 4673 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 4674 msp->ms_allocating_total += size; 4675 4676 /* Track the last successful allocation */ 4677 msp->ms_alloc_txg = txg; 4678 metaslab_verify_space(msp, txg); 4679 } 4680 4681 /* 4682 * Now that we've attempted the allocation we need to update the 4683 * metaslab's maximum block size since it may have changed. 4684 */ 4685 msp->ms_max_size = metaslab_largest_allocatable(msp); 4686 return (start); 4687 } 4688 4689 /* 4690 * Find the metaslab with the highest weight that is less than what we've 4691 * already tried. In the common case, this means that we will examine each 4692 * metaslab at most once. Note that concurrent callers could reorder metaslabs 4693 * by activation/passivation once we have dropped the mg_lock. 
If a metaslab is
 * activated by another thread, and we fail to allocate from the metaslab we
 * have selected, we may not try the newly-activated metaslab, and instead
 * activate another metaslab.  This is not optimal, but generally does not cause
 * any problems (a possible exception being if every metaslab is completely full
 * except for the newly-activated metaslab which we fail to examine).
 */
/*
 * The walk starts at the position described by `search' in the group's
 * metaslab tree.  Returns the chosen metaslab, or NULL if none qualifies (or,
 * when not trying hard, if more than zfs_metaslab_find_max_tries candidates
 * were examined).  When a metaslab is returned, `search' is updated to sort
 * just after it so that the next call resumes from there, and *was_active
 * reports whether the metaslab already had an allocator assigned.
 */
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
    dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
    boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
    boolean_t *was_active)
{
	avl_index_t idx;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	metaslab_t *msp = avl_find(t, search, &idx);
	if (msp == NULL)
		msp = avl_nearest(t, idx, AVL_AFTER);

	uint_t tries = 0;
	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
		int i;

		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
			return (NULL);
		}
		tries++;

		if (!metaslab_should_allocate(msp, asize, try_hard)) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_TOO_SMALL, allocator);
			continue;
		}

		/*
		 * If the selected metaslab is condensing or disabled, or
		 * hasn't gone through a metaslab_sync_done(), then skip it.
		 */
		if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
			continue;

		*was_active = msp->ms_allocator != -1;
		/*
		 * If we're activating as primary, this is our first allocation
		 * from this disk, so we don't need to check how close we are.
		 * If the metaslab under consideration was already active,
		 * we're getting desperate enough to steal another allocator's
		 * metaslab, so we still don't care about distances.
		 */
		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
			break;

		/*
		 * Otherwise accept this metaslab only if the uniqueness check
		 * passes against every DVA already allocated for this BP
		 * (the check is skipped entirely when !want_unique).
		 */
		for (i = 0; i < d; i++) {
			if (want_unique &&
			    !metaslab_is_unique(msp, &dva[i]))
				break; /* try another metaslab */
		}
		if (i == d)
			break;
	}

	if (msp != NULL) {
		/* Advance the search key to just past the chosen metaslab. */
		search->ms_weight = msp->ms_weight;
		search->ms_start = msp->ms_start + 1;
		search->ms_allocator = msp->ms_allocator;
		search->ms_primary = msp->ms_primary;
	}
	return (msp);
}

/*
 * Consistency check of a metaslab's activation state; the caller must hold
 * ms_lock.  Only runs when ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags and
 * the metaslab has an activation bit set.  Verifies that exactly one of the
 * PRIMARY / SECONDARY / CLAIM bits is set and that ms_allocator and
 * ms_primary agree with it (CLAIM metaslabs have no allocator).
 */
static void
metaslab_active_mask_verify(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
		return;

	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
		VERIFY3S(msp->ms_allocator, !=, -1);
		VERIFY(msp->ms_primary);
		return;
	}

	if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
		VERIFY3S(msp->ms_allocator, !=, -1);
		VERIFY(!msp->ms_primary);
		return;
	}

	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
		VERIFY3S(msp->ms_allocator, ==, -1);
		return;
	}
}

/*
 * Try to allocate `asize' bytes for DVA index `d' from some metaslab of
 * group `mg' in transaction group `txg'.  Returns the allocated offset, or
 * -1ULL if no metaslab in the group could satisfy the request.
 *
 * The activation weight is derived from the DVAs already allocated for this
 * BP: PRIMARY if none of dva[0..d) lives on this vdev, SECONDARY if one
 * does, and CLAIM if two or more do.
 *
 * Locking: mg_lock is held only while a candidate metaslab is selected; the
 * candidate's ms_lock is then taken and held for the allocation attempt.
 * On the success path the loop is left with ms_lock still held and it is
 * dropped after the loop.
 */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
    int allocator, boolean_t try_hard)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;

	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (int i = 0; i < d; i++) {
		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_CLAIM;
			break;
		}
	}

	/*
	 * If we don't have enough metaslabs active to fill the entire array, we
	 * just use the 0th slot.
	 */
	if (mg->mg_ms_ready < mg->mg_allocators * 3)
		allocator = 0;
	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];

	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);

	/*
	 * `search' is a scratch metaslab used only as a lookup key into the
	 * group's metaslab tree; it is freed on every exit path below.
	 */
	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
	search->ms_weight = UINT64_MAX;
	search->ms_start = 0;
	/*
	 * At the end of the metaslab tree are the already-active metaslabs,
	 * first the primaries, then the secondaries. When we resume searching
	 * through the tree, we need to consider ms_allocator and ms_primary so
	 * we start in the location right after where we left off, and don't
	 * accidentally loop forever considering the same metaslabs.
	 */
	search->ms_allocator = -1;
	search->ms_primary = B_TRUE;
	for (;;) {
		boolean_t was_active = B_FALSE;

		mutex_enter(&mg->mg_lock);

		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
		    mga->mga_primary != NULL) {
			msp = mga->mga_primary;

			/*
			 * Even though we don't hold the ms_lock for the
			 * primary metaslab, those fields should not
			 * change while we hold the mg_lock. Thus it is
			 * safe to make assertions on them.
			 */
			ASSERT(msp->ms_primary);
			ASSERT3S(msp->ms_allocator, ==, allocator);
			ASSERT(msp->ms_loaded);

			was_active = B_TRUE;
			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
		    mga->mga_secondary != NULL) {
			msp = mga->mga_secondary;

			/*
			 * See comment above about the similar assertions
			 * for the primary metaslab.
			 */
			ASSERT(!msp->ms_primary);
			ASSERT3S(msp->ms_allocator, ==, allocator);
			ASSERT(msp->ms_loaded);

			was_active = B_TRUE;
			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
		} else {
			msp = find_valid_metaslab(mg, activation_weight, dva, d,
			    want_unique, asize, allocator, try_hard, zal,
			    search, &was_active);
		}

		mutex_exit(&mg->mg_lock);
		if (msp == NULL) {
			kmem_free(search, sizeof (*search));
			return (-1ULL);
		}
		mutex_enter(&msp->ms_lock);

		metaslab_active_mask_verify(msp);

		/*
		 * This code is disabled because of issues with
		 * tracepoints in non-gpl kernel modules.
		 */
#if 0
		DTRACE_PROBE3(ms__activation__attempt,
		    metaslab_t *, msp, uint64_t, activation_weight,
		    boolean_t, was_active);
#endif

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock. We check the
		 * active status first to see if we need to select
		 * a new metaslab.
		 */
		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
			ASSERT3S(msp->ms_allocator, ==, -1);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If the metaslab was activated for another allocator
		 * while we were waiting in the ms_lock above, or it's
		 * a primary and we're seeking a secondary (or vice versa),
		 * we go back and select a new metaslab.
		 */
		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    (msp->ms_allocator != -1) &&
		    (msp->ms_allocator != allocator || ((activation_weight ==
		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
			ASSERT(msp->ms_loaded);
			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
			    msp->ms_allocator != -1);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * This metaslab was used for claiming regions allocated
		 * by the ZIL during pool import. Once these regions are
		 * claimed we don't need to keep the CLAIM bit set
		 * anymore. Passivate this metaslab to zero its activation
		 * mask.
		 */
		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
		    activation_weight != METASLAB_WEIGHT_CLAIM) {
			ASSERT(msp->ms_loaded);
			ASSERT3S(msp->ms_allocator, ==, -1);
			metaslab_passivate(msp, msp->ms_weight &
			    ~METASLAB_WEIGHT_CLAIM);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		metaslab_set_selected_txg(msp, txg);

		int activation_error =
		    metaslab_activate(msp, allocator, activation_weight);
		metaslab_active_mask_verify(msp);

		/*
		 * If the metaslab was activated by another thread for
		 * another allocator or activation_weight (EBUSY), or it
		 * failed because another metaslab was assigned as primary
		 * for this allocator (EEXIST) we continue using this
		 * metaslab for our allocation, rather than going on to a
		 * worse metaslab (we waited for that metaslab to be loaded
		 * after all).
		 *
		 * If the activation failed due to an I/O error or ENOSPC we
		 * skip to the next metaslab.
		 */
		boolean_t activated;
		if (activation_error == 0) {
			activated = B_TRUE;
		} else if (activation_error == EBUSY ||
		    activation_error == EEXIST) {
			activated = B_FALSE;
		} else {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		ASSERT(msp->ms_loaded);

		/*
		 * Now that we have the lock, recheck to see if we should
		 * continue to use this metaslab for this allocation. The
		 * metaslab is now loaded so metaslab_should_allocate()
		 * can accurately determine if the allocation attempt should
		 * proceed.
		 */
		if (!metaslab_should_allocate(msp, asize, try_hard)) {
			/* Passivate this metaslab and select a new one. */
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_TOO_SMALL, allocator);
			goto next;
		}

		/*
		 * If this metaslab is currently condensing then pick again
		 * as we can't manipulate this metaslab until it's committed
		 * to disk. If this metaslab is being initialized, we shouldn't
		 * allocate from it since the allocated region might be
		 * overwritten after allocation.
		 */
		if (msp->ms_condensing) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_CONDENSING, allocator);
			if (activated) {
				metaslab_passivate(msp, msp->ms_weight &
				    ~METASLAB_ACTIVE_MASK);
			}
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (msp->ms_disabled > 0) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_DISABLED, allocator);
			if (activated) {
				metaslab_passivate(msp, msp->ms_weight &
				    ~METASLAB_ACTIVE_MASK);
			}
			mutex_exit(&msp->ms_lock);
			continue;
		}

		offset = metaslab_block_alloc(msp, asize, txg);
		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);

		if (offset != -1ULL) {
			/* Proactively passivate the metaslab, if needed */
			if (activated)
				metaslab_segment_may_passivate(msp);
			/* Success: leave the loop with ms_lock still held. */
			break;
		}
next:
		ASSERT(msp->ms_loaded);

		/*
		 * This code is disabled because of issues with
		 * tracepoints in non-gpl kernel modules.
		 */
#if 0
		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
		    uint64_t, asize);
#endif

		/*
		 * We were unable to allocate from this metaslab so determine
		 * a new weight for this metaslab. Now that we have loaded
		 * the metaslab we can provide a better hint to the metaslab
		 * selector.
		 *
		 * For space-based metaslabs, we use the maximum block size.
		 * This information is only available when the metaslab
		 * is loaded and is more accurate than the generic free
		 * space weight that was calculated by metaslab_weight().
		 * This information allows us to quickly compare the maximum
		 * available allocation in the metaslab to the allocation
		 * size being requested.
		 *
		 * For segment-based metaslabs, determine the new weight
		 * based on the highest bucket in the range tree. We
		 * explicitly use the loaded segment weight (i.e. the range
		 * tree histogram) since it contains the space that is
		 * currently available for allocation and is accurate
		 * even within a sync pass.
		 */
		uint64_t weight;
		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
			weight = metaslab_largest_allocatable(msp);
			WEIGHT_SET_SPACEBASED(weight);
		} else {
			weight = metaslab_weight_from_range_tree(msp);
		}

		if (activated) {
			metaslab_passivate(msp, weight);
		} else {
			/*
			 * For the case where we use the metaslab that is
			 * active for another allocator we want to make
			 * sure that we retain the activation mask.
			 *
			 * Note that we could attempt to use something like
			 * metaslab_recalculate_weight_and_sort() that
			 * retains the activation mask here. That function
			 * uses metaslab_weight() to set the weight though
			 * which is not as accurate as the calculations
			 * above.
			 */
			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
			metaslab_group_sort(mg, msp, weight);
		}
		metaslab_active_mask_verify(msp);

		/*
		 * We have just failed an allocation attempt, check
		 * that metaslab_should_allocate() agrees. Otherwise,
		 * we may end up in an infinite loop retrying the same
		 * metaslab.
		 */
		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));

		mutex_exit(&msp->ms_lock);
	}
	/* Reached only via the success `break' above, with ms_lock held. */
	mutex_exit(&msp->ms_lock);
	kmem_free(search, sizeof (*search));
	return (offset);
}

/*
 * Allocate `asize' bytes from metaslab group `mg' and update the group's
 * allocation statistics under mg_lock: mg_allocations is always bumped and
 * mg_failed_allocations is bumped on failure.  Returns the allocated offset
 * or -1ULL on failure.
 */
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
    int allocator, boolean_t try_hard)
{
	uint64_t offset;

	offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
	    dva, d, allocator, try_hard);

	mutex_enter(&mg->mg_lock);
	if (offset == -1ULL) {
		mg->mg_failed_allocations++;
		metaslab_trace_add(zal, mg, NULL, asize, d,
		    TRACE_GROUP_FAILURE, allocator);
		if (asize == SPA_GANGBLOCKSIZE) {
			/*
			 * This metaslab group was unable to allocate
			 * the minimum gang block size so it must be out of
			 * space. We must notify the allocation throttle
			 * to start skipping allocation attempts to this
			 * metaslab group until more space becomes available.
			 * Note: this failure cannot be caused by the
			 * allocation throttle since the allocation throttle
			 * is only responsible for skipping devices and
			 * not failing block allocations.
			 */
			mg->mg_no_free_space = B_TRUE;
		}
	}
	mg->mg_allocations++;
	mutex_exit(&mg->mg_lock);
	return (offset);
}

/*
 * Allocate a block for the specified i/o.
 *
 * Fills in dva[d] with a `psize'-byte allocation (rounded up to the chosen
 * vdev's allocation size) from metaslab class `mc' in transaction group
 * `txg'.  Returns 0 on success.  Returns ENOSPC, with dva[d] zeroed, if no
 * metaslab group in the class could satisfy the request; ENOSPC is also
 * returned when the metaslab_force_ganging test knob fires.
 */
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
    zio_alloc_list_t *zal, int allocator)
{
	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	boolean_t try_hard = B_FALSE;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 * This will result in more split blocks when using device removal,
	 * and a large number of split blocks coupled with ztest-induced
	 * damage can result in extremely long reconstruction times.  This
	 * will also test spilling from special to normal.
	 */
	if (psize >= metaslab_force_ganging &&
	    metaslab_force_ganging_pct > 0 &&
	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
		    allocator);
		return (SET_ERROR(ENOSPC));
	}

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mca_rotor or mca_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists or its mg has been closed (e.g. by
		 * device removal).  Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL && vd->vdev_mg != NULL) {
			mg = vdev_get_mg(vd, mc);

			if (flags & METASLAB_HINTBP_AVOID)
				mg = mg->mg_next;
		} else {
			mg = mca->mca_rotor;
		}
	} else if (d != 0) {
		/* Continue on the group after the previous DVA's vdev. */
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		ASSERT(mca->mca_rotor != NULL);
		mg = mca->mca_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mca->mca_rotor;

	/* Remember where we started so the loop stops after one full lap. */
	rotor = mg;
top:
	do {
		boolean_t allocatable;

		ASSERT(mg->mg_activation_count == 1);
		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (try_hard) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}

		/*
		 * Determine if the selected metaslab group is eligible
		 * for allocations. If we're ganging then don't allow
		 * this metaslab group to skip allocations since that would
		 * inadvertently return ENOSPC and suspend the pool
		 * even though space is still available.
		 */
		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
			allocatable = metaslab_group_allocatable(mg, rotor,
			    flags, psize, allocator, d);
		}

		if (!allocatable) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_NOT_ALLOCATABLE, allocator);
			goto next;
		}

		/*
		 * Avoid writing single-copy data to an unhealthy,
		 * non-redundant vdev, unless we've already tried all
		 * other vdevs.
		 */
		if (vd->vdev_state < VDEV_STATE_HEALTHY &&
		    d == 0 && !try_hard && vd->vdev_children == 0) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_VDEV_ERROR, allocator);
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		/*
		 * If we don't need to try hard, then require that the
		 * block be on a different metaslab from any other DVAs
		 * in this BP (unique=true).  If we are trying hard, then
		 * allow any metaslab to be used (unique=false).
		 */
		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
		    !try_hard, dva, d, allocator, try_hard);

		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 *
			 * Bias is also used to compensate for unequally
			 * sized vdevs so that space is allocated fairly.
			 */
			if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vs_free = vs->vs_space - vs->vs_alloc;
				int64_t mc_free = mc->mc_space - mc->mc_alloc;
				int64_t ratio;

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 *
				 * This basically introduces a zero-centered
				 * bias towards the devices with the most
				 * free space, while compensating for vdev
				 * size differences.
				 *
				 * Examples:
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 16M/128M
				 *  ratio(V1) = 100% ratio(V2) = 100%
				 *
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 64M/128M
				 *  ratio(V1) = 127% ratio(V2) =  72%
				 *
				 *  vdev V1 = 16M/128M
				 *  vdev V2 = 64M/512M
				 *  ratio(V1) =  40% ratio(V2) = 160%
				 */
				ratio = (vs_free * mc->mc_alloc_groups * 100) /
				    (mc_free + 1);
				mg->mg_bias = ((ratio - 100) *
				    (int64_t)mg->mg_aliquot) / 100;
			} else if (!metaslab_bias_enabled) {
				mg->mg_bias = 0;
			}

			/*
			 * Advance the rotor once this group has received its
			 * (biased) aliquot, or immediately for ZIL blocks.
			 */
			if ((flags & METASLAB_ZIL) ||
			    atomic_add_64_nv(&mca->mca_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mca->mca_rotor = mg->mg_next;
				mca->mca_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d],
			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mca->mca_rotor = mg->mg_next;
		mca->mca_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/*
	 * If we haven't tried hard, perhaps do so now.
	 */
	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
	    psize <= 1 << spa->spa_min_ashift)) {
		METASLABSTAT_BUMP(metaslabstat_try_hard);
		try_hard = B_TRUE;
		goto top;
	}

	memset(&dva[d], 0, sizeof (dva_t));

	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
	return (SET_ERROR(ENOSPC));
}

/*
 * Free the range [offset, offset + asize) on concrete vdev `vd' by adding it
 * to the owning metaslab's ms_checkpointing tree (if `checkpoint' is set) or
 * its ms_freeing tree otherwise.  If both trees were empty beforehand, the
 * metaslab is dirtied for the currently syncing txg.  The range must lie
 * entirely within one metaslab and be aligned to the vdev's ashift.
 */
void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
    boolean_t checkpoint)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));

	metaslab_check_free_impl(vd, offset, asize);

	mutex_enter(&msp->ms_lock);
	if (range_tree_is_empty(msp->ms_freeing) &&
	    range_tree_is_empty(msp->ms_checkpointing)) {
		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
	}

	if (checkpoint) {
		ASSERT(spa_has_checkpoint(spa));
		range_tree_add(msp->ms_checkpointing, offset, asize);
	} else {
		range_tree_add(msp->ms_freeing, offset, asize);
	}
	mutex_exit(&msp->ms_lock);
}

/*
 * Remap callback used by metaslab_free_impl() when freeing from an indirect
 * vdev.  `arg' points to the boolean_t checkpoint flag.  If the vdev the
 * range maps to is itself remappable, the range is only marked obsolete
 * there; otherwise it is freed through metaslab_free_impl().
 */
void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	(void) inner_offset;
	boolean_t *checkpoint = arg;

	ASSERT3P(checkpoint, !=, NULL);

	if (vd->vdev_ops->vdev_op_remap != NULL)
		vdev_indirect_mark_obsolete(vd, offset, size);
	else
		metaslab_free_impl(vd, offset, size, *checkpoint);
}

/*
 * Free [offset, offset + size) on top-level vdev `vd', dispatching on the
 * kind of vdev: a concrete vdev that is currently being removed, a vdev
 * with a remap operation (marked obsolete, then each mapped segment is
 * handed to metaslab_free_impl_cb()), or an ordinary concrete vdev.
 * Nothing is done when the syncing txg is beyond the pool's freeze txg.
 */
static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
    boolean_t checkpoint)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
		return;

	if (spa->spa_vdev_removal != NULL &&
	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
	    vdev_is_concrete(vd)) {
		/*
		 * Note: we check if the vdev is concrete because when
		 * we complete the removal, we first change the vdev to be
		 * an indirect vdev (in open context), and then (in syncing
		 * context) clear spa_vdev_removal.
		 */
		free_from_removing_vdev(vd, offset, size);
	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
		vdev_indirect_mark_obsolete(vd, offset, size);
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_free_impl_cb, &checkpoint);
	} else {
		metaslab_free_concrete(vd, offset, size, checkpoint);
	}
}

/*
 * State threaded through remap_blkptr_cb() by spa_remap_blkptr().
 */
typedef struct remap_blkptr_cb_arg {
	blkptr_t *rbca_bp;		/* block pointer being remapped */
	spa_remap_cb_t rbca_cb;		/* caller's callback; may be NULL */
	vdev_t *rbca_remap_vd;		/* vdev of the previous mapping level */
	uint64_t rbca_remap_offset;	/* offset on rbca_remap_vd */
	void *rbca_cb_arg;		/* opaque argument for rbca_cb */
} remap_blkptr_cb_arg_t;

/*
 * Remap callback used by spa_remap_blkptr().  Invoked with the (vd, offset,
 * size) that DVA[0] of the block pointer maps to.  Split blocks (where the
 * mapped size differs from the DVA's asize) are left untouched.  Otherwise
 * the caller's callback, if any, is invoked for the previous location, the
 * BP's physical birth is updated, and DVA[0] is rewritten to (vd, offset).
 */
static void
remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	remap_blkptr_cb_arg_t *rbca = arg;
	blkptr_t *bp = rbca->rbca_bp;

	/* We can not remap split blocks. */
	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
		return;
	ASSERT0(inner_offset);

	if (rbca->rbca_cb != NULL) {
		/*
		 * At this point we know that we are not handling split
		 * blocks and we invoke the callback on the previous
		 * vdev which must be indirect.
		 */
		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);

		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);

		/* set up remap_blkptr_cb_arg for the next call */
		rbca->rbca_remap_vd = vd;
		rbca->rbca_remap_offset = offset;
	}

	/*
	 * The phys birth time is that of dva[0].  This ensures that we know
	 * when each dva was written, so that resilver can determine which
	 * blocks need to be scrubbed (i.e. those written during the time
	 * the vdev was offline).  It also ensures that the key used in
	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
	 * we didn't change the phys_birth, a lookup in the ARC for a
	 * remapped BP could find the data that was previously stored at
	 * this vdev + offset.
	 */
	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	    DVA_GET_VDEV(&bp->blk_dva[0]));
	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
}

/*
 * If the block pointer contains any indirect DVAs, modify them to refer to
 * concrete DVAs.  Note that this will sometimes not be possible, leaving
 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
 * segments in the mapping (i.e. it is a "split block").
 *
 * If the BP was remapped, calls the callback on the original dva (note the
 * callback can be called multiple times if the original indirect DVA refers
 * to another indirect DVA, etc).
 *
 * Returns TRUE if the BP was remapped.
5520 */ 5521 boolean_t 5522 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 5523 { 5524 remap_blkptr_cb_arg_t rbca; 5525 5526 if (!zfs_remap_blkptr_enable) 5527 return (B_FALSE); 5528 5529 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 5530 return (B_FALSE); 5531 5532 /* 5533 * Dedup BP's can not be remapped, because ddt_phys_select() depends 5534 * on DVA[0] being the same in the BP as in the DDT (dedup table). 5535 */ 5536 if (BP_GET_DEDUP(bp)) 5537 return (B_FALSE); 5538 5539 /* 5540 * Gang blocks can not be remapped, because 5541 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 5542 * the BP used to read the gang block header (GBH) being the same 5543 * as the DVA[0] that we allocated for the GBH. 5544 */ 5545 if (BP_IS_GANG(bp)) 5546 return (B_FALSE); 5547 5548 /* 5549 * Embedded BP's have no DVA to remap. 5550 */ 5551 if (BP_GET_NDVAS(bp) < 1) 5552 return (B_FALSE); 5553 5554 /* 5555 * Note: we only remap dva[0]. If we remapped other dvas, we 5556 * would no longer know what their phys birth txg is. 5557 */ 5558 dva_t *dva = &bp->blk_dva[0]; 5559 5560 uint64_t offset = DVA_GET_OFFSET(dva); 5561 uint64_t size = DVA_GET_ASIZE(dva); 5562 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 5563 5564 if (vd->vdev_ops->vdev_op_remap == NULL) 5565 return (B_FALSE); 5566 5567 rbca.rbca_bp = bp; 5568 rbca.rbca_cb = callback; 5569 rbca.rbca_remap_vd = vd; 5570 rbca.rbca_remap_offset = offset; 5571 rbca.rbca_cb_arg = arg; 5572 5573 /* 5574 * remap_blkptr_cb() will be called in order for each level of 5575 * indirection, until a concrete vdev is reached or a split block is 5576 * encountered. old_vd and old_offset are updated within the callback 5577 * as we go from the one indirect vdev to the next one (either concrete 5578 * or indirect again) in that order. 
5579 */ 5580 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 5581 5582 /* Check if the DVA wasn't remapped because it is a split block */ 5583 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 5584 return (B_FALSE); 5585 5586 return (B_TRUE); 5587 } 5588 5589 /* 5590 * Undo the allocation of a DVA which happened in the given transaction group. 5591 */ 5592 void 5593 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 5594 { 5595 metaslab_t *msp; 5596 vdev_t *vd; 5597 uint64_t vdev = DVA_GET_VDEV(dva); 5598 uint64_t offset = DVA_GET_OFFSET(dva); 5599 uint64_t size = DVA_GET_ASIZE(dva); 5600 5601 ASSERT(DVA_IS_VALID(dva)); 5602 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5603 5604 if (txg > spa_freeze_txg(spa)) 5605 return; 5606 5607 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || 5608 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 5609 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", 5610 (u_longlong_t)vdev, (u_longlong_t)offset, 5611 (u_longlong_t)size); 5612 return; 5613 } 5614 5615 ASSERT(!vd->vdev_removing); 5616 ASSERT(vdev_is_concrete(vd)); 5617 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 5618 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 5619 5620 if (DVA_GET_GANG(dva)) 5621 size = vdev_gang_header_asize(vd); 5622 5623 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5624 5625 mutex_enter(&msp->ms_lock); 5626 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 5627 offset, size); 5628 msp->ms_allocating_total -= size; 5629 5630 VERIFY(!msp->ms_condensing); 5631 VERIFY3U(offset, >=, msp->ms_start); 5632 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 5633 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 5634 msp->ms_size); 5635 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5636 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 5637 range_tree_add(msp->ms_allocatable, offset, size); 5638 mutex_exit(&msp->ms_lock); 5639 
} 5640 5641 /* 5642 * Free the block represented by the given DVA. 5643 */ 5644 void 5645 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 5646 { 5647 uint64_t vdev = DVA_GET_VDEV(dva); 5648 uint64_t offset = DVA_GET_OFFSET(dva); 5649 uint64_t size = DVA_GET_ASIZE(dva); 5650 vdev_t *vd = vdev_lookup_top(spa, vdev); 5651 5652 ASSERT(DVA_IS_VALID(dva)); 5653 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5654 5655 if (DVA_GET_GANG(dva)) { 5656 size = vdev_gang_header_asize(vd); 5657 } 5658 5659 metaslab_free_impl(vd, offset, size, checkpoint); 5660 } 5661 5662 /* 5663 * Reserve some allocation slots. The reservation system must be called 5664 * before we call into the allocator. If there aren't any available slots 5665 * then the I/O will be throttled until an I/O completes and its slots are 5666 * freed up. The function returns true if it was successful in placing 5667 * the reservation. 5668 */ 5669 boolean_t 5670 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 5671 zio_t *zio, int flags) 5672 { 5673 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; 5674 uint64_t max = mca->mca_alloc_max_slots; 5675 5676 ASSERT(mc->mc_alloc_throttle_enabled); 5677 if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) || 5678 zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { 5679 /* 5680 * The potential race between _count() and _add() is covered 5681 * by the allocator lock in most cases, or irrelevant due to 5682 * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others. 5683 * But even if we assume some other non-existing scenario, the 5684 * worst that can happen is few more I/Os get to allocation 5685 * earlier, that is not a problem. 5686 * 5687 * We reserve the slots individually so that we can unreserve 5688 * them individually when an I/O completes. 
5689 */ 5690 zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); 5691 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 5692 return (B_TRUE); 5693 } 5694 return (B_FALSE); 5695 } 5696 5697 void 5698 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 5699 int allocator, zio_t *zio) 5700 { 5701 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; 5702 5703 ASSERT(mc->mc_alloc_throttle_enabled); 5704 zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); 5705 } 5706 5707 static int 5708 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 5709 uint64_t txg) 5710 { 5711 metaslab_t *msp; 5712 spa_t *spa = vd->vdev_spa; 5713 int error = 0; 5714 5715 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 5716 return (SET_ERROR(ENXIO)); 5717 5718 ASSERT3P(vd->vdev_ms, !=, NULL); 5719 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5720 5721 mutex_enter(&msp->ms_lock); 5722 5723 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { 5724 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 5725 if (error == EBUSY) { 5726 ASSERT(msp->ms_loaded); 5727 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 5728 error = 0; 5729 } 5730 } 5731 5732 if (error == 0 && 5733 !range_tree_contains(msp->ms_allocatable, offset, size)) 5734 error = SET_ERROR(ENOENT); 5735 5736 if (error || txg == 0) { /* txg == 0 indicates dry run */ 5737 mutex_exit(&msp->ms_lock); 5738 return (error); 5739 } 5740 5741 VERIFY(!msp->ms_condensing); 5742 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5743 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 5744 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 5745 msp->ms_size); 5746 range_tree_remove(msp->ms_allocatable, offset, size); 5747 range_tree_clear(msp->ms_trim, offset, size); 5748 5749 if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ 5750 metaslab_class_t *mc = msp->ms_group->mg_class; 5751 multilist_sublist_t *mls = 5752 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); 
5753 if (!multilist_link_active(&msp->ms_class_txg_node)) { 5754 msp->ms_selected_txg = txg; 5755 multilist_sublist_insert_head(mls, msp); 5756 } 5757 multilist_sublist_unlock(mls); 5758 5759 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 5760 vdev_dirty(vd, VDD_METASLAB, msp, txg); 5761 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 5762 offset, size); 5763 msp->ms_allocating_total += size; 5764 } 5765 5766 mutex_exit(&msp->ms_lock); 5767 5768 return (0); 5769 } 5770 5771 typedef struct metaslab_claim_cb_arg_t { 5772 uint64_t mcca_txg; 5773 int mcca_error; 5774 } metaslab_claim_cb_arg_t; 5775 5776 static void 5777 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5778 uint64_t size, void *arg) 5779 { 5780 (void) inner_offset; 5781 metaslab_claim_cb_arg_t *mcca_arg = arg; 5782 5783 if (mcca_arg->mcca_error == 0) { 5784 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 5785 size, mcca_arg->mcca_txg); 5786 } 5787 } 5788 5789 int 5790 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 5791 { 5792 if (vd->vdev_ops->vdev_op_remap != NULL) { 5793 metaslab_claim_cb_arg_t arg; 5794 5795 /* 5796 * Only zdb(8) can claim on indirect vdevs. This is used 5797 * to detect leaks of mapped space (that are not accounted 5798 * for in the obsolete counts, spacemap, or bpobj). 
5799 */ 5800 ASSERT(!spa_writeable(vd->vdev_spa)); 5801 arg.mcca_error = 0; 5802 arg.mcca_txg = txg; 5803 5804 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5805 metaslab_claim_impl_cb, &arg); 5806 5807 if (arg.mcca_error == 0) { 5808 arg.mcca_error = metaslab_claim_concrete(vd, 5809 offset, size, txg); 5810 } 5811 return (arg.mcca_error); 5812 } else { 5813 return (metaslab_claim_concrete(vd, offset, size, txg)); 5814 } 5815 } 5816 5817 /* 5818 * Intent log support: upon opening the pool after a crash, notify the SPA 5819 * of blocks that the intent log has allocated for immediate write, but 5820 * which are still considered free by the SPA because the last transaction 5821 * group didn't commit yet. 5822 */ 5823 static int 5824 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 5825 { 5826 uint64_t vdev = DVA_GET_VDEV(dva); 5827 uint64_t offset = DVA_GET_OFFSET(dva); 5828 uint64_t size = DVA_GET_ASIZE(dva); 5829 vdev_t *vd; 5830 5831 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 5832 return (SET_ERROR(ENXIO)); 5833 } 5834 5835 ASSERT(DVA_IS_VALID(dva)); 5836 5837 if (DVA_GET_GANG(dva)) 5838 size = vdev_gang_header_asize(vd); 5839 5840 return (metaslab_claim_impl(vd, offset, size, txg)); 5841 } 5842 5843 int 5844 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 5845 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 5846 zio_alloc_list_t *zal, zio_t *zio, int allocator) 5847 { 5848 dva_t *dva = bp->blk_dva; 5849 dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; 5850 int error = 0; 5851 5852 ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); 5853 ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); 5854 5855 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 5856 5857 if (mc->mc_allocator[allocator].mca_rotor == NULL) { 5858 /* no vdevs in this class */ 5859 spa_config_exit(spa, SCL_ALLOC, FTAG); 5860 return (SET_ERROR(ENOSPC)); 5861 } 5862 5863 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 5864 ASSERT(BP_GET_NDVAS(bp) == 0); 5865 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 5866 ASSERT3P(zal, !=, NULL); 5867 5868 for (int d = 0; d < ndvas; d++) { 5869 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 5870 txg, flags, zal, allocator); 5871 if (error != 0) { 5872 for (d--; d >= 0; d--) { 5873 metaslab_unalloc_dva(spa, &dva[d], txg); 5874 metaslab_group_alloc_decrement(spa, 5875 DVA_GET_VDEV(&dva[d]), zio, flags, 5876 allocator, B_FALSE); 5877 memset(&dva[d], 0, sizeof (dva_t)); 5878 } 5879 spa_config_exit(spa, SCL_ALLOC, FTAG); 5880 return (error); 5881 } else { 5882 /* 5883 * Update the metaslab group's queue depth 5884 * based on the newly allocated dva. 5885 */ 5886 metaslab_group_alloc_increment(spa, 5887 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 5888 } 5889 } 5890 ASSERT(error == 0); 5891 ASSERT(BP_GET_NDVAS(bp) == ndvas); 5892 5893 spa_config_exit(spa, SCL_ALLOC, FTAG); 5894 5895 BP_SET_BIRTH(bp, txg, 0); 5896 5897 return (0); 5898 } 5899 5900 void 5901 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 5902 { 5903 const dva_t *dva = bp->blk_dva; 5904 int ndvas = BP_GET_NDVAS(bp); 5905 5906 ASSERT(!BP_IS_HOLE(bp)); 5907 ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); 5908 5909 /* 5910 * If we have a checkpoint for the pool we need to make sure that 5911 * the blocks that we free that are part of the checkpoint won't be 5912 * reused until the checkpoint is discarded or we revert to it. 
5913 * 5914 * The checkpoint flag is passed down the metaslab_free code path 5915 * and is set whenever we want to add a block to the checkpoint's 5916 * accounting. That is, we "checkpoint" blocks that existed at the 5917 * time the checkpoint was created and are therefore referenced by 5918 * the checkpointed uberblock. 5919 * 5920 * Note that, we don't checkpoint any blocks if the current 5921 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 5922 * normally as they will be referenced by the checkpointed uberblock. 5923 */ 5924 boolean_t checkpoint = B_FALSE; 5925 if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && 5926 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 5927 /* 5928 * At this point, if the block is part of the checkpoint 5929 * there is no way it was created in the current txg. 5930 */ 5931 ASSERT(!now); 5932 ASSERT3U(spa_syncing_txg(spa), ==, txg); 5933 checkpoint = B_TRUE; 5934 } 5935 5936 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 5937 5938 for (int d = 0; d < ndvas; d++) { 5939 if (now) { 5940 metaslab_unalloc_dva(spa, &dva[d], txg); 5941 } else { 5942 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 5943 metaslab_free_dva(spa, &dva[d], checkpoint); 5944 } 5945 } 5946 5947 spa_config_exit(spa, SCL_FREE, FTAG); 5948 } 5949 5950 int 5951 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 5952 { 5953 const dva_t *dva = bp->blk_dva; 5954 int ndvas = BP_GET_NDVAS(bp); 5955 int error = 0; 5956 5957 ASSERT(!BP_IS_HOLE(bp)); 5958 5959 if (txg != 0) { 5960 /* 5961 * First do a dry run to make sure all DVAs are claimable, 5962 * so we don't have to unwind from partial failures below. 
5963 */ 5964 if ((error = metaslab_claim(spa, bp, 0)) != 0) 5965 return (error); 5966 } 5967 5968 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 5969 5970 for (int d = 0; d < ndvas; d++) { 5971 error = metaslab_claim_dva(spa, &dva[d], txg); 5972 if (error != 0) 5973 break; 5974 } 5975 5976 spa_config_exit(spa, SCL_ALLOC, FTAG); 5977 5978 ASSERT(error == 0 || txg == 0); 5979 5980 return (error); 5981 } 5982 5983 static void 5984 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 5985 uint64_t size, void *arg) 5986 { 5987 (void) inner, (void) arg; 5988 5989 if (vd->vdev_ops == &vdev_indirect_ops) 5990 return; 5991 5992 metaslab_check_free_impl(vd, offset, size); 5993 } 5994 5995 static void 5996 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 5997 { 5998 metaslab_t *msp; 5999 spa_t *spa __maybe_unused = vd->vdev_spa; 6000 6001 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 6002 return; 6003 6004 if (vd->vdev_ops->vdev_op_remap != NULL) { 6005 vd->vdev_ops->vdev_op_remap(vd, offset, size, 6006 metaslab_check_free_impl_cb, NULL); 6007 return; 6008 } 6009 6010 ASSERT(vdev_is_concrete(vd)); 6011 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 6012 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 6013 6014 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6015 6016 mutex_enter(&msp->ms_lock); 6017 if (msp->ms_loaded) { 6018 range_tree_verify_not_present(msp->ms_allocatable, 6019 offset, size); 6020 } 6021 6022 /* 6023 * Check all segments that currently exist in the freeing pipeline. 6024 * 6025 * It would intuitively make sense to also check the current allocating 6026 * tree since metaslab_unalloc_dva() exists for extents that are 6027 * allocated and freed in the same sync pass within the same txg. 6028 * Unfortunately there are places (e.g. the ZIL) where we allocate a 6029 * segment but then we free part of it within the same txg 6030 * [see zil_sync()]. 
Thus, we don't call range_tree_verify() in the 6031 * current allocating tree. 6032 */ 6033 range_tree_verify_not_present(msp->ms_freeing, offset, size); 6034 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 6035 range_tree_verify_not_present(msp->ms_freed, offset, size); 6036 for (int j = 0; j < TXG_DEFER_SIZE; j++) 6037 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 6038 range_tree_verify_not_present(msp->ms_trim, offset, size); 6039 mutex_exit(&msp->ms_lock); 6040 } 6041 6042 void 6043 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 6044 { 6045 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 6046 return; 6047 6048 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6049 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 6050 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 6051 vdev_t *vd = vdev_lookup_top(spa, vdev); 6052 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 6053 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 6054 6055 if (DVA_GET_GANG(&bp->blk_dva[i])) 6056 size = vdev_gang_header_asize(vd); 6057 6058 ASSERT3P(vd, !=, NULL); 6059 6060 metaslab_check_free_impl(vd, offset, size); 6061 } 6062 spa_config_exit(spa, SCL_VDEV, FTAG); 6063 } 6064 6065 static void 6066 metaslab_group_disable_wait(metaslab_group_t *mg) 6067 { 6068 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 6069 while (mg->mg_disabled_updating) { 6070 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 6071 } 6072 } 6073 6074 static void 6075 metaslab_group_disabled_increment(metaslab_group_t *mg) 6076 { 6077 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 6078 ASSERT(mg->mg_disabled_updating); 6079 6080 while (mg->mg_ms_disabled >= max_disabled_ms) { 6081 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 6082 } 6083 mg->mg_ms_disabled++; 6084 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); 6085 } 6086 6087 /* 6088 * Mark the metaslab as disabled to prevent any allocations on this metaslab. 
6089 * We must also track how many metaslabs are currently disabled within a 6090 * metaslab group and limit them to prevent allocation failures from 6091 * occurring because all metaslabs are disabled. 6092 */ 6093 void 6094 metaslab_disable(metaslab_t *msp) 6095 { 6096 ASSERT(!MUTEX_HELD(&msp->ms_lock)); 6097 metaslab_group_t *mg = msp->ms_group; 6098 6099 mutex_enter(&mg->mg_ms_disabled_lock); 6100 6101 /* 6102 * To keep an accurate count of how many threads have disabled 6103 * a specific metaslab group, we only allow one thread to mark 6104 * the metaslab group at a time. This ensures that the value of 6105 * ms_disabled will be accurate when we decide to mark a metaslab 6106 * group as disabled. To do this we force all other threads 6107 * to wait till the metaslab's mg_disabled_updating flag is no 6108 * longer set. 6109 */ 6110 metaslab_group_disable_wait(mg); 6111 mg->mg_disabled_updating = B_TRUE; 6112 if (msp->ms_disabled == 0) { 6113 metaslab_group_disabled_increment(mg); 6114 } 6115 mutex_enter(&msp->ms_lock); 6116 msp->ms_disabled++; 6117 mutex_exit(&msp->ms_lock); 6118 6119 mg->mg_disabled_updating = B_FALSE; 6120 cv_broadcast(&mg->mg_ms_disabled_cv); 6121 mutex_exit(&mg->mg_ms_disabled_lock); 6122 } 6123 6124 void 6125 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) 6126 { 6127 metaslab_group_t *mg = msp->ms_group; 6128 spa_t *spa = mg->mg_vd->vdev_spa; 6129 6130 /* 6131 * Wait for the outstanding IO to be synced to prevent newly 6132 * allocated blocks from being overwritten. This used by 6133 * initialize and TRIM which are modifying unallocated space. 
6134 */ 6135 if (sync) 6136 txg_wait_synced(spa_get_dsl(spa), 0); 6137 6138 mutex_enter(&mg->mg_ms_disabled_lock); 6139 mutex_enter(&msp->ms_lock); 6140 if (--msp->ms_disabled == 0) { 6141 mg->mg_ms_disabled--; 6142 cv_broadcast(&mg->mg_ms_disabled_cv); 6143 if (unload) 6144 metaslab_unload(msp); 6145 } 6146 mutex_exit(&msp->ms_lock); 6147 mutex_exit(&mg->mg_ms_disabled_lock); 6148 } 6149 6150 void 6151 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) 6152 { 6153 ms->ms_unflushed_dirty = dirty; 6154 } 6155 6156 static void 6157 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) 6158 { 6159 vdev_t *vd = ms->ms_group->mg_vd; 6160 spa_t *spa = vd->vdev_spa; 6161 objset_t *mos = spa_meta_objset(spa); 6162 6163 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 6164 6165 metaslab_unflushed_phys_t entry = { 6166 .msp_unflushed_txg = metaslab_unflushed_txg(ms), 6167 }; 6168 uint64_t entry_size = sizeof (entry); 6169 uint64_t entry_offset = ms->ms_id * entry_size; 6170 6171 uint64_t object = 0; 6172 int err = zap_lookup(mos, vd->vdev_top_zap, 6173 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 6174 &object); 6175 if (err == ENOENT) { 6176 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, 6177 SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); 6178 VERIFY0(zap_add(mos, vd->vdev_top_zap, 6179 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 6180 &object, tx)); 6181 } else { 6182 VERIFY0(err); 6183 } 6184 6185 dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, 6186 &entry, tx); 6187 } 6188 6189 void 6190 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) 6191 { 6192 ms->ms_unflushed_txg = txg; 6193 metaslab_update_ondisk_flush_data(ms, tx); 6194 } 6195 6196 boolean_t 6197 metaslab_unflushed_dirty(metaslab_t *ms) 6198 { 6199 return (ms->ms_unflushed_dirty); 6200 } 6201 6202 uint64_t 6203 metaslab_unflushed_txg(metaslab_t *ms) 6204 { 6205 return (ms->ms_unflushed_txg); 6206 } 6207 6208 
/*
 * Tunable registrations for this file.  Each ZFS_MODULE_PARAM() invocation
 * names a scope, a variable-name prefix, the tunable's name, its type and
 * permission tokens, and ends with the tunable's human-readable description.
 * NOTE(review): the exact expansion is defined by the ZFS_MODULE_PARAM macro,
 * which is not visible here; presumably prefix + name is the backing variable.
 */
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
	"Allocation granularity (a.k.a. stripe size)");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
	"Load all metaslabs when pool is first opened");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
	"Prevent metaslabs from being unloaded");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
	"Preload potential metaslabs during reassessment");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
	"Max number of metaslabs per group to preload");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
	"Delay in txgs after metaslab was last used before unloading");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
	"Delay in milliseconds after metaslab was last used before unloading");

/* Metaslab group (zfs_mg_*) thresholds and the fragmentation-factor switch. */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
	"Percentage of metaslab group size that should be free to make it "
	"eligible for allocation");

ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
	"Percentage of metaslab group size that should be considered eligible "
	"for allocations unless all metaslab groups within the metaslab class "
	"have also crossed this threshold");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
	ZMOD_RW,
	"Use the fragmentation metric to prefer less fragmented metaslabs");
/* END CSTYLED */

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
	ZMOD_RW, "Fragmentation for metaslab to allow allocation");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
	"Prefer metaslabs with lower LBAs");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
	"Enable metaslab group biasing");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
	ZMOD_RW, "Enable segment-based metaslab selection");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
	"Segment-based metaslab selection maximum buckets before switching");

/* Testing aids: force a share of large blocks to be written as gang blocks. */
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
	"Blocks larger than this size are sometimes forced to be gang blocks");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
	"Percentage of large blocks that will be forced to be gang blocks");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
	"Max distance (bytes) to search forward before using size tree");

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
	"When looking in size tree, use largest segment instead of exact fit");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
	ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
	"Percentage of memory that can be used to store metaslab range trees");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
	ZMOD_RW, "Try hard to allocate before ganging");

ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
	"Normally only consider this many of the best metaslabs in each vdev");

/*
 * Unlike the plain tunables above, the active allocator is registered with
 * explicit set/get handlers (param_set_active_allocator, param_get_charp).
 */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
	param_set_active_allocator, param_get_charp, ZMOD_RW,
	"SPA active allocator");
/* END CSTYLED */