1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright (c) 2017, Intel Corporation. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/space_map.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zfeature.h> 38 #include <sys/vdev_indirect_mapping.h> 39 #include <sys/zap.h> 40 #include <sys/btree.h> 41 42 #define GANG_ALLOCATION(flags) \ 43 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 44 45 uint64_t metaslab_aliquot = 512ULL << 10; 46 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 47 48 /* 49 * In pools where the log space map feature is not enabled we touch 50 * multiple metaslabs (and their respective space maps) with each 51 * transaction group. Thus, we benefit from having a small space map 52 * block size since it allows us to issue more I/O operations scattered 53 * around the disk. So a sane default for the space map block size 54 * is 8~16K. 55 */ 56 int zfs_metaslab_sm_blksz_no_log = (1 << 14); 57 58 /* 59 * When the log space map feature is enabled, we accumulate a lot of 60 * changes per metaslab that are flushed once in a while so we benefit 61 * from a bigger block size like 128K for the metaslab space maps. 62 */ 63 int zfs_metaslab_sm_blksz_with_log = (1 << 17); 64 65 /* 66 * The in-core space map representation is more compact than its on-disk form. 67 * The zfs_condense_pct determines how much more compact the in-core 68 * space map representation must be before we compact it on-disk. 69 * Values should be greater than or equal to 100. 70 */ 71 int zfs_condense_pct = 200; 72 73 /* 74 * Condensing a metaslab is not guaranteed to actually reduce the amount of 75 * space used on disk. In particular, a space map uses data in increments of 76 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 77 * same number of blocks after condensing. Since the goal of condensing is to 78 * reduce the number of IOPs required to read the space map, we only want to 79 * condense when we can be sure we will reduce the number of blocks used by the 80 * space map. Unfortunately, we cannot precisely compute whether or not this is 81 * the case in metaslab_should_condense since we are holding ms_lock. 
Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or
 * equal to zfs_mg_fragmentation_threshold. If a metaslab group
 * exceeds this threshold then it will be skipped unless all metaslab
 * groups within the metaslab class have also crossed this threshold.
 *
 * This tunable was introduced to avoid edge cases where we continue
 * allocating from very fragmented disks in our pool while other, less
 * fragmented disks exist. On the other hand, if all disks in the
 * pool are uniformly approaching the threshold, the threshold can
 * be a speed bump in performance, where we keep switching the disks
 * that we allocate from (e.g. we allocate some segments from disk A,
 * pushing it past the threshold, while freeing segments from disk
 * B brings its fragmentation back below the threshold).
 *
 * Empirically, we've seen that our vdev selection for allocations is
 * good enough that fragmentation increases uniformly across all vdevs
 * the majority of the time. Thus we set the threshold percentage high
 * enough to avoid hitting the speed bump on pools that are being pushed
 * to the edge.
 */
int zfs_mg_fragmentation_threshold = 95;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
156 * Once the space map's free space drops below this level we dynamically 157 * switch to using best-fit allocations. 158 */ 159 int metaslab_df_free_pct = 4; 160 161 /* 162 * Maximum distance to search forward from the last offset. Without this 163 * limit, fragmented pools can see >100,000 iterations and 164 * metaslab_block_picker() becomes the performance limiting factor on 165 * high-performance storage. 166 * 167 * With the default setting of 16MB, we typically see less than 500 168 * iterations, even with very fragmented, ashift=9 pools. The maximum number 169 * of iterations possible is: 170 * metaslab_df_max_search / (2 * (1<<ashift)) 171 * With the default setting of 16MB this is 16*1024 (with ashift=9) or 172 * 2048 (with ashift=12). 173 */ 174 int metaslab_df_max_search = 16 * 1024 * 1024; 175 176 /* 177 * Forces the metaslab_block_picker function to search for at least this many 178 * segments forwards until giving up on finding a segment that the allocation 179 * will fit into. 180 */ 181 uint32_t metaslab_min_search_count = 100; 182 183 /* 184 * If we are not searching forward (due to metaslab_df_max_search, 185 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable 186 * controls what segment is used. If it is set, we will use the largest free 187 * segment. If it is not set, we will use a segment of exactly the requested 188 * size (or larger). 189 */ 190 int metaslab_df_use_largest_segment = B_FALSE; 191 192 /* 193 * A metaslab is considered "free" if it contains a contiguous 194 * segment which is greater than metaslab_min_alloc_size. 195 */ 196 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 197 198 /* 199 * Percentage of all cpus that can be used by the metaslab taskq. 200 */ 201 int metaslab_load_pct = 50; 202 203 /* 204 * These tunables control how long a metaslab will remain loaded after the 205 * last allocation from it. A metaslab can't be unloaded until at least 206 * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds 207 * have elapsed. However, zfs_metaslab_mem_limit may cause it to be 208 * unloaded sooner. These settings are intended to be generous -- to keep 209 * metaslabs loaded for a long time, reducing the rate of metaslab loading. 210 */ 211 int metaslab_unload_delay = 32; 212 int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ 213 214 /* 215 * Max number of metaslabs per group to preload. 216 */ 217 int metaslab_preload_limit = 10; 218 219 /* 220 * Enable/disable preloading of metaslab. 221 */ 222 boolean_t metaslab_preload_enabled = B_TRUE; 223 224 /* 225 * Enable/disable fragmentation weighting on metaslabs. 226 */ 227 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 228 229 /* 230 * Enable/disable lba weighting (i.e. outer tracks are given preference). 231 */ 232 boolean_t metaslab_lba_weighting_enabled = B_TRUE; 233 234 /* 235 * Enable/disable metaslab group biasing. 236 */ 237 boolean_t metaslab_bias_enabled = B_TRUE; 238 239 /* 240 * Enable/disable remapping of indirect DVAs to their concrete vdevs. 241 */ 242 boolean_t zfs_remap_blkptr_enable = B_TRUE; 243 244 /* 245 * Enable/disable segment-based metaslab selection. 246 */ 247 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; 248 249 /* 250 * When using segment-based metaslab selection, we will continue 251 * allocating from the active metaslab until we have exhausted 252 * zfs_metaslab_switch_threshold of its buckets. 
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached, allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

/*
 * Maximum number of metaslabs per group that can be disabled
 * simultaneously.
 */
int max_disabled_ms = 3;

/*
 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
 * To avoid 64-bit overflow, don't set above UINT32_MAX.
 */
unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */

/*
 * Maximum percentage of memory to use on storing loaded metaslabs. If loading
 * a metaslab would take it over this percentage, the oldest selected metaslab
 * is automatically unloaded.
 */
int zfs_metaslab_mem_limit = 75;

/*
 * Force the per-metaslab range trees to use 64-bit integers to store
 * segments. Used for debugging purposes.
 */
boolean_t zfs_metaslab_force_large_segs = B_FALSE;

/*
 * By default we only store segments over a certain size in the size-sorted
 * metaslab trees (ms_allocatable_by_size and
 * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
 * improves load and unload times at the cost of causing us to use slightly
 * larger segments than we would otherwise in some cases.
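 * With the default metaslab_by_size_min_shift of 14, for example, only
 * segments of at least 16K (1 << 14) are kept in the by-size trees; when no
 * segment of at least that size remains, callers fall back to
 * metaslab_size_tree_full_load(), which rebuilds the tree with a floor of
 * zero so the smaller segments become visible.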
303 */ 304 uint32_t metaslab_by_size_min_shift = 14; 305 306 static uint64_t metaslab_weight(metaslab_t *); 307 static void metaslab_set_fragmentation(metaslab_t *); 308 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 309 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 310 static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 311 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); 312 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); 313 static unsigned int metaslab_idx_func(multilist_t *, void *); 314 static void metaslab_evict(metaslab_t *, uint64_t); 315 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); 316 317 kmem_cache_t *metaslab_alloc_trace_cache; 318 319 typedef struct metaslab_stats { 320 kstat_named_t metaslabstat_trace_over_limit; 321 kstat_named_t metaslabstat_df_find_under_floor; 322 kstat_named_t metaslabstat_reload_tree; 323 } metaslab_stats_t; 324 325 static metaslab_stats_t metaslab_stats = { 326 { "trace_over_limit", KSTAT_DATA_UINT64 }, 327 { "df_find_under_floor", KSTAT_DATA_UINT64 }, 328 { "reload_tree", KSTAT_DATA_UINT64 }, 329 }; 330 331 #define METASLABSTAT_BUMP(stat) \ 332 atomic_inc_64(&metaslab_stats.stat.value.ui64); 333 334 335 kstat_t *metaslab_ksp; 336 337 void 338 metaslab_stat_init(void) 339 { 340 ASSERT(metaslab_alloc_trace_cache == NULL); 341 metaslab_alloc_trace_cache = kmem_cache_create( 342 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 343 0, NULL, NULL, NULL, NULL, NULL, 0); 344 metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", 345 "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / 346 sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 347 if (metaslab_ksp != NULL) { 348 metaslab_ksp->ks_data = &metaslab_stats; 349 kstat_install(metaslab_ksp); 350 } 351 } 352 353 void 354 metaslab_stat_fini(void) 355 { 356 if (metaslab_ksp != NULL) { 357 kstat_delete(metaslab_ksp); 358 metaslab_ksp = NULL; 359 } 360 361 kmem_cache_destroy(metaslab_alloc_trace_cache); 362 metaslab_alloc_trace_cache = NULL; 363 } 364 365 /* 366 * ========================================================================== 367 * Metaslab classes 368 * ========================================================================== 369 */ 370 metaslab_class_t * 371 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 372 { 373 metaslab_class_t *mc; 374 375 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 376 377 mc->mc_spa = spa; 378 mc->mc_rotor = NULL; 379 mc->mc_ops = ops; 380 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 381 mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), 382 offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); 383 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * 384 sizeof (zfs_refcount_t), KM_SLEEP); 385 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * 386 sizeof (uint64_t), KM_SLEEP); 387 for (int i = 0; i < spa->spa_alloc_count; i++) 388 zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); 389 390 return (mc); 391 } 392 393 void 394 metaslab_class_destroy(metaslab_class_t *mc) 395 { 396 ASSERT(mc->mc_rotor == NULL); 397 ASSERT(mc->mc_alloc == 0); 398 ASSERT(mc->mc_deferred == 0); 399 ASSERT(mc->mc_space == 0); 400 ASSERT(mc->mc_dspace == 0); 401 402 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) 403 zfs_refcount_destroy(&mc->mc_alloc_slots[i]); 404 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * 405 sizeof (zfs_refcount_t)); 406 kmem_free(mc->mc_alloc_max_slots, 
mc->mc_spa->spa_alloc_count *
	    sizeof (uint64_t));
	mutex_destroy(&mc->mc_lock);
	multilist_destroy(mc->mc_metaslab_txg_list);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	spa_t *spa = mc->mc_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
532 */ 533 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 534 mg->mg_class != mc) { 535 continue; 536 } 537 538 /* 539 * If a metaslab group does not contain a fragmentation 540 * metric then just bail out. 541 */ 542 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 543 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 544 return (ZFS_FRAG_INVALID); 545 } 546 547 /* 548 * Determine how much this metaslab_group is contributing 549 * to the overall pool fragmentation metric. 550 */ 551 fragmentation += mg->mg_fragmentation * 552 metaslab_group_get_space(mg); 553 } 554 fragmentation /= metaslab_class_get_space(mc); 555 556 ASSERT3U(fragmentation, <=, 100); 557 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 558 return (fragmentation); 559 } 560 561 /* 562 * Calculate the amount of expandable space that is available in 563 * this metaslab class. If a device is expanded then its expandable 564 * space will be the amount of allocatable space that is currently not 565 * part of this metaslab class. 566 */ 567 uint64_t 568 metaslab_class_expandable_space(metaslab_class_t *mc) 569 { 570 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 571 uint64_t space = 0; 572 573 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 574 for (int c = 0; c < rvd->vdev_children; c++) { 575 uint64_t tspace; 576 vdev_t *tvd = rvd->vdev_child[c]; 577 metaslab_group_t *mg = tvd->vdev_mg; 578 579 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 580 mg->mg_class != mc) { 581 continue; 582 } 583 584 /* 585 * Calculate if we have enough space to add additional 586 * metaslabs. We report the expandable space in terms 587 * of the metaslab size since that's the unit of expansion. 588 * Adjust by efi system partition size. 589 */ 590 tspace = tvd->vdev_max_asize - tvd->vdev_asize; 591 if (tspace > mc->mc_spa->spa_bootsize) { 592 tspace -= mc->mc_spa->spa_bootsize; 593 } 594 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); 595 } 596 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 597 return (space); 598 } 599 600 void 601 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) 602 { 603 multilist_t *ml = mc->mc_metaslab_txg_list; 604 for (int i = 0; i < multilist_get_num_sublists(ml); i++) { 605 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 606 metaslab_t *msp = multilist_sublist_head(mls); 607 multilist_sublist_unlock(mls); 608 while (msp != NULL) { 609 mutex_enter(&msp->ms_lock); 610 611 /* 612 * If the metaslab has been removed from the list 613 * (which could happen if we were at the memory limit 614 * and it was evicted during this loop), then we can't 615 * proceed and we should restart the sublist. 616 */ 617 if (!multilist_link_active(&msp->ms_class_txg_node)) { 618 mutex_exit(&msp->ms_lock); 619 i--; 620 break; 621 } 622 mls = multilist_sublist_lock(ml, i); 623 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 624 multilist_sublist_unlock(mls); 625 if (txg > 626 msp->ms_selected_txg + metaslab_unload_delay && 627 gethrtime() > msp->ms_selected_time + 628 (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { 629 metaslab_evict(msp, txg); 630 } else { 631 /* 632 * Once we've hit a metaslab selected too 633 * recently to evict, we're done evicting for 634 * now. 
				 */
				mutex_exit(&msp->ms_lock);
				break;
			}
			mutex_exit(&msp->ms_lock);
			msp = next_msp;
		}
	}
}

static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = (const metaslab_t *)x1;
	const metaslab_t *m2 = (const metaslab_t *)x2;

	int sort1 = 0;
	int sort2 = 0;
	if (m1->ms_allocator != -1 && m1->ms_primary)
		sort1 = 1;
	else if (m1->ms_allocator != -1 && !m1->ms_primary)
		sort1 = 2;
	if (m2->ms_allocator != -1 && m2->ms_primary)
		sort2 = 1;
	else if (m2->ms_allocator != -1 && !m2->ms_primary)
		sort2 = 2;

	/*
	 * Sort inactive metaslabs first, then primaries, then secondaries.
	 * When selecting a metaslab to allocate from, an allocator first
	 * tries its primary, then secondary active metaslab. If it doesn't
	 * have active metaslabs, or can't allocate from them, it searches
	 * for an inactive metaslab to activate. If it can't find a suitable
	 * one, it will steal a primary or secondary metaslab from another
	 * allocator.
	 */
	if (sort1 < sort2)
		return (-1);
	if (sort1 > sort2)
		return (1);

	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
	if (likely(cmp))
		return (cmp);

	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);

	return (TREE_CMP(m1->ms_start, m2->ms_start));
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the group's free capacity
 * is greater than zfs_mg_noalloc_threshold and its fragmentation metric
 * is less than or equal to zfs_mg_fragmentation_threshold. If a metaslab
 * group transitions from allocatable to non-allocatable or vice versa then
 * the metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented.
We only take 740 * fragmentation into account if the metaslab group has a valid 741 * fragmentation metric (i.e. a value between 0 and 100). 742 */ 743 mg->mg_allocatable = (mg->mg_activation_count > 0 && 744 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 745 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 746 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 747 748 /* 749 * The mc_alloc_groups maintains a count of the number of 750 * groups in this metaslab class that are still above the 751 * zfs_mg_noalloc_threshold. This is used by the allocating 752 * threads to determine if they should avoid allocations to 753 * a given group. The allocator will avoid allocations to a group 754 * if that group has reached or is below the zfs_mg_noalloc_threshold 755 * and there are still other groups that are above the threshold. 756 * When a group transitions from allocatable to non-allocatable or 757 * vice versa we update the metaslab class to reflect that change. 758 * When the mc_alloc_groups value drops to 0 that means that all 759 * groups have reached the zfs_mg_noalloc_threshold making all groups 760 * eligible for allocations. This effectively means that all devices 761 * are balanced again. 762 */ 763 if (was_allocatable && !mg->mg_allocatable) 764 mc->mc_alloc_groups--; 765 else if (!was_allocatable && mg->mg_allocatable) 766 mc->mc_alloc_groups++; 767 mutex_exit(&mc->mc_lock); 768 769 mutex_exit(&mg->mg_lock); 770 } 771 772 int 773 metaslab_sort_by_flushed(const void *va, const void *vb) 774 { 775 const metaslab_t *a = va; 776 const metaslab_t *b = vb; 777 778 int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); 779 if (likely(cmp)) 780 return (cmp); 781 782 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; 783 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; 784 cmp = TREE_CMP(a_vdev_id, b_vdev_id); 785 if (cmp) 786 return (cmp); 787 788 return (TREE_CMP(a->ms_id, b->ms_id)); 789 } 790 791 metaslab_group_t * 792 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 793 { 794 metaslab_group_t *mg; 795 796 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 797 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 798 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); 799 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); 800 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 801 KM_SLEEP); 802 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 803 KM_SLEEP); 804 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 805 sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); 806 mg->mg_vd = vd; 807 mg->mg_class = mc; 808 mg->mg_activation_count = 0; 809 mg->mg_initialized = B_FALSE; 810 mg->mg_no_free_space = B_TRUE; 811 mg->mg_allocators = allocators; 812 813 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * 814 sizeof (zfs_refcount_t), KM_SLEEP); 815 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * 816 sizeof (uint64_t), KM_SLEEP); 817 for (int i = 0; i < allocators; i++) { 818 zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); 819 mg->mg_cur_max_alloc_queue_depth[i] = 0; 820 } 821 822 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 823 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 824 825 return (mg); 826 } 827 828 void 829 metaslab_group_destroy(metaslab_group_t *mg) 830 { 831 ASSERT(mg->mg_prev == NULL); 832 ASSERT(mg->mg_next == NULL); 833 /* 834 * We may have gone below zero with the activation count 835 * either because we never activated 
in the first place or 836 * because we're done, and possibly removing the vdev. 837 */ 838 ASSERT(mg->mg_activation_count <= 0); 839 840 taskq_destroy(mg->mg_taskq); 841 avl_destroy(&mg->mg_metaslab_tree); 842 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); 843 kmem_free(mg->mg_secondaries, mg->mg_allocators * 844 sizeof (metaslab_t *)); 845 mutex_destroy(&mg->mg_lock); 846 mutex_destroy(&mg->mg_ms_disabled_lock); 847 cv_destroy(&mg->mg_ms_disabled_cv); 848 849 for (int i = 0; i < mg->mg_allocators; i++) { 850 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); 851 mg->mg_cur_max_alloc_queue_depth[i] = 0; 852 } 853 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * 854 sizeof (zfs_refcount_t)); 855 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * 856 sizeof (uint64_t)); 857 858 kmem_free(mg, sizeof (metaslab_group_t)); 859 } 860 861 void 862 metaslab_group_activate(metaslab_group_t *mg) 863 { 864 metaslab_class_t *mc = mg->mg_class; 865 metaslab_group_t *mgprev, *mgnext; 866 867 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 868 869 ASSERT(mc->mc_rotor != mg); 870 ASSERT(mg->mg_prev == NULL); 871 ASSERT(mg->mg_next == NULL); 872 ASSERT(mg->mg_activation_count <= 0); 873 874 if (++mg->mg_activation_count <= 0) 875 return; 876 877 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 878 metaslab_group_alloc_update(mg); 879 880 if ((mgprev = mc->mc_rotor) == NULL) { 881 mg->mg_prev = mg; 882 mg->mg_next = mg; 883 } else { 884 mgnext = mgprev->mg_next; 885 mg->mg_prev = mgprev; 886 mg->mg_next = mgnext; 887 mgprev->mg_next = mg; 888 mgnext->mg_prev = mg; 889 } 890 mc->mc_rotor = mg; 891 } 892 893 /* 894 * Passivate a metaslab group and remove it from the allocation rotor. 895 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 896 * a metaslab group. This function will momentarily drop spa_config_locks 897 * that are lower than the SCL_ALLOC lock (see comment below). 898 */ 899 void 900 metaslab_group_passivate(metaslab_group_t *mg) 901 { 902 metaslab_class_t *mc = mg->mg_class; 903 spa_t *spa = mc->mc_spa; 904 metaslab_group_t *mgprev, *mgnext; 905 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 906 907 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 908 (SCL_ALLOC | SCL_ZIO)); 909 910 if (--mg->mg_activation_count != 0) { 911 ASSERT(mc->mc_rotor != mg); 912 ASSERT(mg->mg_prev == NULL); 913 ASSERT(mg->mg_next == NULL); 914 ASSERT(mg->mg_activation_count < 0); 915 return; 916 } 917 918 /* 919 * The spa_config_lock is an array of rwlocks, ordered as 920 * follows (from highest to lowest): 921 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 922 * SCL_ZIO > SCL_FREE > SCL_VDEV 923 * (For more information about the spa_config_lock see spa_misc.c) 924 * The higher the lock, the broader its coverage. When we passivate 925 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 926 * config locks. However, the metaslab group's taskq might be trying 927 * to preload metaslabs so we must drop the SCL_ZIO lock and any 928 * lower locks to allow the I/O to complete. At a minimum, 929 * we continue to hold the SCL_ALLOC lock, which prevents any future 930 * allocations from taking place and any changes to the vdev tree. 
931 */ 932 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 933 taskq_wait(mg->mg_taskq); 934 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 935 metaslab_group_alloc_update(mg); 936 for (int i = 0; i < mg->mg_allocators; i++) { 937 metaslab_t *msp = mg->mg_primaries[i]; 938 if (msp != NULL) { 939 mutex_enter(&msp->ms_lock); 940 metaslab_passivate(msp, 941 metaslab_weight_from_range_tree(msp)); 942 mutex_exit(&msp->ms_lock); 943 } 944 msp = mg->mg_secondaries[i]; 945 if (msp != NULL) { 946 mutex_enter(&msp->ms_lock); 947 metaslab_passivate(msp, 948 metaslab_weight_from_range_tree(msp)); 949 mutex_exit(&msp->ms_lock); 950 } 951 } 952 953 mgprev = mg->mg_prev; 954 mgnext = mg->mg_next; 955 956 if (mg == mgnext) { 957 mc->mc_rotor = NULL; 958 } else { 959 mc->mc_rotor = mgnext; 960 mgprev->mg_next = mgnext; 961 mgnext->mg_prev = mgprev; 962 } 963 964 mg->mg_prev = NULL; 965 mg->mg_next = NULL; 966 } 967 968 boolean_t 969 metaslab_group_initialized(metaslab_group_t *mg) 970 { 971 vdev_t *vd = mg->mg_vd; 972 vdev_stat_t *vs = &vd->vdev_stat; 973 974 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 975 } 976 977 uint64_t 978 metaslab_group_get_space(metaslab_group_t *mg) 979 { 980 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 981 } 982 983 void 984 metaslab_group_histogram_verify(metaslab_group_t *mg) 985 { 986 uint64_t *mg_hist; 987 vdev_t *vd = mg->mg_vd; 988 uint64_t ashift = vd->vdev_ashift; 989 int i; 990 991 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 992 return; 993 994 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 995 KM_SLEEP); 996 997 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 998 SPACE_MAP_HISTOGRAM_SIZE + ashift); 999 1000 for (int m = 0; m < vd->vdev_ms_count; m++) { 1001 metaslab_t *msp = vd->vdev_ms[m]; 1002 1003 /* skip if not active or not a member */ 1004 if (msp->ms_sm == NULL || msp->ms_group != mg) 1005 continue; 1006 1007 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 1008 mg_hist[i + ashift] += 1009 msp->ms_sm->sm_phys->smp_histogram[i]; 1010 } 1011 1012 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 1013 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 1014 1015 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 1016 } 1017 1018 static void 1019 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 1020 { 1021 metaslab_class_t *mc = mg->mg_class; 1022 uint64_t ashift = mg->mg_vd->vdev_ashift; 1023 1024 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1025 if (msp->ms_sm == NULL) 1026 return; 1027 1028 mutex_enter(&mg->mg_lock); 1029 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1030 mg->mg_histogram[i + ashift] += 1031 msp->ms_sm->sm_phys->smp_histogram[i]; 1032 mc->mc_histogram[i + ashift] += 1033 msp->ms_sm->sm_phys->smp_histogram[i]; 1034 } 1035 mutex_exit(&mg->mg_lock); 1036 } 1037 1038 void 1039 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 1040 { 1041 metaslab_class_t *mc = mg->mg_class; 1042 uint64_t ashift = mg->mg_vd->vdev_ashift; 1043 1044 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1045 if (msp->ms_sm == NULL) 1046 return; 1047 1048 mutex_enter(&mg->mg_lock); 1049 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1050 ASSERT3U(mg->mg_histogram[i + ashift], >=, 1051 msp->ms_sm->sm_phys->smp_histogram[i]); 1052 ASSERT3U(mc->mc_histogram[i + ashift], >=, 1053 msp->ms_sm->sm_phys->smp_histogram[i]); 1054 1055 mg->mg_histogram[i + ashift] -= 1056 msp->ms_sm->sm_phys->smp_histogram[i]; 1057 mc->mc_histogram[i + ashift] -= 1058 
msp->ms_sm->sm_phys->smp_histogram[i]; 1059 } 1060 mutex_exit(&mg->mg_lock); 1061 } 1062 1063 static void 1064 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 1065 { 1066 ASSERT(msp->ms_group == NULL); 1067 mutex_enter(&mg->mg_lock); 1068 msp->ms_group = mg; 1069 msp->ms_weight = 0; 1070 avl_add(&mg->mg_metaslab_tree, msp); 1071 mutex_exit(&mg->mg_lock); 1072 1073 mutex_enter(&msp->ms_lock); 1074 metaslab_group_histogram_add(mg, msp); 1075 mutex_exit(&msp->ms_lock); 1076 } 1077 1078 static void 1079 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 1080 { 1081 mutex_enter(&msp->ms_lock); 1082 metaslab_group_histogram_remove(mg, msp); 1083 mutex_exit(&msp->ms_lock); 1084 1085 mutex_enter(&mg->mg_lock); 1086 ASSERT(msp->ms_group == mg); 1087 avl_remove(&mg->mg_metaslab_tree, msp); 1088 1089 metaslab_class_t *mc = msp->ms_group->mg_class; 1090 multilist_sublist_t *mls = 1091 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 1092 if (multilist_link_active(&msp->ms_class_txg_node)) 1093 multilist_sublist_remove(mls, msp); 1094 multilist_sublist_unlock(mls); 1095 1096 msp->ms_group = NULL; 1097 mutex_exit(&mg->mg_lock); 1098 } 1099 1100 static void 1101 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1102 { 1103 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1104 ASSERT(MUTEX_HELD(&mg->mg_lock)); 1105 ASSERT(msp->ms_group == mg); 1106 1107 avl_remove(&mg->mg_metaslab_tree, msp); 1108 msp->ms_weight = weight; 1109 avl_add(&mg->mg_metaslab_tree, msp); 1110 1111 } 1112 1113 static void 1114 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1115 { 1116 /* 1117 * Although in principle the weight can be any value, in 1118 * practice we do not use values in the range [1, 511]. 1119 */ 1120 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 1121 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1122 1123 mutex_enter(&mg->mg_lock); 1124 metaslab_group_sort_impl(mg, msp, weight); 1125 mutex_exit(&mg->mg_lock); 1126 } 1127 1128 /* 1129 * Calculate the fragmentation for a given metaslab group. We can use 1130 * a simple average here since all metaslabs within the group must have 1131 * the same size. The return value will be a value between 0 and 100 1132 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 1133 * group have a fragmentation metric. 1134 */ 1135 uint64_t 1136 metaslab_group_fragmentation(metaslab_group_t *mg) 1137 { 1138 vdev_t *vd = mg->mg_vd; 1139 uint64_t fragmentation = 0; 1140 uint64_t valid_ms = 0; 1141 1142 for (int m = 0; m < vd->vdev_ms_count; m++) { 1143 metaslab_t *msp = vd->vdev_ms[m]; 1144 1145 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 1146 continue; 1147 if (msp->ms_group != mg) 1148 continue; 1149 1150 valid_ms++; 1151 fragmentation += msp->ms_fragmentation; 1152 } 1153 1154 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) 1155 return (ZFS_FRAG_INVALID); 1156 1157 fragmentation /= valid_ms; 1158 ASSERT3U(fragmentation, <=, 100); 1159 return (fragmentation); 1160 } 1161 1162 /* 1163 * Determine if a given metaslab group should skip allocations. A metaslab 1164 * group should avoid allocations if its free capacity is less than the 1165 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 1166 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1167 * that can still handle allocations. 
If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize, int allocator, int d)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if ((mc != spa_normal_class(spa) &&
	    mc != spa_special_class(spa) &&
	    mc != spa_dedup_class(spa)) ||
	    mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		/*
		 * Relax allocation throttling for ditto blocks. Due to
		 * random imbalances in allocation it tends to push copies
		 * to whichever vdev looks a bit better at the moment.
		 */
		qmax = qmax * (4 + d) / 4;

		qdepth = zfs_refcount_count(
		    &mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
			qmax = qmax * (4 + d) / 4;
			qdepth = zfs_refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
1254 */ 1255 if (qdepth < qmax && !mgp->mg_no_free_space) 1256 return (B_FALSE); 1257 } 1258 1259 /* 1260 * We didn't find another group to handle the allocation 1261 * so we can't skip this metaslab group even though 1262 * we are at or over our qmax. 1263 */ 1264 return (B_TRUE); 1265 1266 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 1267 return (B_TRUE); 1268 } 1269 return (B_FALSE); 1270 } 1271 1272 /* 1273 * ========================================================================== 1274 * Range tree callbacks 1275 * ========================================================================== 1276 */ 1277 1278 /* 1279 * Comparison function for the private size-ordered tree using 32-bit 1280 * ranges. Tree is sorted by size, larger sizes at the end of the tree. 1281 */ 1282 static int 1283 metaslab_rangesize32_compare(const void *x1, const void *x2) 1284 { 1285 const range_seg32_t *r1 = x1; 1286 const range_seg32_t *r2 = x2; 1287 1288 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1289 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1290 1291 int cmp = TREE_CMP(rs_size1, rs_size2); 1292 if (likely(cmp)) 1293 return (cmp); 1294 1295 return (TREE_CMP(r1->rs_start, r2->rs_start)); 1296 } 1297 1298 /* 1299 * Comparison function for the private size-ordered tree using 64-bit 1300 * ranges. Tree is sorted by size, larger sizes at the end of the tree. 1301 */ 1302 static int 1303 metaslab_rangesize64_compare(const void *x1, const void *x2) 1304 { 1305 const range_seg64_t *r1 = x1; 1306 const range_seg64_t *r2 = x2; 1307 1308 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1309 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1310 1311 int cmp = TREE_CMP(rs_size1, rs_size2); 1312 if (likely(cmp)) 1313 return (cmp); 1314 1315 return (TREE_CMP(r1->rs_start, r2->rs_start)); 1316 } 1317 typedef struct metaslab_rt_arg { 1318 zfs_btree_t *mra_bt; 1319 uint32_t mra_floor_shift; 1320 } metaslab_rt_arg_t; 1321 1322 struct mssa_arg { 1323 range_tree_t *rt; 1324 metaslab_rt_arg_t *mra; 1325 }; 1326 1327 static void 1328 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) 1329 { 1330 struct mssa_arg *mssap = arg; 1331 range_tree_t *rt = mssap->rt; 1332 metaslab_rt_arg_t *mrap = mssap->mra; 1333 range_seg_max_t seg = {0}; 1334 rs_set_start(&seg, rt, start); 1335 rs_set_end(&seg, rt, start + size); 1336 metaslab_rt_add(rt, &seg, mrap); 1337 } 1338 1339 static void 1340 metaslab_size_tree_full_load(range_tree_t *rt) 1341 { 1342 metaslab_rt_arg_t *mrap = rt->rt_arg; 1343 #ifdef _METASLAB_TRACING 1344 METASLABSTAT_BUMP(metaslabstat_reload_tree); 1345 #endif 1346 ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); 1347 mrap->mra_floor_shift = 0; 1348 struct mssa_arg arg = {0}; 1349 arg.rt = rt; 1350 arg.mra = mrap; 1351 range_tree_walk(rt, metaslab_size_sorted_add, &arg); 1352 } 1353 1354 /* 1355 * Create any block allocator specific components. The current allocators 1356 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
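 * (In the current code, the array of uint64_t's is the per-metaslab
 * ms_lbas[] cursor array that the df, cf, and ndf allocators below use to
 * remember where their last allocation left off.)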
1357 */ 1358 /* ARGSUSED */ 1359 static void 1360 metaslab_rt_create(range_tree_t *rt, void *arg) 1361 { 1362 metaslab_rt_arg_t *mrap = arg; 1363 zfs_btree_t *size_tree = mrap->mra_bt; 1364 1365 size_t size; 1366 int (*compare) (const void *, const void *); 1367 switch (rt->rt_type) { 1368 case RANGE_SEG32: 1369 size = sizeof (range_seg32_t); 1370 compare = metaslab_rangesize32_compare; 1371 break; 1372 case RANGE_SEG64: 1373 size = sizeof (range_seg64_t); 1374 compare = metaslab_rangesize64_compare; 1375 break; 1376 default: 1377 panic("Invalid range seg type %d", rt->rt_type); 1378 } 1379 zfs_btree_create(size_tree, compare, size); 1380 mrap->mra_floor_shift = metaslab_by_size_min_shift; 1381 } 1382 1383 /* ARGSUSED */ 1384 static void 1385 metaslab_rt_destroy(range_tree_t *rt, void *arg) 1386 { 1387 metaslab_rt_arg_t *mrap = arg; 1388 zfs_btree_t *size_tree = mrap->mra_bt; 1389 1390 zfs_btree_destroy(size_tree); 1391 kmem_free(mrap, sizeof (*mrap)); 1392 } 1393 1394 /* ARGSUSED */ 1395 static void 1396 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 1397 { 1398 metaslab_rt_arg_t *mrap = arg; 1399 zfs_btree_t *size_tree = mrap->mra_bt; 1400 1401 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < 1402 (1 << mrap->mra_floor_shift)) 1403 return; 1404 1405 zfs_btree_add(size_tree, rs); 1406 } 1407 1408 /* ARGSUSED */ 1409 static void 1410 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 1411 { 1412 metaslab_rt_arg_t *mrap = arg; 1413 zfs_btree_t *size_tree = mrap->mra_bt; 1414 1415 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << 1416 mrap->mra_floor_shift)) 1417 return; 1418 1419 zfs_btree_remove(size_tree, rs); 1420 } 1421 1422 /* ARGSUSED */ 1423 static void 1424 metaslab_rt_vacate(range_tree_t *rt, void *arg) 1425 { 1426 metaslab_rt_arg_t *mrap = arg; 1427 zfs_btree_t *size_tree = mrap->mra_bt; 1428 zfs_btree_clear(size_tree); 1429 zfs_btree_destroy(size_tree); 1430 1431 metaslab_rt_create(rt, arg); 1432 } 1433 1434 static range_tree_ops_t metaslab_rt_ops = { 1435 .rtop_create = metaslab_rt_create, 1436 .rtop_destroy = metaslab_rt_destroy, 1437 .rtop_add = metaslab_rt_add, 1438 .rtop_remove = metaslab_rt_remove, 1439 .rtop_vacate = metaslab_rt_vacate 1440 }; 1441 1442 /* 1443 * ========================================================================== 1444 * Common allocator routines 1445 * ========================================================================== 1446 */ 1447 1448 /* 1449 * Return the maximum contiguous segment within the metaslab. 1450 */ 1451 uint64_t 1452 metaslab_largest_allocatable(metaslab_t *msp) 1453 { 1454 zfs_btree_t *t = &msp->ms_allocatable_by_size; 1455 range_seg_t *rs; 1456 1457 if (t == NULL) 1458 return (0); 1459 if (zfs_btree_numnodes(t) == 0) 1460 metaslab_size_tree_full_load(msp->ms_allocatable); 1461 1462 rs = zfs_btree_last(t, NULL); 1463 if (rs == NULL) 1464 return (0); 1465 1466 return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, 1467 msp->ms_allocatable)); 1468 } 1469 1470 /* 1471 * Return the maximum contiguous segment within the unflushed frees of this 1472 * metaslab. 
 */
uint64_t
metaslab_largest_unflushed_free(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if (msp->ms_unflushed_frees == NULL)
		return (0);

	if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
		metaslab_size_tree_full_load(msp->ms_unflushed_frees);
	range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
	    NULL);
	if (rs == NULL)
		return (0);

	/*
	 * When a range is freed from the metaslab, that range is added to
	 * both the unflushed frees and the deferred frees. While the block
	 * will eventually be usable, if the metaslab were loaded the range
	 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
	 * txgs had passed. As a result, when attempting to estimate an upper
	 * bound for the largest currently-usable free segment in the
	 * metaslab, we need to not consider any ranges currently in the defer
	 * trees. This algorithm approximates the largest available chunk in
	 * the largest range in the unflushed_frees tree by taking the first
	 * chunk. While this may be a poor estimate, it should only remain so
	 * briefly and should eventually self-correct as frees are no longer
	 * deferred. Similar logic applies to the ms_freed tree. See
	 * metaslab_load() for more details.
	 *
	 * There are two primary sources of inaccuracy in this estimate. Both
	 * are tolerated for performance reasons. The first source is that we
	 * only check the largest segment for overlaps. Smaller segments may
	 * have more favorable overlaps with the other trees, resulting in
	 * larger usable chunks. Second, we only look at the first chunk in
	 * the largest segment; there may be other usable chunks in the
	 * largest segment, but we ignore them.
	 */
	uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
	uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		uint64_t start = 0;
		uint64_t size = 0;
		boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
		    rsize, &start, &size);
		if (found) {
			if (rstart == start)
				return (0);
			rsize = start - rstart;
		}
	}

	uint64_t start = 0;
	uint64_t size = 0;
	boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
	    rsize, &start, &size);
	if (found)
		rsize = start - rstart;

	return (rsize);
}

static range_seg_t *
metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
    uint64_t size, zfs_btree_index_t *where)
{
	range_seg_t *rs;
	range_seg_max_t rsearch;

	rs_set_start(&rsearch, rt, start);
	rs_set_end(&rsearch, rt, start + size);

	rs = zfs_btree_find(t, &rsearch, where);
	if (rs == NULL) {
		rs = zfs_btree_next(t, where, where);
	}

	return (rs);
}

/*
 * This is a helper function that can be used by the allocator to find a
 * suitable block to allocate. This will search the specified B-tree looking
 * for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
    uint64_t max_search)
{
	if (*cursor == 0)
		*cursor = rt->rt_start;
	zfs_btree_t *bt = &rt->rt_root;
	zfs_btree_index_t where;
	range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
	uint64_t first_found;
	int count_searched = 0;

	if (rs != NULL)
		first_found = rs_get_start(rs, rt);

	while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
	    max_search || count_searched < metaslab_min_search_count)) {
		uint64_t offset = rs_get_start(rs, rt);
		if (offset + size <= rs_get_end(rs, rt)) {
			*cursor = offset + size;
			return (offset);
		}
		rs = zfs_btree_next(bt, &where, &where);
		count_searched++;
	}

	*cursor = 0;
	return (-1ULL);
}

/*
 * ==========================================================================
 * Dynamic Fit (df) block allocator
 *
 * Search for a free chunk of at least this size, starting from the last
 * offset (for this alignment of block) looking for up to
 * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not
 * found within 16MB, then return a free chunk of exactly the requested size
 * (or larger).
 *
 * If it seems like searching from the last offset will be unproductive, skip
 * that and just return a free chunk of exactly the requested size (or larger).
 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This
 * mechanism is probably not very useful and may be removed in the future.
 *
 * The behavior when not searching can be changed to return the largest free
 * chunk, instead of a free chunk of exactly the requested size, by setting
 * metaslab_df_use_largest_segment.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket), but it does not guarantee that other allocation sizes
	 * will not be present in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_allocatable;
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * If we're running low on space, find a segment based on size,
	 * rather than iterating based on offset.
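	 * "Running low" means that no remaining free segment is at least
	 * metaslab_df_alloc_threshold bytes, or that less than
	 * metaslab_df_free_pct percent of the metaslab is free, which are
	 * exactly the checks made below.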
1630 */ 1631 if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || 1632 free_pct < metaslab_df_free_pct) { 1633 offset = -1; 1634 } else { 1635 offset = metaslab_block_picker(rt, 1636 cursor, size, metaslab_df_max_search); 1637 } 1638 1639 if (offset == -1) { 1640 range_seg_t *rs; 1641 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) 1642 metaslab_size_tree_full_load(msp->ms_allocatable); 1643 if (metaslab_df_use_largest_segment) { 1644 /* use largest free segment */ 1645 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); 1646 } else { 1647 zfs_btree_index_t where; 1648 /* use segment of this size, or next largest */ 1649 #ifdef _METASLAB_TRACING 1650 metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; 1651 if (size < (1 << mrap->mra_floor_shift)) { 1652 METASLABSTAT_BUMP( 1653 metaslabstat_df_find_under_floor); 1654 } 1655 #endif 1656 rs = metaslab_block_find(&msp->ms_allocatable_by_size, 1657 rt, msp->ms_start, size, &where); 1658 } 1659 if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, 1660 rt)) { 1661 offset = rs_get_start(rs, rt); 1662 *cursor = offset + size; 1663 } 1664 } 1665 1666 return (offset); 1667 } 1668 1669 static metaslab_ops_t metaslab_df_ops = { 1670 metaslab_df_alloc 1671 }; 1672 1673 /* 1674 * ========================================================================== 1675 * Cursor fit block allocator - 1676 * Select the largest region in the metaslab, set the cursor to the beginning 1677 * of the range and the cursor_end to the end of the range. As allocations 1678 * are made advance the cursor. Continue allocating from the cursor until 1679 * the range is exhausted and then find a new range. 1680 * ========================================================================== 1681 */ 1682 static uint64_t 1683 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1684 { 1685 range_tree_t *rt = msp->ms_allocatable; 1686 zfs_btree_t *t = &msp->ms_allocatable_by_size; 1687 uint64_t *cursor = &msp->ms_lbas[0]; 1688 uint64_t *cursor_end = &msp->ms_lbas[1]; 1689 uint64_t offset = 0; 1690 1691 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1692 1693 ASSERT3U(*cursor_end, >=, *cursor); 1694 1695 if ((*cursor + size) > *cursor_end) { 1696 range_seg_t *rs; 1697 1698 if (zfs_btree_numnodes(t) == 0) 1699 metaslab_size_tree_full_load(msp->ms_allocatable); 1700 rs = zfs_btree_last(t, NULL); 1701 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < 1702 size) 1703 return (-1ULL); 1704 1705 *cursor = rs_get_start(rs, rt); 1706 *cursor_end = rs_get_end(rs, rt); 1707 } 1708 1709 offset = *cursor; 1710 *cursor += size; 1711 1712 return (offset); 1713 } 1714 1715 static metaslab_ops_t metaslab_cf_ops = { 1716 metaslab_cf_alloc 1717 }; 1718 1719 /* 1720 * ========================================================================== 1721 * New dynamic fit allocator - 1722 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1723 * contiguous blocks. If no region is found then just use the largest segment 1724 * that remains. 1725 * ========================================================================== 1726 */ 1727 1728 /* 1729 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1730 * to request from the allocator. 
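 * With the default value of 4, for example, the allocator first looks for a
 * region with room for 16 (2^4) blocks of the requested size before falling
 * back to the largest remaining segment.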
1731 */ 1732 uint64_t metaslab_ndf_clump_shift = 4; 1733 1734 static uint64_t 1735 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1736 { 1737 zfs_btree_t *t = &msp->ms_allocatable->rt_root; 1738 range_tree_t *rt = msp->ms_allocatable; 1739 zfs_btree_index_t where; 1740 range_seg_t *rs; 1741 range_seg_max_t rsearch; 1742 uint64_t hbit = highbit64(size); 1743 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1744 uint64_t max_size = metaslab_largest_allocatable(msp); 1745 1746 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1747 1748 if (max_size < size) 1749 return (-1ULL); 1750 1751 rs_set_start(&rsearch, rt, *cursor); 1752 rs_set_end(&rsearch, rt, *cursor + size); 1753 1754 rs = zfs_btree_find(t, &rsearch, &where); 1755 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { 1756 t = &msp->ms_allocatable_by_size; 1757 1758 rs_set_start(&rsearch, rt, 0); 1759 rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + 1760 metaslab_ndf_clump_shift))); 1761 1762 rs = zfs_btree_find(t, &rsearch, &where); 1763 if (rs == NULL) 1764 rs = zfs_btree_next(t, &where, &where); 1765 ASSERT(rs != NULL); 1766 } 1767 1768 if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { 1769 *cursor = rs_get_start(rs, rt) + size; 1770 return (rs_get_start(rs, rt)); 1771 } 1772 return (-1ULL); 1773 } 1774 1775 static metaslab_ops_t metaslab_ndf_ops = { 1776 metaslab_ndf_alloc 1777 }; 1778 1779 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1780 1781 /* 1782 * ========================================================================== 1783 * Metaslabs 1784 * ========================================================================== 1785 */ 1786 1787 /* 1788 * Wait for any in-progress metaslab loads to complete. 1789 */ 1790 void 1791 metaslab_load_wait(metaslab_t *msp) 1792 { 1793 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1794 1795 while (msp->ms_loading) { 1796 ASSERT(!msp->ms_loaded); 1797 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1798 } 1799 } 1800 1801 /* 1802 * Wait for any in-progress flushing to complete. 1803 */ 1804 void 1805 metaslab_flush_wait(metaslab_t *msp) 1806 { 1807 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1808 1809 while (msp->ms_flushing) 1810 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); 1811 } 1812 1813 static unsigned int 1814 metaslab_idx_func(multilist_t *ml, void *arg) 1815 { 1816 metaslab_t *msp = arg; 1817 return (msp->ms_id % multilist_get_num_sublists(ml)); 1818 } 1819 1820 uint64_t 1821 metaslab_allocated_space(metaslab_t *msp) 1822 { 1823 return (msp->ms_allocated_space); 1824 } 1825 1826 /* 1827 * Verify that the space accounting on disk matches the in-core range_trees. 1828 */ 1829 static void 1830 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 1831 { 1832 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1833 uint64_t allocating = 0; 1834 uint64_t sm_free_space, msp_free_space; 1835 1836 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1837 ASSERT(!msp->ms_condensing); 1838 1839 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1840 return; 1841 1842 /* 1843 * We can only verify the metaslab space when we're called 1844 * from syncing context with a loaded metaslab that has an 1845 * allocated space map. Calling this in non-syncing context 1846 * does not provide a consistent view of the metaslab since 1847 * we're performing allocations in the future. 
1848 */ 1849 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 1850 !msp->ms_loaded) 1851 return; 1852 1853 /* 1854 * Even though the smp_alloc field can get negative, 1855 * when it comes to a metaslab's space map, that should 1856 * never be the case. 1857 */ 1858 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); 1859 1860 ASSERT3U(space_map_allocated(msp->ms_sm), >=, 1861 range_tree_space(msp->ms_unflushed_frees)); 1862 1863 ASSERT3U(metaslab_allocated_space(msp), ==, 1864 space_map_allocated(msp->ms_sm) + 1865 range_tree_space(msp->ms_unflushed_allocs) - 1866 range_tree_space(msp->ms_unflushed_frees)); 1867 1868 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); 1869 1870 /* 1871 * Account for future allocations since we would have 1872 * already deducted that space from the ms_allocatable. 1873 */ 1874 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 1875 allocating += 1876 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 1877 } 1878 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, 1879 msp->ms_allocating_total); 1880 1881 ASSERT3U(msp->ms_deferspace, ==, 1882 range_tree_space(msp->ms_defer[0]) + 1883 range_tree_space(msp->ms_defer[1])); 1884 1885 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + 1886 msp->ms_deferspace + range_tree_space(msp->ms_freed); 1887 1888 VERIFY3U(sm_free_space, ==, msp_free_space); 1889 } 1890 1891 static void 1892 metaslab_aux_histograms_clear(metaslab_t *msp) 1893 { 1894 /* 1895 * Auxiliary histograms are only cleared when resetting them, 1896 * which can only happen while the metaslab is loaded. 1897 */ 1898 ASSERT(msp->ms_loaded); 1899 1900 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1901 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1902 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); 1903 } 1904 1905 static void 1906 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, 1907 range_tree_t *rt) 1908 { 1909 /* 1910 * This is modeled after space_map_histogram_add(), so refer to that 1911 * function for implementation details. We want this to work like 1912 * the space map histogram, and not the range tree histogram, as we 1913 * are essentially constructing a delta that will be later subtracted 1914 * from the space map histogram. 1915 */ 1916 int idx = 0; 1917 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 1918 ASSERT3U(i, >=, idx + shift); 1919 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); 1920 1921 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { 1922 ASSERT3U(idx + shift, ==, i); 1923 idx++; 1924 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); 1925 } 1926 } 1927 } 1928 1929 /* 1930 * Called at every sync pass that the metaslab gets synced. 1931 * 1932 * The reason is that we want our auxiliary histograms to be updated 1933 * wherever the metaslab's space map histogram is updated. This way 1934 * we stay consistent on which parts of the metaslab space map's 1935 * histogram are currently not available for allocations (e.g because 1936 * they are in the defer, freed, and freeing trees). 1937 */ 1938 static void 1939 metaslab_aux_histograms_update(metaslab_t *msp) 1940 { 1941 space_map_t *sm = msp->ms_sm; 1942 ASSERT(sm != NULL); 1943 1944 /* 1945 * This is similar to the metaslab's space map histogram updates 1946 * that take place in metaslab_sync(). The only difference is that 1947 * we only care about segments that haven't made it into the 1948 * ms_allocatable tree yet. 
1949 */ 1950 if (msp->ms_loaded) { 1951 metaslab_aux_histograms_clear(msp); 1952 1953 metaslab_aux_histogram_add(msp->ms_synchist, 1954 sm->sm_shift, msp->ms_freed); 1955 1956 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1957 metaslab_aux_histogram_add(msp->ms_deferhist[t], 1958 sm->sm_shift, msp->ms_defer[t]); 1959 } 1960 } 1961 1962 metaslab_aux_histogram_add(msp->ms_synchist, 1963 sm->sm_shift, msp->ms_freeing); 1964 } 1965 1966 /* 1967 * Called every time we are done syncing (writing to) the metaslab, 1968 * i.e. at the end of each sync pass. 1969 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] 1970 */ 1971 static void 1972 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) 1973 { 1974 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1975 space_map_t *sm = msp->ms_sm; 1976 1977 if (sm == NULL) { 1978 /* 1979 * We came here from metaslab_init() when creating/opening a 1980 * pool, looking at a metaslab that hasn't had any allocations 1981 * yet. 1982 */ 1983 return; 1984 } 1985 1986 /* 1987 * This is similar to the actions that we take for the ms_freed 1988 * and ms_defer trees in metaslab_sync_done(). 1989 */ 1990 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; 1991 if (defer_allowed) { 1992 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], 1993 sizeof (msp->ms_synchist)); 1994 } else { 1995 bzero(msp->ms_deferhist[hist_index], 1996 sizeof (msp->ms_deferhist[hist_index])); 1997 } 1998 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1999 } 2000 2001 /* 2002 * Ensure that the metaslab's weight and fragmentation are consistent 2003 * with the contents of the histogram (either the range tree's histogram 2004 * or the space map's depending whether the metaslab is loaded). 2005 */ 2006 static void 2007 metaslab_verify_weight_and_frag(metaslab_t *msp) 2008 { 2009 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2010 2011 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 2012 return; 2013 2014 /* 2015 * We can end up here from vdev_remove_complete(), in which case we 2016 * cannot do these assertions because we hold spa config locks and 2017 * thus we are not allowed to read from the DMU. 2018 * 2019 * We check if the metaslab group has been removed and if that's 2020 * the case we return immediately as that would mean that we are 2021 * here from the aforementioned code path. 2022 */ 2023 if (msp->ms_group == NULL) 2024 return; 2025 2026 /* 2027 * Devices being removed always return a weight of 0 and leave 2028 * fragmentation and ms_max_size as is - there is nothing for 2029 * us to verify here. 2030 */ 2031 vdev_t *vd = msp->ms_group->mg_vd; 2032 if (vd->vdev_removing) 2033 return; 2034 2035 /* 2036 * If the metaslab is dirty it probably means that we've done 2037 * some allocations or frees that have changed our histograms 2038 * and thus the weight. 2039 */ 2040 for (int t = 0; t < TXG_SIZE; t++) { 2041 if (txg_list_member(&vd->vdev_ms_list, msp, t)) 2042 return; 2043 } 2044 2045 /* 2046 * This verification checks that our in-memory state is consistent 2047 * with what's on disk. If the pool is read-only then there aren't 2048 * any changes and we just have the initially-loaded state. 
2049 */ 2050 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) 2051 return; 2052 2053 /* some extra verification for in-core tree if you can */ 2054 if (msp->ms_loaded) { 2055 range_tree_stat_verify(msp->ms_allocatable); 2056 VERIFY(space_map_histogram_verify(msp->ms_sm, 2057 msp->ms_allocatable)); 2058 } 2059 2060 uint64_t weight = msp->ms_weight; 2061 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2062 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); 2063 uint64_t frag = msp->ms_fragmentation; 2064 uint64_t max_segsize = msp->ms_max_size; 2065 2066 msp->ms_weight = 0; 2067 msp->ms_fragmentation = 0; 2068 2069 /* 2070 * This function is used for verification purposes. Regardless of 2071 * whether metaslab_weight() thinks this metaslab should be active or 2072 * not, we want to ensure that the actual weight (and therefore the 2073 * value of ms_weight) would be the same if it was to be recalculated 2074 * at this point. 2075 */ 2076 msp->ms_weight = metaslab_weight(msp) | was_active; 2077 2078 VERIFY3U(max_segsize, ==, msp->ms_max_size); 2079 2080 /* 2081 * If the weight type changed then there is no point in doing 2082 * verification. Revert fields to their original values. 2083 */ 2084 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || 2085 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { 2086 msp->ms_fragmentation = frag; 2087 msp->ms_weight = weight; 2088 return; 2089 } 2090 2091 VERIFY3U(msp->ms_fragmentation, ==, frag); 2092 VERIFY3U(msp->ms_weight, ==, weight); 2093 } 2094 2095 /* 2096 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from 2097 * this class that was used longest ago, and attempt to unload it. We don't 2098 * want to spend too much time in this loop to prevent performance 2099 * degredation, and we expect that most of the time this operation will 2100 * succeed. Between that and the normal unloading processing during txg sync, 2101 * we expect this to keep the metaslab memory usage under control. 2102 */ 2103 static void 2104 metaslab_potentially_evict(metaslab_class_t *mc) 2105 { 2106 #ifdef _KERNEL 2107 uint64_t allmem = arc_all_memory(); 2108 extern kmem_cache_t *zfs_btree_leaf_cache; 2109 uint64_t inuse = kmem_cache_stat(zfs_btree_leaf_cache, "buf_inuse"); 2110 uint64_t size = kmem_cache_stat(zfs_btree_leaf_cache, "buf_size"); 2111 int tries = 0; 2112 for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && 2113 tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; 2114 tries++) { 2115 unsigned int idx = multilist_get_random_index( 2116 mc->mc_metaslab_txg_list); 2117 multilist_sublist_t *mls = 2118 multilist_sublist_lock(mc->mc_metaslab_txg_list, idx); 2119 metaslab_t *msp = multilist_sublist_head(mls); 2120 multilist_sublist_unlock(mls); 2121 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < 2122 inuse * size) { 2123 VERIFY3P(mls, ==, multilist_sublist_lock( 2124 mc->mc_metaslab_txg_list, idx)); 2125 ASSERT3U(idx, ==, 2126 metaslab_idx_func(mc->mc_metaslab_txg_list, msp)); 2127 2128 if (!multilist_link_active(&msp->ms_class_txg_node)) { 2129 multilist_sublist_unlock(mls); 2130 break; 2131 } 2132 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 2133 multilist_sublist_unlock(mls); 2134 /* 2135 * If the metaslab is currently loading there are two 2136 * cases. If it's the metaslab we're evicting, we 2137 * can't continue on or we'll panic when we attempt to 2138 * recursively lock the mutex. 
If it's another 2139 * metaslab that's loading, it can be safely skipped, 2140 * since we know it's very new and therefore not a 2141 * good eviction candidate. We check later once the 2142 * lock is held that the metaslab is fully loaded 2143 * before actually unloading it. 2144 */ 2145 if (msp->ms_loading) { 2146 msp = next_msp; 2147 inuse = kmem_cache_stat(zfs_btree_leaf_cache, 2148 "buf_inuse"); 2149 continue; 2150 } 2151 /* 2152 * We can't unload metaslabs with no spacemap because 2153 * they're not ready to be unloaded yet. We can't 2154 * unload metaslabs with outstanding allocations 2155 * because doing so could cause the metaslab's weight 2156 * to decrease while it's unloaded, which violates an 2157 * invariant that we use to prevent unnecessary 2158 * loading. We also don't unload metaslabs that are 2159 * currently active because they are high-weight 2160 * metaslabs that are likely to be used in the near 2161 * future. 2162 */ 2163 mutex_enter(&msp->ms_lock); 2164 if (msp->ms_allocator == -1 && msp->ms_sm != NULL && 2165 msp->ms_allocating_total == 0) { 2166 metaslab_unload(msp); 2167 } 2168 mutex_exit(&msp->ms_lock); 2169 msp = next_msp; 2170 inuse = kmem_cache_stat(zfs_btree_leaf_cache, 2171 "buf_inuse"); 2172 } 2173 } 2174 #endif 2175 } 2176 2177 static int 2178 metaslab_load_impl(metaslab_t *msp) 2179 { 2180 int error = 0; 2181 2182 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2183 ASSERT(msp->ms_loading); 2184 ASSERT(!msp->ms_condensing); 2185 2186 /* 2187 * We temporarily drop the lock to unblock other operations while we 2188 * are reading the space map. Therefore, metaslab_sync() and 2189 * metaslab_sync_done() can run at the same time as we do. 2190 * 2191 * If we are using the log space maps, metaslab_sync() can't write to 2192 * the metaslab's space map while we are loading as we only write to 2193 * it when we are flushing the metaslab, and that can't happen while 2194 * we are loading it. 2195 * 2196 * If we are not using log space maps though, metaslab_sync() can 2197 * append to the space map while we are loading. Therefore we load 2198 * only entries that existed when we started the load. Additionally, 2199 * metaslab_sync_done() has to wait for the load to complete because 2200 * there are potential races like metaslab_load() loading parts of the 2201 * space map that are currently being appended by metaslab_sync(). If 2202 * we didn't, the ms_allocatable would have entries that 2203 * metaslab_sync_done() would try to re-add later. 2204 * 2205 * That's why before dropping the lock we remember the synced length 2206 * of the metaslab and read up to that point of the space map, 2207 * ignoring entries appended by metaslab_sync() that happen after we 2208 * drop the lock. 2209 */ 2210 uint64_t length = msp->ms_synced_length; 2211 mutex_exit(&msp->ms_lock); 2212 2213 hrtime_t load_start = gethrtime(); 2214 metaslab_rt_arg_t *mrap; 2215 if (msp->ms_allocatable->rt_arg == NULL) { 2216 mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); 2217 } else { 2218 mrap = msp->ms_allocatable->rt_arg; 2219 msp->ms_allocatable->rt_ops = NULL; 2220 msp->ms_allocatable->rt_arg = NULL; 2221 } 2222 mrap->mra_bt = &msp->ms_allocatable_by_size; 2223 mrap->mra_floor_shift = metaslab_by_size_min_shift; 2224 2225 if (msp->ms_sm != NULL) { 2226 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, 2227 SM_FREE, length); 2228 2229 /* Now, populate the size-sorted tree. 
*/ 2230 metaslab_rt_create(msp->ms_allocatable, mrap); 2231 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; 2232 msp->ms_allocatable->rt_arg = mrap; 2233 2234 struct mssa_arg arg = {0}; 2235 arg.rt = msp->ms_allocatable; 2236 arg.mra = mrap; 2237 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, 2238 &arg); 2239 } else { 2240 /* 2241 * Add the size-sorted tree first, since we don't need to load 2242 * the metaslab from the spacemap. 2243 */ 2244 metaslab_rt_create(msp->ms_allocatable, mrap); 2245 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; 2246 msp->ms_allocatable->rt_arg = mrap; 2247 /* 2248 * The space map has not been allocated yet, so treat 2249 * all the space in the metaslab as free and add it to the 2250 * ms_allocatable tree. 2251 */ 2252 range_tree_add(msp->ms_allocatable, 2253 msp->ms_start, msp->ms_size); 2254 2255 if (msp->ms_freed != NULL) { 2256 /* 2257 * If the ms_sm doesn't exist, this means that this 2258 * metaslab hasn't gone through metaslab_sync() and 2259 * thus has never been dirtied. So we shouldn't 2260 * expect any unflushed allocs or frees from previous 2261 * TXGs. 2262 * 2263 * Note: ms_freed and all the other trees except for 2264 * the ms_allocatable, can be NULL at this point only 2265 * if this is a new metaslab of a vdev that just got 2266 * expanded. 2267 */ 2268 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 2269 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 2270 } 2271 } 2272 2273 /* 2274 * We need to grab the ms_sync_lock to prevent metaslab_sync() from 2275 * changing the ms_sm (or log_sm) and the metaslab's range trees 2276 * while we are about to use them and populate the ms_allocatable. 2277 * The ms_lock is insufficient for this because metaslab_sync() doesn't 2278 * hold the ms_lock while writing the ms_checkpointing tree to disk. 2279 */ 2280 mutex_enter(&msp->ms_sync_lock); 2281 mutex_enter(&msp->ms_lock); 2282 2283 ASSERT(!msp->ms_condensing); 2284 ASSERT(!msp->ms_flushing); 2285 2286 if (error != 0) { 2287 mutex_exit(&msp->ms_sync_lock); 2288 return (error); 2289 } 2290 2291 ASSERT3P(msp->ms_group, !=, NULL); 2292 msp->ms_loaded = B_TRUE; 2293 2294 /* 2295 * Apply all the unflushed changes to ms_allocatable right 2296 * away so any manipulations we do below have a clear view 2297 * of what is allocated and what is free. 2298 */ 2299 range_tree_walk(msp->ms_unflushed_allocs, 2300 range_tree_remove, msp->ms_allocatable); 2301 range_tree_walk(msp->ms_unflushed_frees, 2302 range_tree_add, msp->ms_allocatable); 2303 2304 msp->ms_loaded = B_TRUE; 2305 2306 ASSERT3P(msp->ms_group, !=, NULL); 2307 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2308 if (spa_syncing_log_sm(spa) != NULL) { 2309 ASSERT(spa_feature_is_enabled(spa, 2310 SPA_FEATURE_LOG_SPACEMAP)); 2311 2312 /* 2313 * If we use a log space map we add all the segments 2314 * that are in ms_unflushed_frees so they are available 2315 * for allocation. 2316 * 2317 * ms_allocatable needs to contain all free segments 2318 * that are ready for allocations (thus not segments 2319 * from ms_freeing, ms_freed, and the ms_defer trees). 2320 * But if we grab the lock in this code path at a sync 2321 * pass later that 1, then it also contains the 2322 * segments of ms_freed (they were added to it earlier 2323 * in this path through ms_unflushed_frees). So we 2324 * need to remove all the segments that exist in 2325 * ms_freed from ms_allocatable as they will be added 2326 * later in metaslab_sync_done(). 
2327 * 2328 * When there's no log space map, the ms_allocatable 2329 * correctly doesn't contain any segments that exist 2330 * in ms_freed [see ms_synced_length]. 2331 */ 2332 range_tree_walk(msp->ms_freed, 2333 range_tree_remove, msp->ms_allocatable); 2334 } 2335 2336 /* 2337 * If we are not using the log space map, ms_allocatable 2338 * contains the segments that exist in the ms_defer trees 2339 * [see ms_synced_length]. Thus we need to remove them 2340 * from ms_allocatable as they will be added again in 2341 * metaslab_sync_done(). 2342 * 2343 * If we are using the log space map, ms_allocatable still 2344 * contains the segments that exist in the ms_defer trees. 2345 * Not because it read them through the ms_sm though. But 2346 * because these segments are part of ms_unflushed_frees 2347 * whose segments we add to ms_allocatable earlier in this 2348 * code path. 2349 */ 2350 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2351 range_tree_walk(msp->ms_defer[t], 2352 range_tree_remove, msp->ms_allocatable); 2353 } 2354 2355 /* 2356 * Call metaslab_recalculate_weight_and_sort() now that the 2357 * metaslab is loaded so we get the metaslab's real weight. 2358 * 2359 * Unless this metaslab was created with older software and 2360 * has not yet been converted to use segment-based weight, we 2361 * expect the new weight to be better or equal to the weight 2362 * that the metaslab had while it was not loaded. This is 2363 * because the old weight does not take into account the 2364 * consolidation of adjacent segments between TXGs. [see 2365 * comment for ms_synchist and ms_deferhist[] for more info] 2366 */ 2367 uint64_t weight = msp->ms_weight; 2368 uint64_t max_size = msp->ms_max_size; 2369 metaslab_recalculate_weight_and_sort(msp); 2370 if (!WEIGHT_IS_SPACEBASED(weight)) 2371 ASSERT3U(weight, <=, msp->ms_weight); 2372 msp->ms_max_size = metaslab_largest_allocatable(msp); 2373 ASSERT3U(max_size, <=, msp->ms_max_size); 2374 hrtime_t load_end = gethrtime(); 2375 msp->ms_load_time = load_end; 2376 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 2377 zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " 2378 "ms_id %llu, smp_length %llu, " 2379 "unflushed_allocs %llu, unflushed_frees %llu, " 2380 "freed %llu, defer %llu + %llu, " 2381 "loading_time %lld ms, ms_max_size %llu, " 2382 "max size error %llu", 2383 spa_syncing_txg(spa), spa_name(spa), 2384 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 2385 space_map_length(msp->ms_sm), 2386 range_tree_space(msp->ms_unflushed_allocs), 2387 range_tree_space(msp->ms_unflushed_frees), 2388 range_tree_space(msp->ms_freed), 2389 range_tree_space(msp->ms_defer[0]), 2390 range_tree_space(msp->ms_defer[1]), 2391 (longlong_t)((load_end - load_start) / 1000000), 2392 msp->ms_max_size, msp->ms_max_size - max_size); 2393 } 2394 2395 metaslab_verify_space(msp, spa_syncing_txg(spa)); 2396 mutex_exit(&msp->ms_sync_lock); 2397 return (0); 2398 } 2399 2400 int 2401 metaslab_load(metaslab_t *msp) 2402 { 2403 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2404 2405 /* 2406 * There may be another thread loading the same metaslab, if that's 2407 * the case just wait until the other thread is done and return. 2408 */ 2409 metaslab_load_wait(msp); 2410 if (msp->ms_loaded) 2411 return (0); 2412 VERIFY(!msp->ms_loading); 2413 ASSERT(!msp->ms_condensing); 2414 2415 /* 2416 * We set the loading flag BEFORE potentially dropping the lock to 2417 * wait for an ongoing flush (see ms_flushing below). This way other 2418 * threads know that there is already a thread that is loading this 2419 * metaslab. 
2420 */ 2421 msp->ms_loading = B_TRUE; 2422 2423 /* 2424 * Wait for any in-progress flushing to finish as we drop the ms_lock 2425 * both here (during space_map_load()) and in metaslab_flush() (when 2426 * we flush our changes to the ms_sm). 2427 */ 2428 if (msp->ms_flushing) 2429 metaslab_flush_wait(msp); 2430 2431 /* 2432 * In the possibility that we were waiting for the metaslab to be 2433 * flushed (where we temporarily dropped the ms_lock), ensure that 2434 * no one else loaded the metaslab somehow. 2435 */ 2436 ASSERT(!msp->ms_loaded); 2437 2438 /* 2439 * If we're loading a metaslab in the normal class, consider evicting 2440 * another one to keep our memory usage under the limit defined by the 2441 * zfs_metaslab_mem_limit tunable. 2442 */ 2443 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == 2444 msp->ms_group->mg_class) { 2445 metaslab_potentially_evict(msp->ms_group->mg_class); 2446 } 2447 2448 int error = metaslab_load_impl(msp); 2449 2450 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2451 msp->ms_loading = B_FALSE; 2452 cv_broadcast(&msp->ms_load_cv); 2453 2454 return (error); 2455 } 2456 2457 void 2458 metaslab_unload(metaslab_t *msp) 2459 { 2460 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2461 2462 /* 2463 * This can happen if a metaslab is selected for eviction (in 2464 * metaslab_potentially_evict) and then unloaded during spa_sync (via 2465 * metaslab_class_evict_old). 2466 */ 2467 if (!msp->ms_loaded) 2468 return; 2469 2470 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 2471 msp->ms_loaded = B_FALSE; 2472 msp->ms_unload_time = gethrtime(); 2473 2474 msp->ms_activation_weight = 0; 2475 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 2476 2477 if (msp->ms_group != NULL) { 2478 metaslab_class_t *mc = msp->ms_group->mg_class; 2479 multilist_sublist_t *mls = 2480 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 2481 if (multilist_link_active(&msp->ms_class_txg_node)) 2482 multilist_sublist_remove(mls, msp); 2483 multilist_sublist_unlock(mls); 2484 } 2485 2486 /* 2487 * We explicitly recalculate the metaslab's weight based on its space 2488 * map (as it is now not loaded). We want unload metaslabs to always 2489 * have their weights calculated from the space map histograms, while 2490 * loaded ones have it calculated from their in-core range tree 2491 * [see metaslab_load()]. This way, the weight reflects the information 2492 * available in-core, whether it is loaded or not. 2493 * 2494 * If ms_group == NULL means that we came here from metaslab_fini(), 2495 * at which point it doesn't make sense for us to do the recalculation 2496 * and the sorting. 2497 */ 2498 if (msp->ms_group != NULL) 2499 metaslab_recalculate_weight_and_sort(msp); 2500 } 2501 2502 /* 2503 * We want to optimize the memory use of the per-metaslab range 2504 * trees. To do this, we store the segments in the range trees in 2505 * units of sectors, zero-indexing from the start of the metaslab. If 2506 * the vdev_ms_shift - the vdev_ashift is less than 32, we can store 2507 * the ranges using two uint32_ts, rather than two uint64_ts. 
2508 */ 2509 static range_seg_type_t 2510 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, 2511 uint64_t *start, uint64_t *shift) 2512 { 2513 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && 2514 !zfs_metaslab_force_large_segs) { 2515 *shift = vdev->vdev_ashift; 2516 *start = msp->ms_start; 2517 return (RANGE_SEG32); 2518 } else { 2519 *shift = 0; 2520 *start = 0; 2521 return (RANGE_SEG64); 2522 } 2523 } 2524 2525 void 2526 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) 2527 { 2528 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2529 metaslab_class_t *mc = msp->ms_group->mg_class; 2530 multilist_sublist_t *mls = 2531 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 2532 if (multilist_link_active(&msp->ms_class_txg_node)) 2533 multilist_sublist_remove(mls, msp); 2534 msp->ms_selected_txg = txg; 2535 msp->ms_selected_time = gethrtime(); 2536 multilist_sublist_insert_tail(mls, msp); 2537 multilist_sublist_unlock(mls); 2538 } 2539 2540 void 2541 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 2542 int64_t defer_delta, int64_t space_delta) 2543 { 2544 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 2545 2546 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 2547 ASSERT(vd->vdev_ms_count != 0); 2548 2549 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 2550 vdev_deflated_space(vd, space_delta)); 2551 } 2552 2553 int 2554 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, 2555 uint64_t txg, metaslab_t **msp) 2556 { 2557 vdev_t *vd = mg->mg_vd; 2558 spa_t *spa = vd->vdev_spa; 2559 objset_t *mos = spa->spa_meta_objset; 2560 metaslab_t *ms; 2561 int error; 2562 2563 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 2564 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 2565 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 2566 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 2567 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); 2568 multilist_link_init(&ms->ms_class_txg_node); 2569 2570 ms->ms_id = id; 2571 ms->ms_start = id << vd->vdev_ms_shift; 2572 ms->ms_size = 1ULL << vd->vdev_ms_shift; 2573 ms->ms_allocator = -1; 2574 ms->ms_new = B_TRUE; 2575 2576 /* 2577 * We only open space map objects that already exist. All others 2578 * will be opened when we finally allocate an object for it. 2579 * 2580 * Note: 2581 * When called from vdev_expand(), we can't call into the DMU as 2582 * we are holding the spa_config_lock as a writer and we would 2583 * deadlock [see relevant comment in vdev_metaslab_init()]. in 2584 * that case, the object parameter is zero though, so we won't 2585 * call into the DMU. 2586 */ 2587 if (object != 0) { 2588 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 2589 ms->ms_size, vd->vdev_ashift); 2590 2591 if (error != 0) { 2592 kmem_free(ms, sizeof (metaslab_t)); 2593 return (error); 2594 } 2595 2596 ASSERT(ms->ms_sm != NULL); 2597 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); 2598 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 2599 } 2600 2601 range_seg_type_t type; 2602 uint64_t shift, start; 2603 type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); 2604 2605 /* 2606 * We create the ms_allocatable here, but we don't create the 2607 * other range trees until metaslab_sync_done(). This serves 2608 * two purposes: it allows metaslab_sync_done() to detect the 2609 * addition of new space; and for debugging, it ensures that 2610 * we'd data fault on any attempt to use this metaslab before 2611 * it's ready. 
2612 */ 2613 ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); 2614 2615 ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); 2616 2617 metaslab_group_add(mg, ms); 2618 metaslab_set_fragmentation(ms); 2619 2620 /* 2621 * If we're opening an existing pool (txg == 0) or creating 2622 * a new one (txg == TXG_INITIAL), all space is available now. 2623 * If we're adding space to an existing pool, the new space 2624 * does not become available until after this txg has synced. 2625 * The metaslab's weight will also be initialized when we sync 2626 * out this txg. This ensures that we don't attempt to allocate 2627 * from it before we have initialized it completely. 2628 */ 2629 if (txg <= TXG_INITIAL) { 2630 metaslab_sync_done(ms, 0); 2631 metaslab_space_update(vd, mg->mg_class, 2632 metaslab_allocated_space(ms), 0, 0); 2633 } 2634 2635 if (txg != 0) { 2636 vdev_dirty(vd, 0, NULL, txg); 2637 vdev_dirty(vd, VDD_METASLAB, ms, txg); 2638 } 2639 2640 *msp = ms; 2641 2642 return (0); 2643 } 2644 2645 static void 2646 metaslab_fini_flush_data(metaslab_t *msp) 2647 { 2648 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2649 2650 if (metaslab_unflushed_txg(msp) == 0) { 2651 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), 2652 ==, NULL); 2653 return; 2654 } 2655 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 2656 2657 mutex_enter(&spa->spa_flushed_ms_lock); 2658 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 2659 mutex_exit(&spa->spa_flushed_ms_lock); 2660 2661 spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2662 spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2663 } 2664 2665 uint64_t 2666 metaslab_unflushed_changes_memused(metaslab_t *ms) 2667 { 2668 return ((range_tree_numsegs(ms->ms_unflushed_allocs) + 2669 range_tree_numsegs(ms->ms_unflushed_frees)) * 2670 ms->ms_unflushed_allocs->rt_root.bt_elem_size); 2671 } 2672 2673 void 2674 metaslab_fini(metaslab_t *msp) 2675 { 2676 metaslab_group_t *mg = msp->ms_group; 2677 vdev_t *vd = mg->mg_vd; 2678 spa_t *spa = vd->vdev_spa; 2679 2680 metaslab_fini_flush_data(msp); 2681 2682 metaslab_group_remove(mg, msp); 2683 2684 mutex_enter(&msp->ms_lock); 2685 VERIFY(msp->ms_group == NULL); 2686 metaslab_space_update(vd, mg->mg_class, 2687 -metaslab_allocated_space(msp), 0, -msp->ms_size); 2688 2689 space_map_close(msp->ms_sm); 2690 msp->ms_sm = NULL; 2691 2692 metaslab_unload(msp); 2693 range_tree_destroy(msp->ms_allocatable); 2694 range_tree_destroy(msp->ms_freeing); 2695 range_tree_destroy(msp->ms_freed); 2696 2697 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 2698 metaslab_unflushed_changes_memused(msp)); 2699 spa->spa_unflushed_stats.sus_memused -= 2700 metaslab_unflushed_changes_memused(msp); 2701 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 2702 range_tree_destroy(msp->ms_unflushed_allocs); 2703 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 2704 range_tree_destroy(msp->ms_unflushed_frees); 2705 2706 for (int t = 0; t < TXG_SIZE; t++) { 2707 range_tree_destroy(msp->ms_allocating[t]); 2708 } 2709 2710 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2711 range_tree_destroy(msp->ms_defer[t]); 2712 } 2713 ASSERT0(msp->ms_deferspace); 2714 2715 range_tree_destroy(msp->ms_checkpointing); 2716 2717 for (int t = 0; t < TXG_SIZE; t++) 2718 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 2719 2720 range_tree_vacate(msp->ms_trim, NULL, NULL); 2721 range_tree_destroy(msp->ms_trim); 2722 2723 mutex_exit(&msp->ms_lock); 2724 
cv_destroy(&msp->ms_load_cv); 2725 cv_destroy(&msp->ms_flush_cv); 2726 mutex_destroy(&msp->ms_lock); 2727 mutex_destroy(&msp->ms_sync_lock); 2728 ASSERT3U(msp->ms_allocator, ==, -1); 2729 2730 kmem_free(msp, sizeof (metaslab_t)); 2731 } 2732 2733 #define FRAGMENTATION_TABLE_SIZE 17 2734 2735 /* 2736 * This table defines a segment size based fragmentation metric that will 2737 * allow each metaslab to derive its own fragmentation value. This is done 2738 * by calculating the space in each bucket of the spacemap histogram and 2739 * multiplying that by the fragmentation metric in this table. Doing 2740 * this for all buckets and dividing it by the total amount of free 2741 * space in this metaslab (i.e. the total free space in all buckets) gives 2742 * us the fragmentation metric. This means that a high fragmentation metric 2743 * equates to most of the free space being comprised of small segments. 2744 * Conversely, if the metric is low, then most of the free space is in 2745 * large segments. A 10% change in fragmentation equates to approximately 2746 * double the number of segments. 2747 * 2748 * This table defines 0% fragmented space using 16MB segments. Testing has 2749 * shown that segments that are greater than or equal to 16MB do not suffer 2750 * from drastic performance problems. Using this value, we derive the rest 2751 * of the table. Since the fragmentation value is never stored on disk, it 2752 * is possible to change these calculations in the future. 2753 */ 2754 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 2755 100, /* 512B */ 2756 100, /* 1K */ 2757 98, /* 2K */ 2758 95, /* 4K */ 2759 90, /* 8K */ 2760 80, /* 16K */ 2761 70, /* 32K */ 2762 60, /* 64K */ 2763 50, /* 128K */ 2764 40, /* 256K */ 2765 30, /* 512K */ 2766 20, /* 1M */ 2767 15, /* 2M */ 2768 10, /* 4M */ 2769 5, /* 8M */ 2770 0 /* 16M */ 2771 }; 2772 2773 /* 2774 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 2775 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 2776 * been upgraded and does not support this metric. Otherwise, the return 2777 * value should be in the range [0, 100]. 2778 */ 2779 static void 2780 metaslab_set_fragmentation(metaslab_t *msp) 2781 { 2782 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2783 uint64_t fragmentation = 0; 2784 uint64_t total = 0; 2785 boolean_t feature_enabled = spa_feature_is_enabled(spa, 2786 SPA_FEATURE_SPACEMAP_HISTOGRAM); 2787 2788 if (!feature_enabled) { 2789 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2790 return; 2791 } 2792 2793 /* 2794 * A null space map means that the entire metaslab is free 2795 * and thus is not fragmented. 2796 */ 2797 if (msp->ms_sm == NULL) { 2798 msp->ms_fragmentation = 0; 2799 return; 2800 } 2801 2802 /* 2803 * If this metaslab's space map has not been upgraded, flag it 2804 * so that we upgrade next time we encounter it. 2805 */ 2806 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 2807 uint64_t txg = spa_syncing_txg(spa); 2808 vdev_t *vd = msp->ms_group->mg_vd; 2809 2810 /* 2811 * If we've reached the final dirty txg, then we must 2812 * be shutting down the pool. We don't want to dirty 2813 * any data past this point so skip setting the condense 2814 * flag. We can retry this action the next time the pool 2815 * is imported. 
2816 */ 2817 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2818 msp->ms_condense_wanted = B_TRUE; 2819 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2820 zfs_dbgmsg("txg %llu, requesting force condense: " 2821 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 2822 vd->vdev_id); 2823 } 2824 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2825 return; 2826 } 2827 2828 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2829 uint64_t space = 0; 2830 uint8_t shift = msp->ms_sm->sm_shift; 2831 2832 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2833 FRAGMENTATION_TABLE_SIZE - 1); 2834 2835 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2836 continue; 2837 2838 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2839 total += space; 2840 2841 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2842 fragmentation += space * zfs_frag_table[idx]; 2843 } 2844 2845 if (total > 0) 2846 fragmentation /= total; 2847 ASSERT3U(fragmentation, <=, 100); 2848 2849 msp->ms_fragmentation = fragmentation; 2850 } 2851 2852 /* 2853 * Compute a weight -- a selection preference value -- for the given metaslab. 2854 * This is based on the amount of free space, the level of fragmentation, 2855 * the LBA range, and whether the metaslab is loaded. 2856 */ 2857 static uint64_t 2858 metaslab_space_weight(metaslab_t *msp) 2859 { 2860 metaslab_group_t *mg = msp->ms_group; 2861 vdev_t *vd = mg->mg_vd; 2862 uint64_t weight, space; 2863 2864 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2865 2866 /* 2867 * The baseline weight is the metaslab's free space. 2868 */ 2869 space = msp->ms_size - metaslab_allocated_space(msp); 2870 2871 if (metaslab_fragmentation_factor_enabled && 2872 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 2873 /* 2874 * Use the fragmentation information to inversely scale 2875 * down the baseline weight. We need to ensure that we 2876 * don't exclude this metaslab completely when it's 100% 2877 * fragmented. To avoid this we reduce the fragmented value 2878 * by 1. 2879 */ 2880 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 2881 2882 /* 2883 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 2884 * this metaslab again. The fragmentation metric may have 2885 * decreased the space to something smaller than 2886 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 2887 * so that we can consume any remaining space. 2888 */ 2889 if (space > 0 && space < SPA_MINBLOCKSIZE) 2890 space = SPA_MINBLOCKSIZE; 2891 } 2892 weight = space; 2893 2894 /* 2895 * Modern disks have uniform bit density and constant angular velocity. 2896 * Therefore, the outer recording zones are faster (higher bandwidth) 2897 * than the inner zones by the ratio of outer to inner track diameter, 2898 * which is typically around 2:1. We account for this by assigning 2899 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 2900 * In effect, this means that we'll select the metaslab with the most 2901 * free bandwidth rather than simply the one with the most free space. 2902 */ 2903 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 2904 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 2905 ASSERT(weight >= space && weight <= 2 * space); 2906 } 2907 2908 /* 2909 * If this metaslab is one we're actively using, adjust its 2910 * weight to make it preferable to any inactive metaslab so 2911 * we'll polish it off. If the fragmentation on this metaslab 2912 * has exceed our threshold, then don't mark it active. 
2913 */ 2914 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 2915 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 2916 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 2917 } 2918 2919 WEIGHT_SET_SPACEBASED(weight); 2920 return (weight); 2921 } 2922 2923 /* 2924 * Return the weight of the specified metaslab, according to the segment-based 2925 * weighting algorithm. The metaslab must be loaded. This function can 2926 * be called within a sync pass since it relies only on the metaslab's 2927 * range tree which is always accurate when the metaslab is loaded. 2928 */ 2929 static uint64_t 2930 metaslab_weight_from_range_tree(metaslab_t *msp) 2931 { 2932 uint64_t weight = 0; 2933 uint32_t segments = 0; 2934 2935 ASSERT(msp->ms_loaded); 2936 2937 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 2938 i--) { 2939 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 2940 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2941 2942 segments <<= 1; 2943 segments += msp->ms_allocatable->rt_histogram[i]; 2944 2945 /* 2946 * The range tree provides more precision than the space map 2947 * and must be downgraded so that all values fit within the 2948 * space map's histogram. This allows us to compare loaded 2949 * vs. unloaded metaslabs to determine which metaslab is 2950 * considered "best". 2951 */ 2952 if (i > max_idx) 2953 continue; 2954 2955 if (segments != 0) { 2956 WEIGHT_SET_COUNT(weight, segments); 2957 WEIGHT_SET_INDEX(weight, i); 2958 WEIGHT_SET_ACTIVE(weight, 0); 2959 break; 2960 } 2961 } 2962 return (weight); 2963 } 2964 2965 /* 2966 * Calculate the weight based on the on-disk histogram. Should be applied 2967 * only to unloaded metaslabs (i.e no incoming allocations) in-order to 2968 * give results consistent with the on-disk state 2969 */ 2970 static uint64_t 2971 metaslab_weight_from_spacemap(metaslab_t *msp) 2972 { 2973 space_map_t *sm = msp->ms_sm; 2974 ASSERT(!msp->ms_loaded); 2975 ASSERT(sm != NULL); 2976 ASSERT3U(space_map_object(sm), !=, 0); 2977 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2978 2979 /* 2980 * Create a joint histogram from all the segments that have made 2981 * it to the metaslab's space map histogram, that are not yet 2982 * available for allocation because they are still in the freeing 2983 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 2984 * these segments from the space map's histogram to get a more 2985 * accurate weight. 2986 */ 2987 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 2988 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 2989 deferspace_histogram[i] += msp->ms_synchist[i]; 2990 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2991 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2992 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 2993 } 2994 } 2995 2996 uint64_t weight = 0; 2997 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 2998 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 2999 deferspace_histogram[i]); 3000 uint64_t count = 3001 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 3002 if (count != 0) { 3003 WEIGHT_SET_COUNT(weight, count); 3004 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 3005 WEIGHT_SET_ACTIVE(weight, 0); 3006 break; 3007 } 3008 } 3009 return (weight); 3010 } 3011 3012 /* 3013 * Compute a segment-based weight for the specified metaslab. The weight 3014 * is determined by highest bucket in the histogram. The information 3015 * for the highest bucket is encoded into the weight value. 
3016 */ 3017 static uint64_t 3018 metaslab_segment_weight(metaslab_t *msp) 3019 { 3020 metaslab_group_t *mg = msp->ms_group; 3021 uint64_t weight = 0; 3022 uint8_t shift = mg->mg_vd->vdev_ashift; 3023 3024 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3025 3026 /* 3027 * The metaslab is completely free. 3028 */ 3029 if (metaslab_allocated_space(msp) == 0) { 3030 int idx = highbit64(msp->ms_size) - 1; 3031 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 3032 3033 if (idx < max_idx) { 3034 WEIGHT_SET_COUNT(weight, 1ULL); 3035 WEIGHT_SET_INDEX(weight, idx); 3036 } else { 3037 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 3038 WEIGHT_SET_INDEX(weight, max_idx); 3039 } 3040 WEIGHT_SET_ACTIVE(weight, 0); 3041 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 3042 return (weight); 3043 } 3044 3045 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 3046 3047 /* 3048 * If the metaslab is fully allocated then just make the weight 0. 3049 */ 3050 if (metaslab_allocated_space(msp) == msp->ms_size) 3051 return (0); 3052 /* 3053 * If the metaslab is already loaded, then use the range tree to 3054 * determine the weight. Otherwise, we rely on the space map information 3055 * to generate the weight. 3056 */ 3057 if (msp->ms_loaded) { 3058 weight = metaslab_weight_from_range_tree(msp); 3059 } else { 3060 weight = metaslab_weight_from_spacemap(msp); 3061 } 3062 3063 /* 3064 * If the metaslab was active the last time we calculated its weight 3065 * then keep it active. We want to consume the entire region that 3066 * is associated with this weight. 3067 */ 3068 if (msp->ms_activation_weight != 0 && weight != 0) 3069 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 3070 return (weight); 3071 } 3072 3073 /* 3074 * Determine if we should attempt to allocate from this metaslab. If the 3075 * metaslab is loaded, then we can determine if the desired allocation 3076 * can be satisfied by looking at the size of the maximum free segment 3077 * on that metaslab. Otherwise, we make our decision based on the metaslab's 3078 * weight. For segment-based weighting we can determine the maximum 3079 * allocation based on the index encoded in its value. For space-based 3080 * weights we rely on the entire weight (excluding the weight-type bit). 3081 */ 3082 boolean_t 3083 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) 3084 { 3085 /* 3086 * If the metaslab is loaded, ms_max_size is definitive and we can use 3087 * the fast check. If it's not, the ms_max_size is a lower bound (once 3088 * set), and we should use the fast check as long as we're not in 3089 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec 3090 * seconds since the metaslab was unloaded. 3091 */ 3092 if (msp->ms_loaded || 3093 (msp->ms_max_size != 0 && !try_hard && gethrtime() < 3094 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) 3095 return (msp->ms_max_size >= asize); 3096 3097 boolean_t should_allocate; 3098 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3099 /* 3100 * The metaslab segment weight indicates segments in the 3101 * range [2^i, 2^(i+1)), where i is the index in the weight. 3102 * Since the asize might be in the middle of the range, we 3103 * should attempt the allocation if asize < 2^(i+1). 
3104 */ 3105 should_allocate = (asize < 3106 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 3107 } else { 3108 should_allocate = (asize <= 3109 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 3110 } 3111 3112 return (should_allocate); 3113 } 3114 3115 static uint64_t 3116 metaslab_weight(metaslab_t *msp) 3117 { 3118 vdev_t *vd = msp->ms_group->mg_vd; 3119 spa_t *spa = vd->vdev_spa; 3120 uint64_t weight; 3121 3122 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3123 3124 metaslab_set_fragmentation(msp); 3125 3126 /* 3127 * Update the maximum size. If the metaslab is loaded, this will 3128 * ensure that we get an accurate maximum size if newly freed space 3129 * has been added back into the free tree. If the metaslab is 3130 * unloaded, we check if there's a larger free segment in the 3131 * unflushed frees. This is a lower bound on the largest allocatable 3132 * segment size. Coalescing of adjacent entries may reveal larger 3133 * allocatable segments, but we aren't aware of those until loading 3134 * the space map into a range tree. 3135 */ 3136 if (msp->ms_loaded) { 3137 msp->ms_max_size = metaslab_largest_allocatable(msp); 3138 } else { 3139 msp->ms_max_size = MAX(msp->ms_max_size, 3140 metaslab_largest_unflushed_free(msp)); 3141 } 3142 3143 /* 3144 * Segment-based weighting requires space map histogram support. 3145 */ 3146 if (zfs_metaslab_segment_weight_enabled && 3147 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 3148 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 3149 sizeof (space_map_phys_t))) { 3150 weight = metaslab_segment_weight(msp); 3151 } else { 3152 weight = metaslab_space_weight(msp); 3153 } 3154 return (weight); 3155 } 3156 3157 void 3158 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 3159 { 3160 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3161 3162 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 3163 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 3164 metaslab_group_sort(msp->ms_group, msp, 3165 metaslab_weight(msp) | was_active); 3166 } 3167 3168 static int 3169 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 3170 int allocator, uint64_t activation_weight) 3171 { 3172 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3173 3174 /* 3175 * If we're activating for the claim code, we don't want to actually 3176 * set the metaslab up for a specific allocator. 3177 */ 3178 if (activation_weight == METASLAB_WEIGHT_CLAIM) { 3179 ASSERT0(msp->ms_activation_weight); 3180 msp->ms_activation_weight = msp->ms_weight; 3181 metaslab_group_sort(mg, msp, msp->ms_weight | 3182 activation_weight); 3183 return (0); 3184 } 3185 3186 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 3187 mg->mg_primaries : mg->mg_secondaries); 3188 3189 mutex_enter(&mg->mg_lock); 3190 if (arr[allocator] != NULL) { 3191 mutex_exit(&mg->mg_lock); 3192 return (EEXIST); 3193 } 3194 3195 arr[allocator] = msp; 3196 ASSERT3S(msp->ms_allocator, ==, -1); 3197 msp->ms_allocator = allocator; 3198 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 3199 3200 ASSERT0(msp->ms_activation_weight); 3201 msp->ms_activation_weight = msp->ms_weight; 3202 metaslab_group_sort_impl(mg, msp, 3203 msp->ms_weight | activation_weight); 3204 3205 mutex_exit(&mg->mg_lock); 3206 3207 return (0); 3208 } 3209 3210 static int 3211 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 3212 { 3213 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3214 3215 /* 3216 * The current metaslab is already activated for us so there 3217 * is nothing to do. 
Already activated though, doesn't mean 3218 * that this metaslab is activated for our allocator nor our 3219 * requested activation weight. The metaslab could have started 3220 * as an active one for our allocator but changed allocators 3221 * while we were waiting to grab its ms_lock or we stole it 3222 * [see find_valid_metaslab()]. This means that there is a 3223 * possibility of passivating a metaslab of another allocator 3224 * or from a different activation mask, from this thread. 3225 */ 3226 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 3227 ASSERT(msp->ms_loaded); 3228 return (0); 3229 } 3230 3231 int error = metaslab_load(msp); 3232 if (error != 0) { 3233 metaslab_group_sort(msp->ms_group, msp, 0); 3234 return (error); 3235 } 3236 3237 /* 3238 * When entering metaslab_load() we may have dropped the 3239 * ms_lock because we were loading this metaslab, or we 3240 * were waiting for another thread to load it for us. In 3241 * that scenario, we recheck the weight of the metaslab 3242 * to see if it was activated by another thread. 3243 * 3244 * If the metaslab was activated for another allocator or 3245 * it was activated with a different activation weight (e.g. 3246 * we wanted to make it a primary but it was activated as 3247 * secondary) we return error (EBUSY). 3248 * 3249 * If the metaslab was activated for the same allocator 3250 * and requested activation mask, skip activating it. 3251 */ 3252 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 3253 if (msp->ms_allocator != allocator) 3254 return (EBUSY); 3255 3256 if ((msp->ms_weight & activation_weight) == 0) 3257 return (EBUSY); 3258 3259 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), 3260 msp->ms_primary); 3261 return (0); 3262 } 3263 3264 /* 3265 * If the metaslab has literally 0 space, it will have weight 0. In 3266 * that case, don't bother activating it. This can happen if the 3267 * metaslab had space during find_valid_metaslab, but another thread 3268 * loaded it and used all that space while we were waiting to grab the 3269 * lock. 
3270 */ 3271 if (msp->ms_weight == 0) { 3272 ASSERT0(range_tree_space(msp->ms_allocatable)); 3273 return (SET_ERROR(ENOSPC)); 3274 } 3275 3276 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 3277 allocator, activation_weight)) != 0) { 3278 return (error); 3279 } 3280 3281 ASSERT(msp->ms_loaded); 3282 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 3283 3284 return (0); 3285 } 3286 3287 static void 3288 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 3289 uint64_t weight) 3290 { 3291 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3292 ASSERT(msp->ms_loaded); 3293 3294 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 3295 metaslab_group_sort(mg, msp, weight); 3296 return; 3297 } 3298 3299 mutex_enter(&mg->mg_lock); 3300 ASSERT3P(msp->ms_group, ==, mg); 3301 ASSERT3S(0, <=, msp->ms_allocator); 3302 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 3303 3304 if (msp->ms_primary) { 3305 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 3306 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 3307 mg->mg_primaries[msp->ms_allocator] = NULL; 3308 } else { 3309 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 3310 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 3311 mg->mg_secondaries[msp->ms_allocator] = NULL; 3312 } 3313 msp->ms_allocator = -1; 3314 metaslab_group_sort_impl(mg, msp, weight); 3315 mutex_exit(&mg->mg_lock); 3316 } 3317 3318 static void 3319 metaslab_passivate(metaslab_t *msp, uint64_t weight) 3320 { 3321 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 3322 3323 /* 3324 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 3325 * this metaslab again. In that case, it had better be empty, 3326 * or we would be leaving space on the table. 3327 */ 3328 ASSERT(size >= SPA_MINBLOCKSIZE || 3329 range_tree_is_empty(msp->ms_allocatable)); 3330 ASSERT0(weight & METASLAB_ACTIVE_MASK); 3331 3332 ASSERT(msp->ms_activation_weight != 0); 3333 msp->ms_activation_weight = 0; 3334 metaslab_passivate_allocator(msp->ms_group, msp, weight); 3335 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); 3336 } 3337 3338 /* 3339 * Segment-based metaslabs are activated once and remain active until 3340 * we either fail an allocation attempt (similar to space-based metaslabs) 3341 * or have exhausted the free space in zfs_metaslab_switch_threshold 3342 * buckets since the metaslab was activated. This function checks to see 3343 * if we've exhaused the zfs_metaslab_switch_threshold buckets in the 3344 * metaslab and passivates it proactively. This will allow us to select a 3345 * metaslabs with larger contiguous region if any remaining within this 3346 * metaslab group. If we're in sync pass > 1, then we continue using this 3347 * metaslab so that we don't dirty more block and cause more sync passes. 3348 */ 3349 void 3350 metaslab_segment_may_passivate(metaslab_t *msp) 3351 { 3352 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3353 3354 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 3355 return; 3356 3357 /* 3358 * Since we are in the middle of a sync pass, the most accurate 3359 * information that is accessible to us is the in-core range tree 3360 * histogram; calculate the new weight based on that information. 
3361 */ 3362 uint64_t weight = metaslab_weight_from_range_tree(msp); 3363 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 3364 int current_idx = WEIGHT_GET_INDEX(weight); 3365 3366 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 3367 metaslab_passivate(msp, weight); 3368 } 3369 3370 static void 3371 metaslab_preload(void *arg) 3372 { 3373 metaslab_t *msp = arg; 3374 metaslab_class_t *mc = msp->ms_group->mg_class; 3375 spa_t *spa = mc->mc_spa; 3376 3377 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 3378 3379 mutex_enter(&msp->ms_lock); 3380 (void) metaslab_load(msp); 3381 metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); 3382 mutex_exit(&msp->ms_lock); 3383 } 3384 3385 static void 3386 metaslab_group_preload(metaslab_group_t *mg) 3387 { 3388 spa_t *spa = mg->mg_vd->vdev_spa; 3389 metaslab_t *msp; 3390 avl_tree_t *t = &mg->mg_metaslab_tree; 3391 int m = 0; 3392 3393 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 3394 taskq_wait(mg->mg_taskq); 3395 return; 3396 } 3397 3398 mutex_enter(&mg->mg_lock); 3399 3400 /* 3401 * Load the next potential metaslabs 3402 */ 3403 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 3404 ASSERT3P(msp->ms_group, ==, mg); 3405 3406 /* 3407 * We preload only the maximum number of metaslabs specified 3408 * by metaslab_preload_limit. If a metaslab is being forced 3409 * to condense then we preload it too. This will ensure 3410 * that force condensing happens in the next txg. 3411 */ 3412 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 3413 continue; 3414 } 3415 3416 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 3417 msp, TQ_SLEEP) != TASKQID_INVALID); 3418 } 3419 mutex_exit(&mg->mg_lock); 3420 } 3421 3422 /* 3423 * Determine if the space map's on-disk footprint is past our tolerance for 3424 * inefficiency. We would like to use the following criteria to make our 3425 * decision: 3426 * 3427 * 1. Do not condense if the size of the space map object would dramatically 3428 * increase as a result of writing out the free space range tree. 3429 * 3430 * 2. Condense if the on on-disk space map representation is at least 3431 * zfs_condense_pct/100 times the size of the optimal representation 3432 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). 3433 * 3434 * 3. Do not condense if the on-disk size of the space map does not actually 3435 * decrease. 3436 * 3437 * Unfortunately, we cannot compute the on-disk size of the space map in this 3438 * context because we cannot accurately compute the effects of compression, etc. 3439 * Instead, we apply the heuristic described in the block comment for 3440 * zfs_metaslab_condense_block_threshold - we only condense if the space used 3441 * is greater than a threshold number of blocks. 3442 */ 3443 static boolean_t 3444 metaslab_should_condense(metaslab_t *msp) 3445 { 3446 space_map_t *sm = msp->ms_sm; 3447 vdev_t *vd = msp->ms_group->mg_vd; 3448 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 3449 3450 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3451 ASSERT(msp->ms_loaded); 3452 ASSERT(sm != NULL); 3453 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); 3454 3455 /* 3456 * We always condense metaslabs that are empty and metaslabs for 3457 * which a condense request has been made. 
*/
3459 if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
3460 msp->ms_condense_wanted)
3461 return (B_TRUE);
3462
3463 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3464 uint64_t object_size = space_map_length(sm);
3465 uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3466 msp->ms_allocatable, SM_NO_VDEVID);
3467
3468 return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
3469 object_size > zfs_metaslab_condense_block_threshold * record_size);
3470 }
3471
3472 /*
3473 * Condense the on-disk space map representation to its minimized form.
3474 * The minimized form consists of a small number of allocations followed
3475 * by the entries of the free range tree (ms_allocatable). The condensed
3476 * spacemap contains all the entries of previous TXGs (including those in
3477 * the pool-wide log spacemaps; thus this is effectively a superset of
3478 * metaslab_flush()), but this TXG's entries still need to be written.
3479 */
3480 static void
3481 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
3482 {
3483 range_tree_t *condense_tree;
3484 space_map_t *sm = msp->ms_sm;
3485 uint64_t txg = dmu_tx_get_txg(tx);
3486 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3487
3488 ASSERT(MUTEX_HELD(&msp->ms_lock));
3489 ASSERT(msp->ms_loaded);
3490 ASSERT(msp->ms_sm != NULL);
3491
3492 /*
3493 * In order to condense the space map, we need to change it so it
3494 * only describes which segments are currently allocated and free.
3495 *
3496 * All the current free space resides in the ms_allocatable, all
3497 * the ms_defer trees, and all the ms_allocating trees. We ignore
3498 * ms_freed because it is empty, as we are in sync pass 1. We
3499 * ignore ms_freeing because these changes are not yet reflected
3500 * in the spacemap (they will be written later this txg).
3501 *
3502 * So to truncate the space map to represent all the entries of
3503 * previous TXGs we do the following:
3504 *
3505 * 1] We create a range tree (condense tree) that is 100% empty.
3506 * 2] We add to it all segments found in the ms_defer trees
3507 * as those segments are marked as free in the original space
3508 * map. We do the same with the ms_allocating trees for the same
3509 * reason. Adding these segments should be a relatively
3510 * inexpensive operation since we expect these trees to have a
3511 * small number of nodes.
3512 * 3] We vacate any unflushed allocs, since they are not frees we
3513 * need to add to the condense tree. Then we vacate any
3514 * unflushed frees as they should already be part of ms_allocatable.
3515 * 4] At this point, we would ideally like to add all segments
3516 * in the ms_allocatable tree to the condense tree. This way
3517 * we would write all the entries of the condense tree as the
3518 * condensed space map, which would only contain freed
3519 * segments with everything else assumed to be allocated.
3520 *
3521 * Doing so can be prohibitively expensive as ms_allocatable can
3522 * be large, and therefore computationally expensive to add to
3523 * the condense_tree. Instead we first sync out an entry marking
3524 * everything as allocated, then the condense_tree and then the
3525 * ms_allocatable, in the condensed space map. While this is not
3526 * optimal, it is typically close to optimal and more importantly
3527 * much cheaper to compute.
3528 * 3529 * 5] Finally, as both of the unflushed trees were written to our 3530 * new and condensed metaslab space map, we basically flushed 3531 * all the unflushed changes to disk, thus we call 3532 * metaslab_flush_update(). 3533 */ 3534 ASSERT3U(spa_sync_pass(spa), ==, 1); 3535 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ 3536 3537 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 3538 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 3539 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 3540 spa->spa_name, space_map_length(msp->ms_sm), 3541 range_tree_numsegs(msp->ms_allocatable), 3542 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 3543 3544 msp->ms_condense_wanted = B_FALSE; 3545 3546 range_seg_type_t type; 3547 uint64_t shift, start; 3548 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, 3549 &start, &shift); 3550 3551 condense_tree = range_tree_create(NULL, type, NULL, start, shift); 3552 3553 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3554 range_tree_walk(msp->ms_defer[t], 3555 range_tree_add, condense_tree); 3556 } 3557 3558 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 3559 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 3560 range_tree_add, condense_tree); 3561 } 3562 3563 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3564 metaslab_unflushed_changes_memused(msp)); 3565 spa->spa_unflushed_stats.sus_memused -= 3566 metaslab_unflushed_changes_memused(msp); 3567 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3568 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3569 3570 /* 3571 * We're about to drop the metaslab's lock thus allowing other 3572 * consumers to change its content. Set the metaslab's ms_condensing 3573 * flag to ensure that allocations on this metaslab do not occur 3574 * while we're in the middle of committing it to disk. This is only 3575 * critical for ms_allocatable as all other range trees use per TXG 3576 * views of their content. 3577 */ 3578 msp->ms_condensing = B_TRUE; 3579 3580 mutex_exit(&msp->ms_lock); 3581 uint64_t object = space_map_object(msp->ms_sm); 3582 space_map_truncate(sm, 3583 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 3584 zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); 3585 3586 /* 3587 * space_map_truncate() may have reallocated the spacemap object. 3588 * If so, update the vdev_ms_array. 3589 */ 3590 if (space_map_object(msp->ms_sm) != object) { 3591 object = space_map_object(msp->ms_sm); 3592 dmu_write(spa->spa_meta_objset, 3593 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * 3594 msp->ms_id, sizeof (uint64_t), &object, tx); 3595 } 3596 3597 /* 3598 * Note: 3599 * When the log space map feature is enabled, each space map will 3600 * always have ALLOCS followed by FREES for each sync pass. This is 3601 * typically true even when the log space map feature is disabled, 3602 * except from the case where a metaslab goes through metaslab_sync() 3603 * and gets condensed. In that case the metaslab's space map will have 3604 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS 3605 * followed by FREES (due to space_map_write() in metaslab_sync()) for 3606 * sync pass 1. 
3607 */ 3608 range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, 3609 shift); 3610 range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); 3611 space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); 3612 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 3613 space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); 3614 3615 range_tree_vacate(condense_tree, NULL, NULL); 3616 range_tree_destroy(condense_tree); 3617 range_tree_vacate(tmp_tree, NULL, NULL); 3618 range_tree_destroy(tmp_tree); 3619 mutex_enter(&msp->ms_lock); 3620 3621 msp->ms_condensing = B_FALSE; 3622 metaslab_flush_update(msp, tx); 3623 } 3624 3625 /* 3626 * Called when the metaslab has been flushed (its own spacemap now reflects 3627 * all the contents of the pool-wide spacemap log). Updates the metaslab's 3628 * metadata and any pool-wide related log space map data (e.g. summary, 3629 * obsolete logs, etc.) to reflect that. 3630 */ 3631 static void 3632 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) 3633 { 3634 metaslab_group_t *mg = msp->ms_group; 3635 spa_t *spa = mg->mg_vd->vdev_spa; 3636 3637 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3638 3639 ASSERT3U(spa_sync_pass(spa), ==, 1); 3640 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3641 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3642 3643 /* 3644 * Just because a metaslab got flushed, that doesn't mean that 3645 * it will pass through metaslab_sync_done(). Thus, make sure to 3646 * update ms_synced_length here in case it doesn't. 3647 */ 3648 msp->ms_synced_length = space_map_length(msp->ms_sm); 3649 3650 /* 3651 * We may end up here from metaslab_condense() without the 3652 * feature being active. In that case this is a no-op. 3653 */ 3654 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 3655 return; 3656 3657 ASSERT(spa_syncing_log_sm(spa) != NULL); 3658 ASSERT(msp->ms_sm != NULL); 3659 ASSERT(metaslab_unflushed_txg(msp) != 0); 3660 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); 3661 3662 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); 3663 3664 /* update metaslab's position in our flushing tree */ 3665 uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); 3666 mutex_enter(&spa->spa_flushed_ms_lock); 3667 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 3668 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3669 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3670 mutex_exit(&spa->spa_flushed_ms_lock); 3671 3672 /* update metaslab counts of spa_log_sm_t nodes */ 3673 spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); 3674 spa_log_sm_increment_current_mscount(spa); 3675 3676 /* cleanup obsolete logs if any */ 3677 uint64_t log_blocks_before = spa_log_sm_nblocks(spa); 3678 spa_cleanup_old_sm_logs(spa, tx); 3679 uint64_t log_blocks_after = spa_log_sm_nblocks(spa); 3680 VERIFY3U(log_blocks_after, <=, log_blocks_before); 3681 3682 /* update log space map summary */ 3683 uint64_t blocks_gone = log_blocks_before - log_blocks_after; 3684 spa_log_summary_add_flushed_metaslab(spa); 3685 spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); 3686 spa_log_summary_decrement_blkcount(spa, blocks_gone); 3687 } 3688 3689 boolean_t 3690 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) 3691 { 3692 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3693 3694 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3695 ASSERT3U(spa_sync_pass(spa), ==, 1); 3696 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 3697 3698 ASSERT(msp->ms_sm != NULL); 3699 
ASSERT(metaslab_unflushed_txg(msp) != 0); 3700 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); 3701 3702 /* 3703 * There is nothing wrong with flushing the same metaslab twice, as 3704 * this codepath should work on that case. However, the current 3705 * flushing scheme makes sure to avoid this situation as we would be 3706 * making all these calls without having anything meaningful to write 3707 * to disk. We assert this behavior here. 3708 */ 3709 ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); 3710 3711 /* 3712 * We can not flush while loading, because then we would 3713 * not load the ms_unflushed_{allocs,frees}. 3714 */ 3715 if (msp->ms_loading) 3716 return (B_FALSE); 3717 3718 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3719 metaslab_verify_weight_and_frag(msp); 3720 3721 /* 3722 * Metaslab condensing is effectively flushing. Therefore if the 3723 * metaslab can be condensed we can just condense it instead of 3724 * flushing it. 3725 * 3726 * Note that metaslab_condense() does call metaslab_flush_update() 3727 * so we can just return immediately after condensing. We also 3728 * don't need to care about setting ms_flushing or broadcasting 3729 * ms_flush_cv, even if we temporarily drop the ms_lock in 3730 * metaslab_condense(), as the metaslab is already loaded. 3731 */ 3732 if (msp->ms_loaded && metaslab_should_condense(msp)) { 3733 metaslab_group_t *mg = msp->ms_group; 3734 3735 /* 3736 * For all histogram operations below refer to the 3737 * comments of metaslab_sync() where we follow a 3738 * similar procedure. 3739 */ 3740 metaslab_group_histogram_verify(mg); 3741 metaslab_class_histogram_verify(mg->mg_class); 3742 metaslab_group_histogram_remove(mg, msp); 3743 3744 metaslab_condense(msp, tx); 3745 3746 space_map_histogram_clear(msp->ms_sm); 3747 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 3748 ASSERT(range_tree_is_empty(msp->ms_freed)); 3749 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3750 space_map_histogram_add(msp->ms_sm, 3751 msp->ms_defer[t], tx); 3752 } 3753 metaslab_aux_histograms_update(msp); 3754 3755 metaslab_group_histogram_add(mg, msp); 3756 metaslab_group_histogram_verify(mg); 3757 metaslab_class_histogram_verify(mg->mg_class); 3758 3759 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3760 3761 /* 3762 * Since we recreated the histogram (and potentially 3763 * the ms_sm too while condensing) ensure that the 3764 * weight is updated too because we are not guaranteed 3765 * that this metaslab is dirty and will go through 3766 * metaslab_sync_done(). 
3767 */ 3768 metaslab_recalculate_weight_and_sort(msp); 3769 return (B_TRUE); 3770 } 3771 3772 msp->ms_flushing = B_TRUE; 3773 uint64_t sm_len_before = space_map_length(msp->ms_sm); 3774 3775 mutex_exit(&msp->ms_lock); 3776 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, 3777 SM_NO_VDEVID, tx); 3778 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, 3779 SM_NO_VDEVID, tx); 3780 mutex_enter(&msp->ms_lock); 3781 3782 uint64_t sm_len_after = space_map_length(msp->ms_sm); 3783 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 3784 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " 3785 "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " 3786 "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), 3787 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 3788 range_tree_space(msp->ms_unflushed_allocs), 3789 range_tree_space(msp->ms_unflushed_frees), 3790 (sm_len_after - sm_len_before)); 3791 } 3792 3793 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3794 metaslab_unflushed_changes_memused(msp)); 3795 spa->spa_unflushed_stats.sus_memused -= 3796 metaslab_unflushed_changes_memused(msp); 3797 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3798 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3799 3800 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3801 metaslab_verify_weight_and_frag(msp); 3802 3803 metaslab_flush_update(msp, tx); 3804 3805 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3806 metaslab_verify_weight_and_frag(msp); 3807 3808 msp->ms_flushing = B_FALSE; 3809 cv_broadcast(&msp->ms_flush_cv); 3810 return (B_TRUE); 3811 } 3812 3813 /* 3814 * Write a metaslab to disk in the context of the specified transaction group. 3815 */ 3816 void 3817 metaslab_sync(metaslab_t *msp, uint64_t txg) 3818 { 3819 metaslab_group_t *mg = msp->ms_group; 3820 vdev_t *vd = mg->mg_vd; 3821 spa_t *spa = vd->vdev_spa; 3822 objset_t *mos = spa_meta_objset(spa); 3823 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 3824 dmu_tx_t *tx; 3825 3826 ASSERT(!vd->vdev_ishole); 3827 3828 /* 3829 * This metaslab has just been added so there's no work to do now. 3830 */ 3831 if (msp->ms_freeing == NULL) { 3832 ASSERT3P(alloctree, ==, NULL); 3833 return; 3834 } 3835 3836 ASSERT3P(alloctree, !=, NULL); 3837 ASSERT3P(msp->ms_freeing, !=, NULL); 3838 ASSERT3P(msp->ms_freed, !=, NULL); 3839 ASSERT3P(msp->ms_checkpointing, !=, NULL); 3840 ASSERT3P(msp->ms_trim, !=, NULL); 3841 3842 /* 3843 * Normally, we don't want to process a metaslab if there are no 3844 * allocations or frees to perform. However, if the metaslab is being 3845 * forced to condense, it's loaded and we're not beyond the final 3846 * dirty txg, we need to let it through. Not condensing beyond the 3847 * final dirty txg prevents an issue where metaslabs that need to be 3848 * condensed but were loaded for other reasons could cause a panic 3849 * here. By only checking the txg in that branch of the conditional, 3850 * we preserve the utility of the VERIFY statements in all other 3851 * cases. 3852 */ 3853 if (range_tree_is_empty(alloctree) && 3854 range_tree_is_empty(msp->ms_freeing) && 3855 range_tree_is_empty(msp->ms_checkpointing) && 3856 !(msp->ms_loaded && msp->ms_condense_wanted && 3857 txg <= spa_final_dirty_txg(spa))) 3858 return; 3859 3860 3861 VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); 3862 3863 /* 3864 * The only state that can actually be changing concurrently 3865 * with metaslab_sync() is the metaslab's ms_allocatable. 
No 3866 * other thread can be modifying this txg's alloc, freeing, 3867 * freed, or space_map_phys_t. We drop ms_lock whenever we 3868 * could call into the DMU, because the DMU can call down to 3869 * us (e.g. via zio_free()) at any time. 3870 * 3871 * The spa_vdev_remove_thread() can be reading metaslab state 3872 * concurrently, and it is locked out by the ms_sync_lock. 3873 * Note that the ms_lock is insufficient for this, because it 3874 * is dropped by space_map_write(). 3875 */ 3876 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 3877 3878 /* 3879 * Generate a log space map if one doesn't exist already. 3880 */ 3881 spa_generate_syncing_log_sm(spa, tx); 3882 3883 if (msp->ms_sm == NULL) { 3884 uint64_t new_object = space_map_alloc(mos, 3885 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 3886 zfs_metaslab_sm_blksz_with_log : 3887 zfs_metaslab_sm_blksz_no_log, tx); 3888 VERIFY3U(new_object, !=, 0); 3889 3890 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 3891 msp->ms_id, sizeof (uint64_t), &new_object, tx); 3892 3893 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 3894 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 3895 ASSERT(msp->ms_sm != NULL); 3896 3897 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3898 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3899 ASSERT0(metaslab_allocated_space(msp)); 3900 } 3901 3902 if (metaslab_unflushed_txg(msp) == 0 && 3903 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 3904 ASSERT(spa_syncing_log_sm(spa) != NULL); 3905 3906 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3907 spa_log_sm_increment_current_mscount(spa); 3908 spa_log_summary_add_flushed_metaslab(spa); 3909 3910 ASSERT(msp->ms_sm != NULL); 3911 mutex_enter(&spa->spa_flushed_ms_lock); 3912 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3913 mutex_exit(&spa->spa_flushed_ms_lock); 3914 3915 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3916 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3917 } 3918 3919 if (!range_tree_is_empty(msp->ms_checkpointing) && 3920 vd->vdev_checkpoint_sm == NULL) { 3921 ASSERT(spa_has_checkpoint(spa)); 3922 3923 uint64_t new_object = space_map_alloc(mos, 3924 zfs_vdev_standard_sm_blksz, tx); 3925 VERIFY3U(new_object, !=, 0); 3926 3927 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 3928 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 3929 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 3930 3931 /* 3932 * We save the space map object as an entry in vdev_top_zap 3933 * so it can be retrieved when the pool is reopened after an 3934 * export or through zdb. 3935 */ 3936 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 3937 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 3938 sizeof (new_object), 1, &new_object, tx)); 3939 } 3940 3941 mutex_enter(&msp->ms_sync_lock); 3942 mutex_enter(&msp->ms_lock); 3943 3944 /* 3945 * Note: metaslab_condense() clears the space map's histogram. 3946 * Therefore we must verify and remove this histogram before 3947 * condensing. 3948 */ 3949 metaslab_group_histogram_verify(mg); 3950 metaslab_class_histogram_verify(mg->mg_class); 3951 metaslab_group_histogram_remove(mg, msp); 3952 3953 if (spa->spa_sync_pass == 1 && msp->ms_loaded && 3954 metaslab_should_condense(msp)) 3955 metaslab_condense(msp, tx); 3956 3957 /* 3958 * We'll be going to disk to sync our space accounting, thus we 3959 * drop the ms_lock during that time so allocations coming from 3960 * open-context (ZIL) for future TXGs do not block. 
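 *
 * A descriptive summary of the two paths that follow: with the log
 * space map feature active, this txg's allocs and frees are appended
 * to the shared pool-wide log (spa_syncing_log_sm()) tagged with our
 * vdev id and folded into ms_unflushed_allocs/ms_unflushed_frees;
 * without it, they are appended directly to this metaslab's own
 * ms_sm with SM_NO_VDEVID.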
3961 */ 3962 mutex_exit(&msp->ms_lock); 3963 space_map_t *log_sm = spa_syncing_log_sm(spa); 3964 if (log_sm != NULL) { 3965 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 3966 3967 space_map_write(log_sm, alloctree, SM_ALLOC, 3968 vd->vdev_id, tx); 3969 space_map_write(log_sm, msp->ms_freeing, SM_FREE, 3970 vd->vdev_id, tx); 3971 mutex_enter(&msp->ms_lock); 3972 3973 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3974 metaslab_unflushed_changes_memused(msp)); 3975 spa->spa_unflushed_stats.sus_memused -= 3976 metaslab_unflushed_changes_memused(msp); 3977 range_tree_remove_xor_add(alloctree, 3978 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); 3979 range_tree_remove_xor_add(msp->ms_freeing, 3980 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); 3981 spa->spa_unflushed_stats.sus_memused += 3982 metaslab_unflushed_changes_memused(msp); 3983 } else { 3984 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 3985 3986 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 3987 SM_NO_VDEVID, tx); 3988 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 3989 SM_NO_VDEVID, tx); 3990 mutex_enter(&msp->ms_lock); 3991 } 3992 3993 msp->ms_allocated_space += range_tree_space(alloctree); 3994 ASSERT3U(msp->ms_allocated_space, >=, 3995 range_tree_space(msp->ms_freeing)); 3996 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 3997 3998 if (!range_tree_is_empty(msp->ms_checkpointing)) { 3999 ASSERT(spa_has_checkpoint(spa)); 4000 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 4001 4002 /* 4003 * Since we are doing writes to disk and the ms_checkpointing 4004 * tree won't be changing during that time, we drop the 4005 * ms_lock while writing to the checkpoint space map, for the 4006 * same reason mentioned above. 4007 */ 4008 mutex_exit(&msp->ms_lock); 4009 space_map_write(vd->vdev_checkpoint_sm, 4010 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 4011 mutex_enter(&msp->ms_lock); 4012 4013 spa->spa_checkpoint_info.sci_dspace += 4014 range_tree_space(msp->ms_checkpointing); 4015 vd->vdev_stat.vs_checkpoint_space += 4016 range_tree_space(msp->ms_checkpointing); 4017 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 4018 -space_map_allocated(vd->vdev_checkpoint_sm)); 4019 4020 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 4021 } 4022 4023 if (msp->ms_loaded) { 4024 /* 4025 * When the space map is loaded, we have an accurate 4026 * histogram in the range tree. This gives us an opportunity 4027 * to bring the space map's histogram up-to-date so we clear 4028 * it first before updating it. 4029 */ 4030 space_map_histogram_clear(msp->ms_sm); 4031 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 4032 4033 /* 4034 * Since we've cleared the histogram we need to add back 4035 * any free space that has already been processed, plus 4036 * any deferred space. This allows the on-disk histogram 4037 * to accurately reflect all free space even if some space 4038 * is not yet available for allocation (i.e. deferred). 4039 */ 4040 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 4041 4042 /* 4043 * Add back any deferred free space that has not been 4044 * added back into the in-core free tree yet. This will 4045 * ensure that we don't end up with a space map histogram 4046 * that is completely empty unless the metaslab is fully 4047 * allocated. 
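 *
 * Put differently (informal summary, not an exact invariant): after
 * the histogram is rebuilt here it covers roughly
 *   ms_allocatable  (space free right now)
 *   + ms_freed      (space freed earlier this txg)
 *   + ms_defer[]    (frees still being held back)
 * and ms_freeing is added a few lines below, so the on-disk histogram
 * accounts for all free space even though part of it is not yet
 * allocatable.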
4048 */ 4049 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 4050 space_map_histogram_add(msp->ms_sm, 4051 msp->ms_defer[t], tx); 4052 } 4053 } 4054 4055 /* 4056 * Always add the free space from this sync pass to the space 4057 * map histogram. We want to make sure that the on-disk histogram 4058 * accounts for all free space. If the space map is not loaded, 4059 * then we will lose some accuracy but will correct it the next 4060 * time we load the space map. 4061 */ 4062 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 4063 metaslab_aux_histograms_update(msp); 4064 4065 metaslab_group_histogram_add(mg, msp); 4066 metaslab_group_histogram_verify(mg); 4067 metaslab_class_histogram_verify(mg->mg_class); 4068 4069 /* 4070 * For sync pass 1, we avoid traversing this txg's free range tree 4071 * and instead will just swap the pointers for freeing and freed. 4072 * We can safely do this since the freed_tree is guaranteed to be 4073 * empty on the initial pass. 4074 * 4075 * Keep in mind that even if we are currently using a log spacemap 4076 * we want current frees to end up in the ms_allocatable (but not 4077 * get appended to the ms_sm) so their ranges can be reused as usual. 4078 */ 4079 if (spa_sync_pass(spa) == 1) { 4080 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 4081 ASSERT0(msp->ms_allocated_this_txg); 4082 } else { 4083 range_tree_vacate(msp->ms_freeing, 4084 range_tree_add, msp->ms_freed); 4085 } 4086 msp->ms_allocated_this_txg += range_tree_space(alloctree); 4087 range_tree_vacate(alloctree, NULL, NULL); 4088 4089 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 4090 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 4091 & TXG_MASK])); 4092 ASSERT0(range_tree_space(msp->ms_freeing)); 4093 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4094 4095 mutex_exit(&msp->ms_lock); 4096 4097 /* 4098 * Verify that the space map object ID has been recorded in the 4099 * vdev_ms_array. 4100 */ 4101 uint64_t object; 4102 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 4103 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); 4104 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); 4105 4106 mutex_exit(&msp->ms_sync_lock); 4107 dmu_tx_commit(tx); 4108 } 4109 4110 static void 4111 metaslab_evict(metaslab_t *msp, uint64_t txg) 4112 { 4113 if (!msp->ms_loaded || msp->ms_disabled != 0) 4114 return; 4115 4116 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 4117 VERIFY0(range_tree_space( 4118 msp->ms_allocating[(txg + t) & TXG_MASK])); 4119 } 4120 if (msp->ms_allocator != -1) 4121 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); 4122 4123 if (!metaslab_debug_unload) 4124 metaslab_unload(msp); 4125 } 4126 4127 /* 4128 * Called after a transaction group has completely synced to mark 4129 * all of the metaslab's free space as usable. 4130 */ 4131 void 4132 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 4133 { 4134 metaslab_group_t *mg = msp->ms_group; 4135 vdev_t *vd = mg->mg_vd; 4136 spa_t *spa = vd->vdev_spa; 4137 range_tree_t **defer_tree; 4138 int64_t alloc_delta, defer_delta; 4139 boolean_t defer_allowed = B_TRUE; 4140 4141 ASSERT(!vd->vdev_ishole); 4142 4143 mutex_enter(&msp->ms_lock); 4144 4145 /* 4146 * If this metaslab is just becoming available, initialize its 4147 * range trees and add its capacity to the vdev. 
4148 */ 4149 if (msp->ms_freed == NULL) { 4150 range_seg_type_t type; 4151 uint64_t shift, start; 4152 type = metaslab_calculate_range_tree_type(vd, msp, &start, 4153 &shift); 4154 4155 for (int t = 0; t < TXG_SIZE; t++) { 4156 ASSERT(msp->ms_allocating[t] == NULL); 4157 4158 msp->ms_allocating[t] = range_tree_create(NULL, type, 4159 NULL, start, shift); 4160 } 4161 4162 ASSERT3P(msp->ms_freeing, ==, NULL); 4163 msp->ms_freeing = range_tree_create(NULL, type, NULL, start, 4164 shift); 4165 4166 ASSERT3P(msp->ms_freed, ==, NULL); 4167 msp->ms_freed = range_tree_create(NULL, type, NULL, start, 4168 shift); 4169 4170 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 4171 ASSERT3P(msp->ms_defer[t], ==, NULL); 4172 msp->ms_defer[t] = range_tree_create(NULL, type, NULL, 4173 start, shift); 4174 } 4175 4176 ASSERT3P(msp->ms_checkpointing, ==, NULL); 4177 msp->ms_checkpointing = range_tree_create(NULL, type, NULL, 4178 start, shift); 4179 4180 ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); 4181 msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, 4182 start, shift); 4183 4184 metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); 4185 mrap->mra_bt = &msp->ms_unflushed_frees_by_size; 4186 mrap->mra_floor_shift = metaslab_by_size_min_shift; 4187 ASSERT3P(msp->ms_unflushed_frees, ==, NULL); 4188 msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, 4189 type, mrap, start, shift); 4190 4191 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 4192 } 4193 ASSERT0(range_tree_space(msp->ms_freeing)); 4194 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4195 4196 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 4197 4198 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 4199 metaslab_class_get_alloc(spa_normal_class(spa)); 4200 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 4201 defer_allowed = B_FALSE; 4202 } 4203 4204 defer_delta = 0; 4205 alloc_delta = msp->ms_allocated_this_txg - 4206 range_tree_space(msp->ms_freed); 4207 4208 if (defer_allowed) { 4209 defer_delta = range_tree_space(msp->ms_freed) - 4210 range_tree_space(*defer_tree); 4211 } else { 4212 defer_delta -= range_tree_space(*defer_tree); 4213 } 4214 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 4215 defer_delta, 0); 4216 4217 if (spa_syncing_log_sm(spa) == NULL) { 4218 /* 4219 * If there's a metaslab_load() in progress and we don't have 4220 * a log space map, it means that we probably wrote to the 4221 * metaslab's space map. If this is the case, we need to 4222 * make sure that we wait for the load to complete so that we 4223 * have a consistent view at the in-core side of the metaslab. 4224 */ 4225 metaslab_load_wait(msp); 4226 } else { 4227 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 4228 } 4229 4230 /* 4231 * When auto-trimming is enabled, free ranges which are added to 4232 * ms_allocatable are also be added to ms_trim. The ms_trim tree is 4233 * periodically consumed by the vdev_autotrim_thread() which issues 4234 * trims for all ranges and then vacates the tree. The ms_trim tree 4235 * can be discarded at any time with the sole consequence of recent 4236 * frees not being trimmed. 
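 *
 * Rough lifecycle of a freed range when autotrim is on (sketch only,
 * assuming the usual TXG_DEFER_SIZE of 2):
 *   freed in txg N       -> ms_freeing, swapped into ms_freed in pass 1
 *   sync_done of txg N   -> moved into one of the ms_defer trees
 *   sync_done of txg N+2 -> walked into ms_trim (below) and vacated
 *                           into ms_allocatable
 * after which vdev_autotrim_thread() eventually issues the trims and
 * empties ms_trim.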
4237 */ 4238 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { 4239 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); 4240 if (!defer_allowed) { 4241 range_tree_walk(msp->ms_freed, range_tree_add, 4242 msp->ms_trim); 4243 } 4244 } else { 4245 range_tree_vacate(msp->ms_trim, NULL, NULL); 4246 } 4247 4248 /* 4249 * Move the frees from the defer_tree back to the free 4250 * range tree (if it's loaded). Swap the freed_tree and 4251 * the defer_tree -- this is safe to do because we've 4252 * just emptied out the defer_tree. 4253 */ 4254 range_tree_vacate(*defer_tree, 4255 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 4256 if (defer_allowed) { 4257 range_tree_swap(&msp->ms_freed, defer_tree); 4258 } else { 4259 range_tree_vacate(msp->ms_freed, 4260 msp->ms_loaded ? range_tree_add : NULL, 4261 msp->ms_allocatable); 4262 } 4263 4264 msp->ms_synced_length = space_map_length(msp->ms_sm); 4265 4266 msp->ms_deferspace += defer_delta; 4267 ASSERT3S(msp->ms_deferspace, >=, 0); 4268 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 4269 if (msp->ms_deferspace != 0) { 4270 /* 4271 * Keep syncing this metaslab until all deferred frees 4272 * are back in circulation. 4273 */ 4274 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 4275 } 4276 metaslab_aux_histograms_update_done(msp, defer_allowed); 4277 4278 if (msp->ms_new) { 4279 msp->ms_new = B_FALSE; 4280 mutex_enter(&mg->mg_lock); 4281 mg->mg_ms_ready++; 4282 mutex_exit(&mg->mg_lock); 4283 } 4284 4285 /* 4286 * Re-sort metaslab within its group now that we've adjusted 4287 * its allocatable space. 4288 */ 4289 metaslab_recalculate_weight_and_sort(msp); 4290 4291 /* 4292 * If the metaslab is loaded and we've not tried to load or allocate 4293 * from it in 'metaslab_unload_delay' txgs, then unload it. 4294 */ 4295 if (msp->ms_loaded && 4296 msp->ms_disabled == 0 && 4297 msp->ms_selected_txg + metaslab_unload_delay < txg) { 4298 4299 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 4300 VERIFY0(range_tree_space( 4301 msp->ms_allocating[(txg + t) & TXG_MASK])); 4302 } 4303 if (msp->ms_allocator != -1) { 4304 metaslab_passivate(msp, msp->ms_weight & 4305 ~METASLAB_ACTIVE_MASK); 4306 } 4307 4308 if (!metaslab_debug_unload) 4309 metaslab_unload(msp); 4310 } 4311 4312 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 4313 ASSERT0(range_tree_space(msp->ms_freeing)); 4314 ASSERT0(range_tree_space(msp->ms_freed)); 4315 ASSERT0(range_tree_space(msp->ms_checkpointing)); 4316 msp->ms_allocating_total -= msp->ms_allocated_this_txg; 4317 msp->ms_allocated_this_txg = 0; 4318 mutex_exit(&msp->ms_lock); 4319 } 4320 4321 void 4322 metaslab_sync_reassess(metaslab_group_t *mg) 4323 { 4324 spa_t *spa = mg->mg_class->mc_spa; 4325 4326 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4327 metaslab_group_alloc_update(mg); 4328 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 4329 4330 /* 4331 * Preload the next potential metaslabs but only on active 4332 * metaslab groups. We can get into a state where the metaslab 4333 * is no longer active since we dirty metaslabs as we remove a 4334 * a device, thus potentially making the metaslab group eligible 4335 * for preloading. 4336 */ 4337 if (mg->mg_activation_count > 0) { 4338 metaslab_group_preload(mg); 4339 } 4340 spa_config_exit(spa, SCL_ALLOC, FTAG); 4341 } 4342 4343 /* 4344 * When writing a ditto block (i.e. more than one DVA for a given BP) on 4345 * the same vdev as an existing DVA of this BP, then try to allocate it 4346 * on a different metaslab than existing DVAs (i.e. 
a unique metaslab). 4347 */ 4348 static boolean_t 4349 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 4350 { 4351 uint64_t dva_ms_id; 4352 4353 if (DVA_GET_ASIZE(dva) == 0) 4354 return (B_TRUE); 4355 4356 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 4357 return (B_TRUE); 4358 4359 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 4360 4361 return (msp->ms_id != dva_ms_id); 4362 } 4363 4364 /* 4365 * ========================================================================== 4366 * Metaslab allocation tracing facility 4367 * ========================================================================== 4368 */ 4369 4370 /* 4371 * Add an allocation trace element to the allocation tracing list. 4372 */ 4373 static void 4374 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 4375 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 4376 int allocator) 4377 { 4378 if (!metaslab_trace_enabled) 4379 return; 4380 4381 /* 4382 * When the tracing list reaches its maximum we remove 4383 * the second element in the list before adding a new one. 4384 * By removing the second element we preserve the original 4385 * entry as a clue to what allocations steps have already been 4386 * performed. 4387 */ 4388 if (zal->zal_size == metaslab_trace_max_entries) { 4389 metaslab_alloc_trace_t *mat_next; 4390 #ifdef DEBUG 4391 panic("too many entries in allocation list"); 4392 #endif 4393 METASLABSTAT_BUMP(metaslabstat_trace_over_limit); 4394 zal->zal_size--; 4395 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 4396 list_remove(&zal->zal_list, mat_next); 4397 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 4398 } 4399 4400 metaslab_alloc_trace_t *mat = 4401 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 4402 list_link_init(&mat->mat_list_node); 4403 mat->mat_mg = mg; 4404 mat->mat_msp = msp; 4405 mat->mat_size = psize; 4406 mat->mat_dva_id = dva_id; 4407 mat->mat_offset = offset; 4408 mat->mat_weight = 0; 4409 mat->mat_allocator = allocator; 4410 4411 if (msp != NULL) 4412 mat->mat_weight = msp->ms_weight; 4413 4414 /* 4415 * The list is part of the zio so locking is not required. Only 4416 * a single thread will perform allocations for a given zio. 
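 *
 * For context, a typical trace-list lifetime looks like the sketch
 * below (the zio field name is shown only for illustration):
 *   metaslab_trace_init(&zio->io_alloc_list);
 *   ... each allocation attempt records either an offset or a
 *       TRACE_* error code via metaslab_trace_add() ...
 *   metaslab_trace_fini(&zio->io_alloc_list);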
4417 */ 4418 list_insert_tail(&zal->zal_list, mat); 4419 zal->zal_size++; 4420 4421 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 4422 } 4423 4424 void 4425 metaslab_trace_init(zio_alloc_list_t *zal) 4426 { 4427 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 4428 offsetof(metaslab_alloc_trace_t, mat_list_node)); 4429 zal->zal_size = 0; 4430 } 4431 4432 void 4433 metaslab_trace_fini(zio_alloc_list_t *zal) 4434 { 4435 metaslab_alloc_trace_t *mat; 4436 4437 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 4438 kmem_cache_free(metaslab_alloc_trace_cache, mat); 4439 list_destroy(&zal->zal_list); 4440 zal->zal_size = 0; 4441 } 4442 4443 /* 4444 * ========================================================================== 4445 * Metaslab block operations 4446 * ========================================================================== 4447 */ 4448 4449 static void 4450 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 4451 int allocator) 4452 { 4453 if (!(flags & METASLAB_ASYNC_ALLOC) || 4454 (flags & METASLAB_DONT_THROTTLE)) 4455 return; 4456 4457 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4458 if (!mg->mg_class->mc_alloc_throttle_enabled) 4459 return; 4460 4461 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 4462 } 4463 4464 static void 4465 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 4466 { 4467 uint64_t max = mg->mg_max_alloc_queue_depth; 4468 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 4469 while (cur < max) { 4470 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 4471 cur, cur + 1) == cur) { 4472 atomic_inc_64( 4473 &mg->mg_class->mc_alloc_max_slots[allocator]); 4474 return; 4475 } 4476 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 4477 } 4478 } 4479 4480 void 4481 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 4482 int allocator, boolean_t io_complete) 4483 { 4484 if (!(flags & METASLAB_ASYNC_ALLOC) || 4485 (flags & METASLAB_DONT_THROTTLE)) 4486 return; 4487 4488 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4489 if (!mg->mg_class->mc_alloc_throttle_enabled) 4490 return; 4491 4492 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 4493 if (io_complete) 4494 metaslab_group_increment_qdepth(mg, allocator); 4495 } 4496 4497 void 4498 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 4499 int allocator) 4500 { 4501 #ifdef ZFS_DEBUG 4502 const dva_t *dva = bp->blk_dva; 4503 int ndvas = BP_GET_NDVAS(bp); 4504 4505 for (int d = 0; d < ndvas; d++) { 4506 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 4507 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4508 VERIFY(zfs_refcount_not_held( 4509 &mg->mg_alloc_queue_depth[allocator], tag)); 4510 } 4511 #endif 4512 } 4513 4514 static uint64_t 4515 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 4516 { 4517 uint64_t start; 4518 range_tree_t *rt = msp->ms_allocatable; 4519 metaslab_class_t *mc = msp->ms_group->mg_class; 4520 4521 ASSERT(MUTEX_HELD(&msp->ms_lock)); 4522 VERIFY(!msp->ms_condensing); 4523 VERIFY0(msp->ms_disabled); 4524 4525 start = mc->mc_ops->msop_alloc(msp, size); 4526 if (start != -1ULL) { 4527 metaslab_group_t *mg = msp->ms_group; 4528 vdev_t *vd = mg->mg_vd; 4529 4530 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 4531 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4532 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 4533 range_tree_remove(rt, start, size); 
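/*
 * Descriptive note added for clarity: on success the allocated range
 * is also cleared from ms_trim so it is not trimmed while in use, the
 * metaslab is dirtied for this txg if this is its first allocation,
 * and the range is recorded in ms_allocating[txg & TXG_MASK]. Both
 * start and size are ashift-aligned; e.g. with vdev_ashift = 12 they
 * are multiples of 4K, which the P2PHASE checks above verify.
 */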
4534 range_tree_clear(msp->ms_trim, start, size);
4535
4536 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4537 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4538
4539 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4540 msp->ms_allocating_total += size;
4541
4542 /* Track the last successful allocation */
4543 msp->ms_alloc_txg = txg;
4544 metaslab_verify_space(msp, txg);
4545 }
4546
4547 /*
4548 * Now that we've attempted the allocation we need to update the
4549 * metaslab's maximum block size since it may have changed.
4550 */
4551 msp->ms_max_size = metaslab_largest_allocatable(msp);
4552 return (start);
4553 }
4554
4555 /*
4556 * Find the metaslab with the highest weight that is less than what we've
4557 * already tried. In the common case, this means that we will examine each
4558 * metaslab at most once. Note that concurrent callers could reorder metaslabs
4559 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4560 * activated by another thread, and we fail to allocate from the metaslab we
4561 * have selected, we may not try the newly-activated metaslab, and instead
4562 * activate another metaslab. This is not optimal, but generally does not cause
4563 * any problems (a possible exception being if every metaslab is completely full
4564 * except for the newly-activated metaslab which we fail to examine).
4565 */
4566 static metaslab_t *
4567 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4568 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4569 boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4570 boolean_t *was_active)
4571 {
4572 avl_index_t idx;
4573 avl_tree_t *t = &mg->mg_metaslab_tree;
4574 metaslab_t *msp = avl_find(t, search, &idx);
4575 if (msp == NULL)
4576 msp = avl_nearest(t, idx, AVL_AFTER);
4577
4578 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4579 int i;
4580 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4581 metaslab_trace_add(zal, mg, msp, asize, d,
4582 TRACE_TOO_SMALL, allocator);
4583 continue;
4584 }
4585
4586 /*
4587 * If the selected metaslab is condensing or disabled,
4588 * skip it.
4589 */
4590 if (msp->ms_condensing || msp->ms_disabled > 0)
4591 continue;
4592
4593 *was_active = msp->ms_allocator != -1;
4594 /*
4595 * If we're activating as primary, this is our first allocation
4596 * from this disk, so we don't need to check how close we are.
4597 * If the metaslab under consideration was already active,
4598 * we're getting desperate enough to steal another allocator's
4599 * metaslab, so we still don't care about distances.
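 *
 * Otherwise (a secondary activation of a metaslab that is not already
 * active) the loop below compares the candidate against every DVA
 * written so far. Illustrative example: when allocating DVA 1 of a
 * ditto block whose DVA 0 already lives on this vdev and want_unique
 * is set, metaslab_is_unique() must report a different metaslab id
 * for the candidate to be accepted; with want_unique cleared the
 * distance check is skipped entirely.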
4600 */ 4601 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 4602 break; 4603 4604 for (i = 0; i < d; i++) { 4605 if (want_unique && 4606 !metaslab_is_unique(msp, &dva[i])) 4607 break; /* try another metaslab */ 4608 } 4609 if (i == d) 4610 break; 4611 } 4612 4613 if (msp != NULL) { 4614 search->ms_weight = msp->ms_weight; 4615 search->ms_start = msp->ms_start + 1; 4616 search->ms_allocator = msp->ms_allocator; 4617 search->ms_primary = msp->ms_primary; 4618 } 4619 return (msp); 4620 } 4621 4622 void 4623 metaslab_active_mask_verify(metaslab_t *msp) 4624 { 4625 ASSERT(MUTEX_HELD(&msp->ms_lock)); 4626 4627 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 4628 return; 4629 4630 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) 4631 return; 4632 4633 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { 4634 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 4635 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); 4636 VERIFY3S(msp->ms_allocator, !=, -1); 4637 VERIFY(msp->ms_primary); 4638 return; 4639 } 4640 4641 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { 4642 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 4643 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); 4644 VERIFY3S(msp->ms_allocator, !=, -1); 4645 VERIFY(!msp->ms_primary); 4646 return; 4647 } 4648 4649 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 4650 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 4651 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 4652 VERIFY3S(msp->ms_allocator, ==, -1); 4653 return; 4654 } 4655 } 4656 4657 /* ARGSUSED */ 4658 static uint64_t 4659 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 4660 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, 4661 int allocator, boolean_t try_hard) 4662 { 4663 metaslab_t *msp = NULL; 4664 uint64_t offset = -1ULL; 4665 4666 uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; 4667 for (int i = 0; i < d; i++) { 4668 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 4669 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 4670 activation_weight = METASLAB_WEIGHT_SECONDARY; 4671 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 4672 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 4673 activation_weight = METASLAB_WEIGHT_CLAIM; 4674 break; 4675 } 4676 } 4677 4678 /* 4679 * If we don't have enough metaslabs active to fill the entire array, we 4680 * just use the 0th slot. 4681 */ 4682 if (mg->mg_ms_ready < mg->mg_allocators * 3) 4683 allocator = 0; 4684 4685 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 4686 4687 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 4688 search->ms_weight = UINT64_MAX; 4689 search->ms_start = 0; 4690 /* 4691 * At the end of the metaslab tree are the already-active metaslabs, 4692 * first the primaries, then the secondaries. When we resume searching 4693 * through the tree, we need to consider ms_allocator and ms_primary so 4694 * we start in the location right after where we left off, and don't 4695 * accidentally loop forever considering the same metaslabs. 4696 */ 4697 search->ms_allocator = -1; 4698 search->ms_primary = B_TRUE; 4699 for (;;) { 4700 boolean_t was_active = B_FALSE; 4701 4702 mutex_enter(&mg->mg_lock); 4703 4704 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 4705 mg->mg_primaries[allocator] != NULL) { 4706 msp = mg->mg_primaries[allocator]; 4707 4708 /* 4709 * Even though we don't hold the ms_lock for the 4710 * primary metaslab, those fields should not 4711 * change while we hold the mg_lock. 
Thus it is
4712 * safe to make assertions on them.
4713 */
4714 ASSERT(msp->ms_primary);
4715 ASSERT3S(msp->ms_allocator, ==, allocator);
4716 ASSERT(msp->ms_loaded);
4717
4718 was_active = B_TRUE;
4719 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4720 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4721 mg->mg_secondaries[allocator] != NULL) {
4722 msp = mg->mg_secondaries[allocator];
4723
4724 /*
4725 * See comment above about the similar assertions
4726 * for the primary metaslab.
4727 */
4728 ASSERT(!msp->ms_primary);
4729 ASSERT3S(msp->ms_allocator, ==, allocator);
4730 ASSERT(msp->ms_loaded);
4731
4732 was_active = B_TRUE;
4733 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4734 } else {
4735 msp = find_valid_metaslab(mg, activation_weight, dva, d,
4736 want_unique, asize, allocator, try_hard, zal,
4737 search, &was_active);
4738 }
4739
4740 mutex_exit(&mg->mg_lock);
4741 if (msp == NULL) {
4742 kmem_free(search, sizeof (*search));
4743 return (-1ULL);
4744 }
4745 mutex_enter(&msp->ms_lock);
4746
4747 metaslab_active_mask_verify(msp);
4748
4749 /*
4750 * This code is disabled because of issues with
4751 * tracepoints in non-GPL kernel modules.
4752 */
4753 #if 0
4754 DTRACE_PROBE3(ms__activation__attempt,
4755 metaslab_t *, msp, uint64_t, activation_weight,
4756 boolean_t, was_active);
4757 #endif
4758
4759 /*
4760 * Ensure that the metaslab we have selected is still
4761 * capable of handling our request. It's possible that
4762 * another thread may have changed the weight while we
4763 * were blocked on the metaslab lock. We check the
4764 * active status first to see if we need to select
4765 * a new metaslab.
4766 */
4767 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
4768 ASSERT3S(msp->ms_allocator, ==, -1);
4769 mutex_exit(&msp->ms_lock);
4770 continue;
4771 }
4772
4773 /*
4774 * If the metaslab was activated for another allocator
4775 * while we were waiting in the ms_lock above, or it's
4776 * a primary and we're seeking a secondary (or vice versa),
4777 * we go back and select a new metaslab.
4778 */
4779 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4780 (msp->ms_allocator != -1) &&
4781 (msp->ms_allocator != allocator || ((activation_weight ==
4782 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
4783 ASSERT(msp->ms_loaded);
4784 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4785 msp->ms_allocator != -1);
4786 mutex_exit(&msp->ms_lock);
4787 continue;
4788 }
4789
4790 /*
4791 * This metaslab was used for claiming regions allocated
4792 * by the ZIL during pool import. Once these regions are
4793 * claimed we don't need to keep the CLAIM bit set
4794 * anymore. Passivate this metaslab to zero its activation
4795 * mask.
4796 */ 4797 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 4798 activation_weight != METASLAB_WEIGHT_CLAIM) { 4799 ASSERT(msp->ms_loaded); 4800 ASSERT3S(msp->ms_allocator, ==, -1); 4801 metaslab_passivate(msp, msp->ms_weight & 4802 ~METASLAB_WEIGHT_CLAIM); 4803 mutex_exit(&msp->ms_lock); 4804 continue; 4805 } 4806 4807 metaslab_set_selected_txg(msp, txg); 4808 4809 int activation_error = 4810 metaslab_activate(msp, allocator, activation_weight); 4811 metaslab_active_mask_verify(msp); 4812 4813 /* 4814 * If the metaslab was activated by another thread for 4815 * another allocator or activation_weight (EBUSY), or it 4816 * failed because another metaslab was assigned as primary 4817 * for this allocator (EEXIST) we continue using this 4818 * metaslab for our allocation, rather than going on to a 4819 * worse metaslab (we waited for that metaslab to be loaded 4820 * after all). 4821 * 4822 * If the activation failed due to an I/O error or ENOSPC we 4823 * skip to the next metaslab. 4824 */ 4825 boolean_t activated; 4826 if (activation_error == 0) { 4827 activated = B_TRUE; 4828 } else if (activation_error == EBUSY || 4829 activation_error == EEXIST) { 4830 activated = B_FALSE; 4831 } else { 4832 mutex_exit(&msp->ms_lock); 4833 continue; 4834 } 4835 ASSERT(msp->ms_loaded); 4836 4837 /* 4838 * Now that we have the lock, recheck to see if we should 4839 * continue to use this metaslab for this allocation. The 4840 * the metaslab is now loaded so metaslab_should_allocate() 4841 * can accurately determine if the allocation attempt should 4842 * proceed. 4843 */ 4844 if (!metaslab_should_allocate(msp, asize, try_hard)) { 4845 /* Passivate this metaslab and select a new one. */ 4846 metaslab_trace_add(zal, mg, msp, asize, d, 4847 TRACE_TOO_SMALL, allocator); 4848 goto next; 4849 } 4850 4851 /* 4852 * If this metaslab is currently condensing then pick again 4853 * as we can't manipulate this metaslab until it's committed 4854 * to disk. If this metaslab is being initialized, we shouldn't 4855 * allocate from it since the allocated region might be 4856 * overwritten after allocation. 4857 */ 4858 if (msp->ms_condensing) { 4859 metaslab_trace_add(zal, mg, msp, asize, d, 4860 TRACE_CONDENSING, allocator); 4861 if (activated) { 4862 metaslab_passivate(msp, msp->ms_weight & 4863 ~METASLAB_ACTIVE_MASK); 4864 } 4865 mutex_exit(&msp->ms_lock); 4866 continue; 4867 } else if (msp->ms_disabled > 0) { 4868 metaslab_trace_add(zal, mg, msp, asize, d, 4869 TRACE_DISABLED, allocator); 4870 if (activated) { 4871 metaslab_passivate(msp, msp->ms_weight & 4872 ~METASLAB_ACTIVE_MASK); 4873 } 4874 mutex_exit(&msp->ms_lock); 4875 continue; 4876 } 4877 4878 offset = metaslab_block_alloc(msp, asize, txg); 4879 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 4880 4881 if (offset != -1ULL) { 4882 /* Proactively passivate the metaslab, if needed */ 4883 if (activated) 4884 metaslab_segment_may_passivate(msp); 4885 break; 4886 } 4887 next: 4888 ASSERT(msp->ms_loaded); 4889 4890 /* 4891 * This code is disabled out because of issues with 4892 * tracepoints in non-gpl kernel modules. 4893 */ 4894 #if 0 4895 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, 4896 uint64_t, asize); 4897 #endif 4898 4899 /* 4900 * We were unable to allocate from this metaslab so determine 4901 * a new weight for this metaslab. Now that we have loaded 4902 * the metaslab we can provide a better hint to the metaslab 4903 * selector. 4904 * 4905 * For space-based metaslabs, we use the maximum block size. 
4906 * This information is only available when the metaslab 4907 * is loaded and is more accurate than the generic free 4908 * space weight that was calculated by metaslab_weight(). 4909 * This information allows us to quickly compare the maximum 4910 * available allocation in the metaslab to the allocation 4911 * size being requested. 4912 * 4913 * For segment-based metaslabs, determine the new weight 4914 * based on the highest bucket in the range tree. We 4915 * explicitly use the loaded segment weight (i.e. the range 4916 * tree histogram) since it contains the space that is 4917 * currently available for allocation and is accurate 4918 * even within a sync pass. 4919 */ 4920 uint64_t weight; 4921 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 4922 weight = metaslab_largest_allocatable(msp); 4923 WEIGHT_SET_SPACEBASED(weight); 4924 } else { 4925 weight = metaslab_weight_from_range_tree(msp); 4926 } 4927 4928 if (activated) { 4929 metaslab_passivate(msp, weight); 4930 } else { 4931 /* 4932 * For the case where we use the metaslab that is 4933 * active for another allocator we want to make 4934 * sure that we retain the activation mask. 4935 * 4936 * Note that we could attempt to use something like 4937 * metaslab_recalculate_weight_and_sort() that 4938 * retains the activation mask here. That function 4939 * uses metaslab_weight() to set the weight though 4940 * which is not as accurate as the calculations 4941 * above. 4942 */ 4943 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; 4944 metaslab_group_sort(mg, msp, weight); 4945 } 4946 metaslab_active_mask_verify(msp); 4947 4948 /* 4949 * We have just failed an allocation attempt, check 4950 * that metaslab_should_allocate() agrees. Otherwise, 4951 * we may end up in an infinite loop retrying the same 4952 * metaslab. 4953 */ 4954 ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); 4955 4956 mutex_exit(&msp->ms_lock); 4957 } 4958 mutex_exit(&msp->ms_lock); 4959 kmem_free(search, sizeof (*search)); 4960 return (offset); 4961 } 4962 4963 static uint64_t 4964 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 4965 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, 4966 int allocator, boolean_t try_hard) 4967 { 4968 uint64_t offset; 4969 ASSERT(mg->mg_initialized); 4970 4971 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 4972 dva, d, allocator, try_hard); 4973 4974 mutex_enter(&mg->mg_lock); 4975 if (offset == -1ULL) { 4976 mg->mg_failed_allocations++; 4977 metaslab_trace_add(zal, mg, NULL, asize, d, 4978 TRACE_GROUP_FAILURE, allocator); 4979 if (asize == SPA_GANGBLOCKSIZE) { 4980 /* 4981 * This metaslab group was unable to allocate 4982 * the minimum gang block size so it must be out of 4983 * space. We must notify the allocation throttle 4984 * to start skipping allocation attempts to this 4985 * metaslab group until more space becomes available. 4986 * Note: this failure cannot be caused by the 4987 * allocation throttle since the allocation throttle 4988 * is only responsible for skipping devices and 4989 * not failing block allocations. 4990 */ 4991 mg->mg_no_free_space = B_TRUE; 4992 } 4993 } 4994 mg->mg_allocations++; 4995 mutex_exit(&mg->mg_lock); 4996 return (offset); 4997 } 4998 4999 /* 5000 * Allocate a block for the specified i/o. 
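 *
 * Usage sketch (illustrative only; in practice metaslab_alloc() is
 * the caller and also handles the allocation throttle and unwinding
 * partially allocated BPs on failure):
 *   for (int d = 0; d < ndvas; d++) {
 *           error = metaslab_alloc_dva(spa, mc, psize, dva, d,
 *               hintdva, txg, flags, zal, allocator);
 *           if (error != 0)
 *                   break;          (e.g. ENOSPC)
 *   }
 * On success, DVA_GET_VDEV() and DVA_GET_OFFSET() of dva[d] describe
 * the region that was allocated.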
5001 */ 5002 int 5003 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 5004 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 5005 zio_alloc_list_t *zal, int allocator) 5006 { 5007 metaslab_group_t *mg, *rotor; 5008 vdev_t *vd; 5009 boolean_t try_hard = B_FALSE; 5010 5011 ASSERT(!DVA_IS_VALID(&dva[d])); 5012 5013 /* 5014 * For testing, make some blocks above a certain size be gang blocks. 5015 * This will also test spilling from special to normal. 5016 */ 5017 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 5018 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 5019 allocator); 5020 return (SET_ERROR(ENOSPC)); 5021 } 5022 5023 /* 5024 * Start at the rotor and loop through all mgs until we find something. 5025 * Note that there's no locking on mc_rotor or mc_aliquot because 5026 * nothing actually breaks if we miss a few updates -- we just won't 5027 * allocate quite as evenly. It all balances out over time. 5028 * 5029 * If we are doing ditto or log blocks, try to spread them across 5030 * consecutive vdevs. If we're forced to reuse a vdev before we've 5031 * allocated all of our ditto blocks, then try and spread them out on 5032 * that vdev as much as possible. If it turns out to not be possible, 5033 * gradually lower our standards until anything becomes acceptable. 5034 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 5035 * gives us hope of containing our fault domains to something we're 5036 * able to reason about. Otherwise, any two top-level vdev failures 5037 * will guarantee the loss of data. With consecutive allocation, 5038 * only two adjacent top-level vdev failures will result in data loss. 5039 * 5040 * If we are doing gang blocks (hintdva is non-NULL), try to keep 5041 * ourselves on the same vdev as our gang block header. That 5042 * way, we can hope for locality in vdev_cache, plus it makes our 5043 * fault domains something tractable. 5044 */ 5045 if (hintdva) { 5046 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 5047 5048 /* 5049 * It's possible the vdev we're using as the hint no 5050 * longer exists or its mg has been closed (e.g. by 5051 * device removal). Consult the rotor when 5052 * all else fails. 5053 */ 5054 if (vd != NULL && vd->vdev_mg != NULL) { 5055 mg = vd->vdev_mg; 5056 5057 if (flags & METASLAB_HINTBP_AVOID && 5058 mg->mg_next != NULL) 5059 mg = mg->mg_next; 5060 } else { 5061 mg = mc->mc_rotor; 5062 } 5063 } else if (d != 0) { 5064 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 5065 mg = vd->vdev_mg->mg_next; 5066 } else { 5067 ASSERT(mc->mc_rotor != NULL); 5068 mg = mc->mc_rotor; 5069 } 5070 5071 /* 5072 * If the hint put us into the wrong metaslab class, or into a 5073 * metaslab group that has been passivated, just follow the rotor. 5074 */ 5075 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 5076 mg = mc->mc_rotor; 5077 5078 rotor = mg; 5079 top: 5080 do { 5081 boolean_t allocatable; 5082 5083 ASSERT(mg->mg_activation_count == 1); 5084 vd = mg->mg_vd; 5085 5086 /* 5087 * Don't allocate from faulted devices. 5088 */ 5089 if (try_hard) { 5090 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 5091 allocatable = vdev_allocatable(vd); 5092 spa_config_exit(spa, SCL_ZIO, FTAG); 5093 } else { 5094 allocatable = vdev_allocatable(vd); 5095 } 5096 5097 /* 5098 * Determine if the selected metaslab group is eligible 5099 * for allocations. 
If we're ganging then don't allow 5100 * this metaslab group to skip allocations since that would 5101 * inadvertently return ENOSPC and suspend the pool 5102 * even though space is still available. 5103 */ 5104 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 5105 allocatable = metaslab_group_allocatable(mg, rotor, 5106 psize, allocator, d); 5107 } 5108 5109 if (!allocatable) { 5110 metaslab_trace_add(zal, mg, NULL, psize, d, 5111 TRACE_NOT_ALLOCATABLE, allocator); 5112 goto next; 5113 } 5114 5115 ASSERT(mg->mg_initialized); 5116 5117 /* 5118 * Avoid writing single-copy data to a failing, 5119 * non-redundant vdev, unless we've already tried all 5120 * other vdevs. 5121 */ 5122 if ((vd->vdev_stat.vs_write_errors > 0 || 5123 vd->vdev_state < VDEV_STATE_HEALTHY) && 5124 d == 0 && !try_hard && vd->vdev_children == 0) { 5125 metaslab_trace_add(zal, mg, NULL, psize, d, 5126 TRACE_VDEV_ERROR, allocator); 5127 goto next; 5128 } 5129 5130 ASSERT(mg->mg_class == mc); 5131 5132 uint64_t asize = vdev_psize_to_asize(vd, psize); 5133 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 5134 5135 /* 5136 * If we don't need to try hard, then require that the 5137 * block be on a different metaslab from any other DVAs 5138 * in this BP (unique=true). If we are trying hard, then 5139 * allow any metaslab to be used (unique=false). 5140 */ 5141 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 5142 !try_hard, dva, d, allocator, try_hard); 5143 5144 if (offset != -1ULL) { 5145 /* 5146 * If we've just selected this metaslab group, 5147 * figure out whether the corresponding vdev is 5148 * over- or under-used relative to the pool, 5149 * and set an allocation bias to even it out. 5150 */ 5151 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 5152 vdev_stat_t *vs = &vd->vdev_stat; 5153 int64_t vu, cu; 5154 5155 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 5156 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 5157 5158 /* 5159 * Calculate how much more or less we should 5160 * try to allocate from this device during 5161 * this iteration around the rotor. 5162 * For example, if a device is 80% full 5163 * and the pool is 20% full then we should 5164 * reduce allocations by 60% on this device. 5165 * 5166 * mg_bias = (20 - 80) * 512K / 100 = -307K 5167 * 5168 * This reduces allocations by 307K for this 5169 * iteration. 5170 */ 5171 mg->mg_bias = ((cu - vu) * 5172 (int64_t)mg->mg_aliquot) / 100; 5173 } else if (!metaslab_bias_enabled) { 5174 mg->mg_bias = 0; 5175 } 5176 5177 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 5178 mg->mg_aliquot + mg->mg_bias) { 5179 mc->mc_rotor = mg->mg_next; 5180 mc->mc_aliquot = 0; 5181 } 5182 5183 DVA_SET_VDEV(&dva[d], vd->vdev_id); 5184 DVA_SET_OFFSET(&dva[d], offset); 5185 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 5186 DVA_SET_ASIZE(&dva[d], asize); 5187 5188 return (0); 5189 } 5190 next: 5191 mc->mc_rotor = mg->mg_next; 5192 mc->mc_aliquot = 0; 5193 } while ((mg = mg->mg_next) != rotor); 5194 5195 /* 5196 * If we haven't tried hard, do so now.
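* Trying hard re-evaluates vdev_allocatable() while holding the SCL_ZIO config lock, bypasses the group-eligibility check above, stops avoiding error-prone non-redundant vdevs, and passes want_unique as B_FALSE so DVAs of the same BP may share a metaslab.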
5197 */ 5198 if (!try_hard) { 5199 try_hard = B_TRUE; 5200 goto top; 5201 } 5202 5203 bzero(&dva[d], sizeof (dva_t)); 5204 5205 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 5206 return (SET_ERROR(ENOSPC)); 5207 } 5208 5209 void 5210 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 5211 boolean_t checkpoint) 5212 { 5213 metaslab_t *msp; 5214 spa_t *spa = vd->vdev_spa; 5215 5216 ASSERT(vdev_is_concrete(vd)); 5217 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5218 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 5219 5220 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5221 5222 VERIFY(!msp->ms_condensing); 5223 VERIFY3U(offset, >=, msp->ms_start); 5224 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 5225 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5226 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 5227 5228 metaslab_check_free_impl(vd, offset, asize); 5229 5230 mutex_enter(&msp->ms_lock); 5231 if (range_tree_is_empty(msp->ms_freeing) && 5232 range_tree_is_empty(msp->ms_checkpointing)) { 5233 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 5234 } 5235 5236 if (checkpoint) { 5237 ASSERT(spa_has_checkpoint(spa)); 5238 range_tree_add(msp->ms_checkpointing, offset, asize); 5239 } else { 5240 range_tree_add(msp->ms_freeing, offset, asize); 5241 } 5242 mutex_exit(&msp->ms_lock); 5243 } 5244 5245 /* ARGSUSED */ 5246 void 5247 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5248 uint64_t size, void *arg) 5249 { 5250 boolean_t *checkpoint = arg; 5251 5252 ASSERT3P(checkpoint, !=, NULL); 5253 5254 if (vd->vdev_ops->vdev_op_remap != NULL) 5255 vdev_indirect_mark_obsolete(vd, offset, size); 5256 else 5257 metaslab_free_impl(vd, offset, size, *checkpoint); 5258 } 5259 5260 static void 5261 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 5262 boolean_t checkpoint) 5263 { 5264 spa_t *spa = vd->vdev_spa; 5265 5266 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5267 5268 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 5269 return; 5270 5271 if (spa->spa_vdev_removal != NULL && 5272 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 5273 vdev_is_concrete(vd)) { 5274 /* 5275 * Note: we check if the vdev is concrete because when 5276 * we complete the removal, we first change the vdev to be 5277 * an indirect vdev (in open context), and then (in syncing 5278 * context) clear spa_vdev_removal. 5279 */ 5280 free_from_removing_vdev(vd, offset, size); 5281 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 5282 vdev_indirect_mark_obsolete(vd, offset, size); 5283 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5284 metaslab_free_impl_cb, &checkpoint); 5285 } else { 5286 metaslab_free_concrete(vd, offset, size, checkpoint); 5287 } 5288 } 5289 5290 typedef struct remap_blkptr_cb_arg { 5291 blkptr_t *rbca_bp; 5292 spa_remap_cb_t rbca_cb; 5293 vdev_t *rbca_remap_vd; 5294 uint64_t rbca_remap_offset; 5295 void *rbca_cb_arg; 5296 } remap_blkptr_cb_arg_t; 5297 5298 void 5299 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5300 uint64_t size, void *arg) 5301 { 5302 remap_blkptr_cb_arg_t *rbca = arg; 5303 blkptr_t *bp = rbca->rbca_bp; 5304 5305 /* We can not remap split blocks. 
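* A split block's indirect DVA maps to more than one segment, so this callback is invoked with a size smaller than the DVA's allocated size and the check below leaves the BP untouched.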
*/ 5306 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 5307 return; 5308 ASSERT0(inner_offset); 5309 5310 if (rbca->rbca_cb != NULL) { 5311 /* 5312 * At this point we know that we are not handling split 5313 * blocks and we invoke the callback on the previous 5314 * vdev which must be indirect. 5315 */ 5316 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 5317 5318 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 5319 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 5320 5321 /* set up remap_blkptr_cb_arg for the next call */ 5322 rbca->rbca_remap_vd = vd; 5323 rbca->rbca_remap_offset = offset; 5324 } 5325 5326 /* 5327 * The phys birth time is that of dva[0]. This ensures that we know 5328 * when each dva was written, so that resilver can determine which 5329 * blocks need to be scrubbed (i.e. those written during the time 5330 * the vdev was offline). It also ensures that the key used in 5331 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 5332 * we didn't change the phys_birth, a lookup in the ARC for a 5333 * remapped BP could find the data that was previously stored at 5334 * this vdev + offset. 5335 */ 5336 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 5337 DVA_GET_VDEV(&bp->blk_dva[0])); 5338 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 5339 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 5340 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 5341 5342 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 5343 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 5344 } 5345 5346 /* 5347 * If the block pointer contains any indirect DVAs, modify them to refer to 5348 * concrete DVAs. Note that this will sometimes not be possible, leaving 5349 * the indirect DVA in place. This happens if the indirect DVA spans multiple 5350 * segments in the mapping (i.e. it is a "split block"). 5351 * 5352 * If the BP was remapped, calls the callback on the original dva (note the 5353 * callback can be called multiple times if the original indirect DVA refers 5354 * to another indirect DVA, etc). 5355 * 5356 * Returns TRUE if the BP was remapped. 5357 */ 5358 boolean_t 5359 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 5360 { 5361 remap_blkptr_cb_arg_t rbca; 5362 5363 if (!zfs_remap_blkptr_enable) 5364 return (B_FALSE); 5365 5366 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 5367 return (B_FALSE); 5368 5369 /* 5370 * Dedup BP's can not be remapped, because ddt_phys_select() depends 5371 * on DVA[0] being the same in the BP as in the DDT (dedup table). 5372 */ 5373 if (BP_GET_DEDUP(bp)) 5374 return (B_FALSE); 5375 5376 /* 5377 * Gang blocks can not be remapped, because 5378 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 5379 * the BP used to read the gang block header (GBH) being the same 5380 * as the DVA[0] that we allocated for the GBH. 5381 */ 5382 if (BP_IS_GANG(bp)) 5383 return (B_FALSE); 5384 5385 /* 5386 * Embedded BP's have no DVA to remap. 5387 */ 5388 if (BP_GET_NDVAS(bp) < 1) 5389 return (B_FALSE); 5390 5391 /* 5392 * Note: we only remap dva[0]. If we remapped other dvas, we 5393 * would no longer know what their phys birth txg is. 
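* (A block pointer carries a single phys_birth, which remap_blkptr_cb() above rewrites from dva[0]'s indirect birth records.)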
5394 */ 5395 dva_t *dva = &bp->blk_dva[0]; 5396 5397 uint64_t offset = DVA_GET_OFFSET(dva); 5398 uint64_t size = DVA_GET_ASIZE(dva); 5399 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 5400 5401 if (vd->vdev_ops->vdev_op_remap == NULL) 5402 return (B_FALSE); 5403 5404 rbca.rbca_bp = bp; 5405 rbca.rbca_cb = callback; 5406 rbca.rbca_remap_vd = vd; 5407 rbca.rbca_remap_offset = offset; 5408 rbca.rbca_cb_arg = arg; 5409 5410 /* 5411 * remap_blkptr_cb() will be called in order for each level of 5412 * indirection, until a concrete vdev is reached or a split block is 5413 * encountered. old_vd and old_offset are updated within the callback 5414 * as we go from the one indirect vdev to the next one (either concrete 5415 * or indirect again) in that order. 5416 */ 5417 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 5418 5419 /* Check if the DVA wasn't remapped because it is a split block */ 5420 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 5421 return (B_FALSE); 5422 5423 return (B_TRUE); 5424 } 5425 5426 /* 5427 * Undo the allocation of a DVA which happened in the given transaction group. 5428 */ 5429 void 5430 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 5431 { 5432 metaslab_t *msp; 5433 vdev_t *vd; 5434 uint64_t vdev = DVA_GET_VDEV(dva); 5435 uint64_t offset = DVA_GET_OFFSET(dva); 5436 uint64_t size = DVA_GET_ASIZE(dva); 5437 5438 ASSERT(DVA_IS_VALID(dva)); 5439 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5440 5441 if (txg > spa_freeze_txg(spa)) 5442 return; 5443 5444 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 5445 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 5446 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 5447 (u_longlong_t)vdev, (u_longlong_t)offset); 5448 ASSERT(0); 5449 return; 5450 } 5451 5452 ASSERT(!vd->vdev_removing); 5453 ASSERT(vdev_is_concrete(vd)); 5454 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 5455 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 5456 5457 if (DVA_GET_GANG(dva)) 5458 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5459 5460 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5461 5462 mutex_enter(&msp->ms_lock); 5463 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 5464 offset, size); 5465 msp->ms_allocating_total -= size; 5466 5467 VERIFY(!msp->ms_condensing); 5468 VERIFY3U(offset, >=, msp->ms_start); 5469 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 5470 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 5471 msp->ms_size); 5472 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5473 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 5474 range_tree_add(msp->ms_allocatable, offset, size); 5475 mutex_exit(&msp->ms_lock); 5476 } 5477 5478 /* 5479 * Free the block represented by the given DVA. 5480 */ 5481 void 5482 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 5483 { 5484 uint64_t vdev = DVA_GET_VDEV(dva); 5485 uint64_t offset = DVA_GET_OFFSET(dva); 5486 uint64_t size = DVA_GET_ASIZE(dva); 5487 vdev_t *vd = vdev_lookup_top(spa, vdev); 5488 5489 ASSERT(DVA_IS_VALID(dva)); 5490 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5491 5492 if (DVA_GET_GANG(dva)) { 5493 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5494 } 5495 5496 metaslab_free_impl(vd, offset, size, checkpoint); 5497 } 5498 5499 /* 5500 * Reserve some allocation slots. The reservation system must be called 5501 * before we call into the allocator. 
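* Slots are tracked per (class, allocator) pair: with mc_alloc_max_slots[allocator] at, say, 64 and 60 slots already reserved, only 4 remain available.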
If there aren't any available slots 5502 * then the I/O will be throttled until an I/O completes and its slots are 5503 * freed up. The function returns true if it was successful in placing 5504 * the reservation. 5505 */ 5506 boolean_t 5507 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 5508 zio_t *zio, int flags) 5509 { 5510 uint64_t available_slots = 0; 5511 boolean_t slot_reserved = B_FALSE; 5512 uint64_t max = mc->mc_alloc_max_slots[allocator]; 5513 5514 ASSERT(mc->mc_alloc_throttle_enabled); 5515 mutex_enter(&mc->mc_lock); 5516 5517 uint64_t reserved_slots = 5518 zfs_refcount_count(&mc->mc_alloc_slots[allocator]); 5519 if (reserved_slots < max) 5520 available_slots = max - reserved_slots; 5521 5522 if (slots <= available_slots || GANG_ALLOCATION(flags) || 5523 flags & METASLAB_MUST_RESERVE) { 5524 /* 5525 * We reserve the slots individually so that we can unreserve 5526 * them individually when an I/O completes. 5527 */ 5528 zfs_refcount_add_few(&mc->mc_alloc_slots[allocator], slots, 5529 zio); 5530 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 5531 slot_reserved = B_TRUE; 5532 } 5533 5534 mutex_exit(&mc->mc_lock); 5535 return (slot_reserved); 5536 } 5537 5538 void 5539 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 5540 int allocator, zio_t *zio) 5541 { 5542 ASSERT(mc->mc_alloc_throttle_enabled); 5543 mutex_enter(&mc->mc_lock); 5544 zfs_refcount_remove_few(&mc->mc_alloc_slots[allocator], slots, zio); 5545 mutex_exit(&mc->mc_lock); 5546 } 5547 5548 static int 5549 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 5550 uint64_t txg) 5551 { 5552 metaslab_t *msp; 5553 spa_t *spa = vd->vdev_spa; 5554 int error = 0; 5555 5556 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 5557 return (ENXIO); 5558 5559 ASSERT3P(vd->vdev_ms, !=, NULL); 5560 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5561 5562 mutex_enter(&msp->ms_lock); 5563 5564 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 5565 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 5566 /* 5567 * No need to fail in that case; someone else has activated the 5568 * metaslab, but that doesn't preclude us from using it. 
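* (metaslab_activate() returns EBUSY when the metaslab has already been activated.)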
5569 */ 5570 if (error == EBUSY) 5571 error = 0; 5572 5573 if (error == 0 && 5574 !range_tree_contains(msp->ms_allocatable, offset, size)) 5575 error = SET_ERROR(ENOENT); 5576 5577 if (error || txg == 0) { /* txg == 0 indicates dry run */ 5578 mutex_exit(&msp->ms_lock); 5579 return (error); 5580 } 5581 5582 VERIFY(!msp->ms_condensing); 5583 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 5584 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 5585 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 5586 msp->ms_size); 5587 range_tree_remove(msp->ms_allocatable, offset, size); 5588 range_tree_clear(msp->ms_trim, offset, size); 5589 5590 if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ 5591 metaslab_class_t *mc = msp->ms_group->mg_class; 5592 multilist_sublist_t *mls = 5593 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 5594 if (!multilist_link_active(&msp->ms_class_txg_node)) { 5595 msp->ms_selected_txg = txg; 5596 multilist_sublist_insert_head(mls, msp); 5597 } 5598 multilist_sublist_unlock(mls); 5599 5600 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 5601 vdev_dirty(vd, VDD_METASLAB, msp, txg); 5602 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 5603 offset, size); 5604 msp->ms_allocating_total += size; 5605 } 5606 5607 mutex_exit(&msp->ms_lock); 5608 5609 return (0); 5610 } 5611 5612 typedef struct metaslab_claim_cb_arg_t { 5613 uint64_t mcca_txg; 5614 int mcca_error; 5615 } metaslab_claim_cb_arg_t; 5616 5617 /* ARGSUSED */ 5618 static void 5619 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5620 uint64_t size, void *arg) 5621 { 5622 metaslab_claim_cb_arg_t *mcca_arg = arg; 5623 5624 if (mcca_arg->mcca_error == 0) { 5625 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 5626 size, mcca_arg->mcca_txg); 5627 } 5628 } 5629 5630 int 5631 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 5632 { 5633 if (vd->vdev_ops->vdev_op_remap != NULL) { 5634 metaslab_claim_cb_arg_t arg; 5635 5636 /* 5637 * Only zdb(8) can claim on indirect vdevs. This is used 5638 * to detect leaks of mapped space (that are not accounted 5639 * for in the obsolete counts, spacemap, or bpobj). 5640 */ 5641 ASSERT(!spa_writeable(vd->vdev_spa)); 5642 arg.mcca_error = 0; 5643 arg.mcca_txg = txg; 5644 5645 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5646 metaslab_claim_impl_cb, &arg); 5647 5648 if (arg.mcca_error == 0) { 5649 arg.mcca_error = metaslab_claim_concrete(vd, 5650 offset, size, txg); 5651 } 5652 return (arg.mcca_error); 5653 } else { 5654 return (metaslab_claim_concrete(vd, offset, size, txg)); 5655 } 5656 } 5657 5658 /* 5659 * Intent log support: upon opening the pool after a crash, notify the SPA 5660 * of blocks that the intent log has allocated for immediate write, but 5661 * which are still considered free by the SPA because the last transaction 5662 * group didn't commit yet. 
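* Claiming such a block removes it from ms_allocatable and re-adds it to the allocating tree for the given txg, so a replayed intent-log block cannot be handed out again.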
5663 */ 5664 static int 5665 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 5666 { 5667 uint64_t vdev = DVA_GET_VDEV(dva); 5668 uint64_t offset = DVA_GET_OFFSET(dva); 5669 uint64_t size = DVA_GET_ASIZE(dva); 5670 vdev_t *vd; 5671 5672 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 5673 return (SET_ERROR(ENXIO)); 5674 } 5675 5676 ASSERT(DVA_IS_VALID(dva)); 5677 5678 if (DVA_GET_GANG(dva)) 5679 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5680 5681 return (metaslab_claim_impl(vd, offset, size, txg)); 5682 } 5683 5684 int 5685 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 5686 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 5687 zio_alloc_list_t *zal, zio_t *zio, int allocator) 5688 { 5689 dva_t *dva = bp->blk_dva; 5690 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; 5691 int error = 0; 5692 5693 ASSERT(bp->blk_birth == 0); 5694 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 5695 5696 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 5697 5698 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 5699 spa_config_exit(spa, SCL_ALLOC, FTAG); 5700 return (SET_ERROR(ENOSPC)); 5701 } 5702 5703 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 5704 ASSERT(BP_GET_NDVAS(bp) == 0); 5705 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 5706 ASSERT3P(zal, !=, NULL); 5707 5708 for (int d = 0; d < ndvas; d++) { 5709 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 5710 txg, flags, zal, allocator); 5711 if (error != 0) { 5712 for (d--; d >= 0; d--) { 5713 metaslab_unalloc_dva(spa, &dva[d], txg); 5714 metaslab_group_alloc_decrement(spa, 5715 DVA_GET_VDEV(&dva[d]), zio, flags, 5716 allocator, B_FALSE); 5717 bzero(&dva[d], sizeof (dva_t)); 5718 } 5719 spa_config_exit(spa, SCL_ALLOC, FTAG); 5720 return (error); 5721 } else { 5722 /* 5723 * Update the metaslab group's queue depth 5724 * based on the newly allocated dva. 5725 */ 5726 metaslab_group_alloc_increment(spa, 5727 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 5728 } 5729 5730 } 5731 ASSERT(error == 0); 5732 ASSERT(BP_GET_NDVAS(bp) == ndvas); 5733 5734 spa_config_exit(spa, SCL_ALLOC, FTAG); 5735 5736 BP_SET_BIRTH(bp, txg, txg); 5737 5738 return (0); 5739 } 5740 5741 void 5742 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 5743 { 5744 const dva_t *dva = bp->blk_dva; 5745 int ndvas = BP_GET_NDVAS(bp); 5746 5747 ASSERT(!BP_IS_HOLE(bp)); 5748 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 5749 5750 /* 5751 * If we have a checkpoint for the pool we need to make sure that 5752 * the blocks that we free that are part of the checkpoint won't be 5753 * reused until the checkpoint is discarded or we revert to it. 5754 * 5755 * The checkpoint flag is passed down the metaslab_free code path 5756 * and is set whenever we want to add a block to the checkpoint's 5757 * accounting. That is, we "checkpoint" blocks that existed at the 5758 * time the checkpoint was created and are therefore referenced by 5759 * the checkpointed uberblock. 5760 * 5761 * Note that, we don't checkpoint any blocks if the current 5762 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 5763 * normally as they will be referenced by the checkpointed uberblock. 5764 */ 5765 boolean_t checkpoint = B_FALSE; 5766 if (bp->blk_birth <= spa->spa_checkpoint_txg && 5767 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 5768 /* 5769 * At this point, if the block is part of the checkpoint 5770 * there is no way it was created in the current txg. 
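* Its birth txg is at or before spa_checkpoint_txg and therefore strictly less than the syncing txg, while a 'now' free implies a birth txg no earlier than the syncing txg (see the assertion above).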
5771 */ 5772 ASSERT(!now); 5773 ASSERT3U(spa_syncing_txg(spa), ==, txg); 5774 checkpoint = B_TRUE; 5775 } 5776 5777 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 5778 5779 for (int d = 0; d < ndvas; d++) { 5780 if (now) { 5781 metaslab_unalloc_dva(spa, &dva[d], txg); 5782 } else { 5783 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 5784 metaslab_free_dva(spa, &dva[d], checkpoint); 5785 } 5786 } 5787 5788 spa_config_exit(spa, SCL_FREE, FTAG); 5789 } 5790 5791 int 5792 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 5793 { 5794 const dva_t *dva = bp->blk_dva; 5795 int ndvas = BP_GET_NDVAS(bp); 5796 int error = 0; 5797 5798 ASSERT(!BP_IS_HOLE(bp)); 5799 5800 if (txg != 0) { 5801 /* 5802 * First do a dry run to make sure all DVAs are claimable, 5803 * so we don't have to unwind from partial failures below. 5804 */ 5805 if ((error = metaslab_claim(spa, bp, 0)) != 0) 5806 return (error); 5807 } 5808 5809 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 5810 5811 for (int d = 0; d < ndvas; d++) { 5812 error = metaslab_claim_dva(spa, &dva[d], txg); 5813 if (error != 0) 5814 break; 5815 } 5816 5817 spa_config_exit(spa, SCL_ALLOC, FTAG); 5818 5819 ASSERT(error == 0 || txg == 0); 5820 5821 return (error); 5822 } 5823 5824 /* ARGSUSED */ 5825 static void 5826 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 5827 uint64_t size, void *arg) 5828 { 5829 if (vd->vdev_ops == &vdev_indirect_ops) 5830 return; 5831 5832 metaslab_check_free_impl(vd, offset, size); 5833 } 5834 5835 static void 5836 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 5837 { 5838 metaslab_t *msp; 5839 spa_t *spa = vd->vdev_spa; 5840 5841 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 5842 return; 5843 5844 if (vd->vdev_ops->vdev_op_remap != NULL) { 5845 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5846 metaslab_check_free_impl_cb, NULL); 5847 return; 5848 } 5849 5850 ASSERT(vdev_is_concrete(vd)); 5851 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 5852 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5853 5854 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5855 5856 mutex_enter(&msp->ms_lock); 5857 if (msp->ms_loaded) { 5858 range_tree_verify_not_present(msp->ms_allocatable, 5859 offset, size); 5860 } 5861 5862 /* 5863 * Check all segments that currently exist in the freeing pipeline. 5864 * 5865 * It would intuitively make sense to also check the current allocating 5866 * tree since metaslab_unalloc_dva() exists for extents that are 5867 * allocated and freed in the same sync pass within the same txg. 5868 * Unfortunately there are places (e.g. the ZIL) where we allocate a 5869 * segment but then we free part of it within the same txg 5870 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the 5871 * current allocating tree.
5872 */ 5873 range_tree_verify_not_present(msp->ms_freeing, offset, size); 5874 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 5875 range_tree_verify_not_present(msp->ms_freed, offset, size); 5876 for (int j = 0; j < TXG_DEFER_SIZE; j++) 5877 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 5878 range_tree_verify_not_present(msp->ms_trim, offset, size); 5879 mutex_exit(&msp->ms_lock); 5880 } 5881 5882 void 5883 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 5884 { 5885 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 5886 return; 5887 5888 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 5889 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 5890 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 5891 vdev_t *vd = vdev_lookup_top(spa, vdev); 5892 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 5893 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 5894 5895 if (DVA_GET_GANG(&bp->blk_dva[i])) 5896 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5897 5898 ASSERT3P(vd, !=, NULL); 5899 5900 metaslab_check_free_impl(vd, offset, size); 5901 } 5902 spa_config_exit(spa, SCL_VDEV, FTAG); 5903 } 5904 5905 static void 5906 metaslab_group_disable_wait(metaslab_group_t *mg) 5907 { 5908 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 5909 while (mg->mg_disabled_updating) { 5910 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 5911 } 5912 } 5913 5914 static void 5915 metaslab_group_disabled_increment(metaslab_group_t *mg) 5916 { 5917 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 5918 ASSERT(mg->mg_disabled_updating); 5919 5920 while (mg->mg_ms_disabled >= max_disabled_ms) { 5921 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 5922 } 5923 mg->mg_ms_disabled++; 5924 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); 5925 } 5926 5927 /* 5928 * Mark the metaslab as disabled to prevent any allocations on this metaslab. 5929 * We must also track how many metaslabs are currently disabled within a 5930 * metaslab group and limit them to prevent allocation failures from 5931 * occurring because all metaslabs are disabled. 5932 */ 5933 void 5934 metaslab_disable(metaslab_t *msp) 5935 { 5936 ASSERT(!MUTEX_HELD(&msp->ms_lock)); 5937 metaslab_group_t *mg = msp->ms_group; 5938 5939 mutex_enter(&mg->mg_ms_disabled_lock); 5940 5941 /* 5942 * To keep an accurate count of how many threads have disabled 5943 * a specific metaslab group, we only allow one thread to mark 5944 * the metaslab group at a time. This ensures that the value of 5945 * ms_disabled will be accurate when we decide to mark a metaslab 5946 * group as disabled. To do this we force all other threads 5947 * to wait until the metaslab group's mg_disabled_updating flag is no 5948 * longer set. 5949 */ 5950 metaslab_group_disable_wait(mg); 5951 mg->mg_disabled_updating = B_TRUE; 5952 if (msp->ms_disabled == 0) { 5953 metaslab_group_disabled_increment(mg); 5954 } 5955 mutex_enter(&msp->ms_lock); 5956 msp->ms_disabled++; 5957 mutex_exit(&msp->ms_lock); 5958 5959 mg->mg_disabled_updating = B_FALSE; 5960 cv_broadcast(&mg->mg_ms_disabled_cv); 5961 mutex_exit(&mg->mg_ms_disabled_lock); 5962 } 5963 5964 void 5965 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) 5966 { 5967 metaslab_group_t *mg = msp->ms_group; 5968 spa_t *spa = mg->mg_vd->vdev_spa; 5969 5970 /* 5971 * Wait for the outstanding IO to be synced to prevent newly 5972 * allocated blocks from being overwritten. This is used by 5973 * initialize and TRIM, which are modifying unallocated space.
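* Once the last disable reference is dropped below, waiters are woken and, if requested by the caller, the metaslab is unloaded.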
5974 */ 5975 if (sync) 5976 txg_wait_synced(spa_get_dsl(spa), 0); 5977 5978 mutex_enter(&mg->mg_ms_disabled_lock); 5979 mutex_enter(&msp->ms_lock); 5980 if (--msp->ms_disabled == 0) { 5981 mg->mg_ms_disabled--; 5982 cv_broadcast(&mg->mg_ms_disabled_cv); 5983 if (unload) 5984 metaslab_unload(msp); 5985 } 5986 mutex_exit(&msp->ms_lock); 5987 mutex_exit(&mg->mg_ms_disabled_lock); 5988 } 5989 5990 static void 5991 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) 5992 { 5993 vdev_t *vd = ms->ms_group->mg_vd; 5994 spa_t *spa = vd->vdev_spa; 5995 objset_t *mos = spa_meta_objset(spa); 5996 5997 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 5998 5999 metaslab_unflushed_phys_t entry = { 6000 .msp_unflushed_txg = metaslab_unflushed_txg(ms), 6001 }; 6002 uint64_t entry_size = sizeof (entry); 6003 uint64_t entry_offset = ms->ms_id * entry_size; 6004 6005 uint64_t object = 0; 6006 int err = zap_lookup(mos, vd->vdev_top_zap, 6007 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 6008 &object); 6009 if (err == ENOENT) { 6010 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, 6011 SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); 6012 VERIFY0(zap_add(mos, vd->vdev_top_zap, 6013 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 6014 &object, tx)); 6015 } else { 6016 VERIFY0(err); 6017 } 6018 6019 dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, 6020 &entry, tx); 6021 } 6022 6023 void 6024 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) 6025 { 6026 spa_t *spa = ms->ms_group->mg_vd->vdev_spa; 6027 6028 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 6029 return; 6030 6031 ms->ms_unflushed_txg = txg; 6032 metaslab_update_ondisk_flush_data(ms, tx); 6033 } 6034 6035 uint64_t 6036 metaslab_unflushed_txg(metaslab_t *ms) 6037 { 6038 return (ms->ms_unflushed_txg); 6039 } 6040
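/*
 * Illustrative sketch (not part of the driver; compiled out below): one way
 * the per-metaslab unflushed-txg entries written by
 * metaslab_update_ondisk_flush_data() could be read back from the MOS.
 * The helper name is hypothetical; it only assumes the layout used above,
 * i.e. the entry for metaslab N starts at byte offset
 * N * sizeof (metaslab_unflushed_phys_t) in the object referenced from the
 * vdev's top-level ZAP.
 */
#if 0
static int
metaslab_read_unflushed_txg_example(vdev_t *vd, uint64_t ms_id, uint64_t *txgp)
{
	objset_t *mos = spa_meta_objset(vd->vdev_spa);
	metaslab_unflushed_phys_t entry;
	uint64_t object;
	int err;

	/* The array object is referenced from the vdev's top-level ZAP. */
	err = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &object);
	if (err != 0)
		return (err);

	/* Entry for metaslab N lives at byte offset N * sizeof (entry). */
	err = dmu_read(mos, object, ms_id * sizeof (entry), sizeof (entry),
	    &entry, DMU_READ_PREFETCH);
	if (err != 0)
		return (err);

	*txgp = entry.msp_unflushed_txg;
	return (0);
}
#endif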