1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright (c) 2017, Intel Corporation. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/space_map.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zfeature.h> 38 #include <sys/vdev_indirect_mapping.h> 39 #include <sys/zap.h> 40 41 #define GANG_ALLOCATION(flags) \ 42 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 43 44 uint64_t metaslab_aliquot = 512ULL << 10; 45 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 46 47 /* 48 * In pools where the log space map feature is not enabled we touch 49 * multiple metaslabs (and their respective space maps) with each 50 * transaction group. Thus, we benefit from having a small space map 51 * block size since it allows us to issue more I/O operations scattered 52 * around the disk. So a sane default for the space map block size 53 * is 8~16K. 54 */ 55 int zfs_metaslab_sm_blksz_no_log = (1 << 14); 56 57 /* 58 * When the log space map feature is enabled, we accumulate a lot of 59 * changes per metaslab that are flushed once in a while so we benefit 60 * from a bigger block size like 128K for the metaslab space maps. 61 */ 62 int zfs_metaslab_sm_blksz_with_log = (1 << 17); 63 64 /* 65 * The in-core space map representation is more compact than its on-disk form. 66 * The zfs_condense_pct determines how much more compact the in-core 67 * space map representation must be before we compact it on-disk. 68 * Values should be greater than or equal to 100. 69 */ 70 int zfs_condense_pct = 200; 71 72 /* 73 * Condensing a metaslab is not guaranteed to actually reduce the amount of 74 * space used on disk. In particular, a space map uses data in increments of 75 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 76 * same number of blocks after condensing. Since the goal of condensing is to 77 * reduce the number of IOPs required to read the space map, we only want to 78 * condense when we can be sure we will reduce the number of blocks used by the 79 * space map. Unfortunately, we cannot precisely compute whether or not this is 80 * the case in metaslab_should_condense since we are holding ms_lock. 
Instead, 81 * we apply the following heuristic: do not condense a spacemap unless the 82 * uncondensed size consumes more than zfs_metaslab_condense_block_threshold 83 * blocks. 84 */ 85 int zfs_metaslab_condense_block_threshold = 4; 86 87 /* 88 * The zfs_mg_noalloc_threshold defines which metaslab groups should 89 * be eligible for allocation. The value is defined as a percentage of 90 * free space. Metaslab groups that have more free space than 91 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 92 * a metaslab group's free space is less than or equal to the 93 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 94 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 95 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 96 * groups are allowed to accept allocations. Gang blocks are always 97 * eligible to allocate on any metaslab group. The default value of 0 means 98 * no metaslab group will be excluded based on this criterion. 99 */ 100 int zfs_mg_noalloc_threshold = 0; 101 102 /* 103 * Metaslab groups are considered eligible for allocations if their 104 * fragmentation metric (measured as a percentage) is less than or 105 * equal to zfs_mg_fragmentation_threshold. If a metaslab group 106 * exceeds this threshold then it will be skipped unless all metaslab 107 * groups within the metaslab class have also crossed this threshold. 108 * 109 * This tunable was introduced to avoid edge cases where we continue 110 * allocating from very fragmented disks in our pool while other, less 111 * fragmented disks exist. On the other hand, if all disks in the 112 * pool are uniformly approaching the threshold, the threshold can 113 * be a speed bump in performance, where we keep switching the disks 114 * that we allocate from (e.g. we allocate some segments from disk A, 115 * pushing it past the threshold, while frees on disk 116 * B bring its fragmentation back below the threshold). 117 * 118 * Empirically, we've seen that our vdev selection for allocations is 119 * good enough that fragmentation increases uniformly across all vdevs 120 * the majority of the time. Thus we set the threshold percentage high 121 * enough to avoid hitting the speed bump on pools that are being pushed 122 * to the edge. 123 */ 124 int zfs_mg_fragmentation_threshold = 95; 125 126 /* 127 * Allow metaslabs to keep their active state as long as their fragmentation 128 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 129 * active metaslab that exceeds this threshold will no longer keep its active 130 * status, allowing better metaslabs to be selected. 131 */ 132 int zfs_metaslab_fragmentation_threshold = 70; 133 134 /* 135 * When set will load all metaslabs when pool is first opened. 136 */ 137 int metaslab_debug_load = 0; 138 139 /* 140 * When set will prevent metaslabs from being unloaded. 141 */ 142 int metaslab_debug_unload = 0; 143 144 /* 145 * Minimum size which forces the dynamic allocator to change 146 * its allocation strategy. Once the space map cannot satisfy 147 * an allocation of this size then it switches to using a more 148 * aggressive strategy (i.e. search by size rather than by offset). 149 */ 150 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 151 152 /* 153 * The minimum free space, in percent, which must be available 154 * in a space map to continue allocations in a first-fit fashion.
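 * For example, with the default of 4 percent, a 10 GB metaslab keeps
 * allocating in a first-fit fashion while roughly 400 MB or more of it
 * remains free.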
155 * Once the space map's free space drops below this level we dynamically 156 * switch to using best-fit allocations. 157 */ 158 int metaslab_df_free_pct = 4; 159 160 /* 161 * Maximum distance to search forward from the last offset. Without this 162 * limit, fragmented pools can see >100,000 iterations and 163 * metaslab_block_picker() becomes the performance limiting factor on 164 * high-performance storage. 165 * 166 * With the default setting of 16MB, we typically see less than 500 167 * iterations, even with very fragmented, ashift=9 pools. The maximum number 168 * of iterations possible is: 169 * metaslab_df_max_search / (2 * (1<<ashift)) 170 * With the default setting of 16MB this is 16*1024 (with ashift=9) or 171 * 2048 (with ashift=12). 172 */ 173 int metaslab_df_max_search = 16 * 1024 * 1024; 174 175 /* 176 * If we are not searching forward (due to metaslab_df_max_search, 177 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable 178 * controls what segment is used. If it is set, we will use the largest free 179 * segment. If it is not set, we will use a segment of exactly the requested 180 * size (or larger). 181 */ 182 int metaslab_df_use_largest_segment = B_FALSE; 183 184 /* 185 * A metaslab is considered "free" if it contains a contiguous 186 * segment which is greater than metaslab_min_alloc_size. 187 */ 188 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 189 190 /* 191 * Percentage of all cpus that can be used by the metaslab taskq. 192 */ 193 int metaslab_load_pct = 50; 194 195 /* 196 * These tunables control how long a metaslab will remain loaded after the 197 * last allocation from it. A metaslab can't be unloaded until at least 198 * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds 199 * have elapsed. However, zfs_metaslab_mem_limit may cause it to be 200 * unloaded sooner. These settings are intended to be generous -- to keep 201 * metaslabs loaded for a long time, reducing the rate of metaslab loading. 202 */ 203 int metaslab_unload_delay = 32; 204 int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ 205 206 /* 207 * Max number of metaslabs per group to preload. 208 */ 209 int metaslab_preload_limit = 10; 210 211 /* 212 * Enable/disable preloading of metaslab. 213 */ 214 boolean_t metaslab_preload_enabled = B_TRUE; 215 216 /* 217 * Enable/disable fragmentation weighting on metaslabs. 218 */ 219 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 220 221 /* 222 * Enable/disable lba weighting (i.e. outer tracks are given preference). 223 */ 224 boolean_t metaslab_lba_weighting_enabled = B_TRUE; 225 226 /* 227 * Enable/disable metaslab group biasing. 228 */ 229 boolean_t metaslab_bias_enabled = B_TRUE; 230 231 /* 232 * Enable/disable remapping of indirect DVAs to their concrete vdevs. 233 */ 234 boolean_t zfs_remap_blkptr_enable = B_TRUE; 235 236 /* 237 * Enable/disable segment-based metaslab selection. 238 */ 239 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; 240 241 /* 242 * When using segment-based metaslab selection, we will continue 243 * allocating from the active metaslab until we have exhausted 244 * zfs_metaslab_switch_threshold of its buckets. 245 */ 246 int zfs_metaslab_switch_threshold = 2; 247 248 /* 249 * Internal switch to enable/disable the metaslab allocation tracing 250 * facility. 251 */ 252 boolean_t metaslab_trace_enabled = B_TRUE; 253 254 /* 255 * Maximum entries that the metaslab allocation tracing facility will keep 256 * in a given list when running in non-debug mode. 
We limit the number 257 * of entries in non-debug mode to prevent us from using up too much memory. 258 * The limit should be sufficiently large that we don't expect any allocation 259 * to every exceed this value. In debug mode, the system will panic if this 260 * limit is ever reached allowing for further investigation. 261 */ 262 uint64_t metaslab_trace_max_entries = 5000; 263 264 /* 265 * Maximum number of metaslabs per group that can be disabled 266 * simultaneously. 267 */ 268 int max_disabled_ms = 3; 269 270 /* 271 * Maximum percentage of memory to use on storing loaded metaslabs. If loading 272 * a metaslab would take it over this percentage, the oldest selected metaslab 273 * is automatically unloaded. 274 */ 275 int zfs_metaslab_mem_limit = 25; 276 277 /* 278 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. 279 * To avoid 64-bit overflow, don't set above UINT32_MAX. 280 */ 281 unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ 282 283 static uint64_t metaslab_weight(metaslab_t *); 284 static void metaslab_set_fragmentation(metaslab_t *); 285 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 286 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 287 static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 288 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); 289 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); 290 static unsigned int metaslab_idx_func(multilist_t *, void *); 291 static void metaslab_evict(metaslab_t *, uint64_t); 292 293 kmem_cache_t *metaslab_alloc_trace_cache; 294 295 /* 296 * ========================================================================== 297 * Metaslab classes 298 * ========================================================================== 299 */ 300 metaslab_class_t * 301 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 302 { 303 metaslab_class_t *mc; 304 305 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 306 307 mc->mc_spa = spa; 308 mc->mc_rotor = NULL; 309 mc->mc_ops = ops; 310 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 311 mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), 312 offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); 313 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * 314 sizeof (zfs_refcount_t), KM_SLEEP); 315 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * 316 sizeof (uint64_t), KM_SLEEP); 317 for (int i = 0; i < spa->spa_alloc_count; i++) 318 zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); 319 320 return (mc); 321 } 322 323 void 324 metaslab_class_destroy(metaslab_class_t *mc) 325 { 326 ASSERT(mc->mc_rotor == NULL); 327 ASSERT(mc->mc_alloc == 0); 328 ASSERT(mc->mc_deferred == 0); 329 ASSERT(mc->mc_space == 0); 330 ASSERT(mc->mc_dspace == 0); 331 332 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) 333 zfs_refcount_destroy(&mc->mc_alloc_slots[i]); 334 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * 335 sizeof (zfs_refcount_t)); 336 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * 337 sizeof (uint64_t)); 338 mutex_destroy(&mc->mc_lock); 339 multilist_destroy(mc->mc_metaslab_txg_list); 340 kmem_free(mc, sizeof (metaslab_class_t)); 341 } 342 343 int 344 metaslab_class_validate(metaslab_class_t *mc) 345 { 346 metaslab_group_t *mg; 347 vdev_t *vd; 348 349 /* 350 * Must hold one of the spa_config locks. 
351 */ 352 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 353 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 354 355 if ((mg = mc->mc_rotor) == NULL) 356 return (0); 357 358 do { 359 vd = mg->mg_vd; 360 ASSERT(vd->vdev_mg != NULL); 361 ASSERT3P(vd->vdev_top, ==, vd); 362 ASSERT3P(mg->mg_class, ==, mc); 363 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 364 } while ((mg = mg->mg_next) != mc->mc_rotor); 365 366 return (0); 367 } 368 369 static void 370 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 371 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 372 { 373 atomic_add_64(&mc->mc_alloc, alloc_delta); 374 atomic_add_64(&mc->mc_deferred, defer_delta); 375 atomic_add_64(&mc->mc_space, space_delta); 376 atomic_add_64(&mc->mc_dspace, dspace_delta); 377 } 378 379 uint64_t 380 metaslab_class_get_alloc(metaslab_class_t *mc) 381 { 382 return (mc->mc_alloc); 383 } 384 385 uint64_t 386 metaslab_class_get_deferred(metaslab_class_t *mc) 387 { 388 return (mc->mc_deferred); 389 } 390 391 uint64_t 392 metaslab_class_get_space(metaslab_class_t *mc) 393 { 394 return (mc->mc_space); 395 } 396 397 uint64_t 398 metaslab_class_get_dspace(metaslab_class_t *mc) 399 { 400 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 401 } 402 403 void 404 metaslab_class_histogram_verify(metaslab_class_t *mc) 405 { 406 spa_t *spa = mc->mc_spa; 407 vdev_t *rvd = spa->spa_root_vdev; 408 uint64_t *mc_hist; 409 int i; 410 411 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 412 return; 413 414 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 415 KM_SLEEP); 416 417 for (int c = 0; c < rvd->vdev_children; c++) { 418 vdev_t *tvd = rvd->vdev_child[c]; 419 metaslab_group_t *mg = tvd->vdev_mg; 420 421 /* 422 * Skip any holes, uninitialized top-levels, or 423 * vdevs that are not in this metalab class. 424 */ 425 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 426 mg->mg_class != mc) { 427 continue; 428 } 429 430 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 431 mc_hist[i] += mg->mg_histogram[i]; 432 } 433 434 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 435 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 436 437 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 438 } 439 440 /* 441 * Calculate the metaslab class's fragmentation metric. The metric 442 * is weighted based on the space contribution of each metaslab group. 443 * The return value will be a number between 0 and 100 (inclusive), or 444 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 445 * zfs_frag_table for more information about the metric. 446 */ 447 uint64_t 448 metaslab_class_fragmentation(metaslab_class_t *mc) 449 { 450 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 451 uint64_t fragmentation = 0; 452 453 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 454 455 for (int c = 0; c < rvd->vdev_children; c++) { 456 vdev_t *tvd = rvd->vdev_child[c]; 457 metaslab_group_t *mg = tvd->vdev_mg; 458 459 /* 460 * Skip any holes, uninitialized top-levels, 461 * or vdevs that are not in this metalab class. 462 */ 463 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 464 mg->mg_class != mc) { 465 continue; 466 } 467 468 /* 469 * If a metaslab group does not contain a fragmentation 470 * metric then just bail out. 
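 * Otherwise each group contributes in proportion to the space it
 * provides; e.g. a 1 TB group at 10% fragmentation and a 3 TB group at
 * 50% fragmentation give the class (1 * 10 + 3 * 50) / 4 = 40%.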
471 */ 472 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 473 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 474 return (ZFS_FRAG_INVALID); 475 } 476 477 /* 478 * Determine how much this metaslab_group is contributing 479 * to the overall pool fragmentation metric. 480 */ 481 fragmentation += mg->mg_fragmentation * 482 metaslab_group_get_space(mg); 483 } 484 fragmentation /= metaslab_class_get_space(mc); 485 486 ASSERT3U(fragmentation, <=, 100); 487 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 488 return (fragmentation); 489 } 490 491 /* 492 * Calculate the amount of expandable space that is available in 493 * this metaslab class. If a device is expanded then its expandable 494 * space will be the amount of allocatable space that is currently not 495 * part of this metaslab class. 496 */ 497 uint64_t 498 metaslab_class_expandable_space(metaslab_class_t *mc) 499 { 500 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 501 uint64_t space = 0; 502 503 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 504 for (int c = 0; c < rvd->vdev_children; c++) { 505 uint64_t tspace; 506 vdev_t *tvd = rvd->vdev_child[c]; 507 metaslab_group_t *mg = tvd->vdev_mg; 508 509 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 510 mg->mg_class != mc) { 511 continue; 512 } 513 514 /* 515 * Calculate if we have enough space to add additional 516 * metaslabs. We report the expandable space in terms 517 * of the metaslab size since that's the unit of expansion. 518 * Adjust by efi system partition size. 519 */ 520 tspace = tvd->vdev_max_asize - tvd->vdev_asize; 521 if (tspace > mc->mc_spa->spa_bootsize) { 522 tspace -= mc->mc_spa->spa_bootsize; 523 } 524 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); 525 } 526 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 527 return (space); 528 } 529 530 void 531 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) 532 { 533 multilist_t *ml = mc->mc_metaslab_txg_list; 534 for (int i = 0; i < multilist_get_num_sublists(ml); i++) { 535 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 536 metaslab_t *msp = multilist_sublist_head(mls); 537 multilist_sublist_unlock(mls); 538 while (msp != NULL) { 539 mutex_enter(&msp->ms_lock); 540 541 /* 542 * If the metaslab has been removed from the list 543 * (which could happen if we were at the memory limit 544 * and it was evicted during this loop), then we can't 545 * proceed and we should restart the sublist. 546 */ 547 if (!multilist_link_active(&msp->ms_class_txg_node)) { 548 mutex_exit(&msp->ms_lock); 549 i--; 550 break; 551 } 552 mls = multilist_sublist_lock(ml, i); 553 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 554 multilist_sublist_unlock(mls); 555 if (txg > 556 msp->ms_selected_txg + metaslab_unload_delay && 557 gethrtime() > msp->ms_selected_time + 558 (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { 559 metaslab_evict(msp, txg); 560 } else { 561 /* 562 * Once we've hit a metaslab selected too 563 * recently to evict, we're done evicting for 564 * now. 
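 * (A metaslab only becomes evictable once both metaslab_unload_delay
 * txgs and metaslab_unload_delay_ms milliseconds have elapsed since it
 * was last selected.)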
565 */ 566 mutex_exit(&msp->ms_lock); 567 break; 568 } 569 mutex_exit(&msp->ms_lock); 570 msp = next_msp; 571 } 572 } 573 } 574 575 static int 576 metaslab_compare(const void *x1, const void *x2) 577 { 578 const metaslab_t *m1 = (const metaslab_t *)x1; 579 const metaslab_t *m2 = (const metaslab_t *)x2; 580 581 int sort1 = 0; 582 int sort2 = 0; 583 if (m1->ms_allocator != -1 && m1->ms_primary) 584 sort1 = 1; 585 else if (m1->ms_allocator != -1 && !m1->ms_primary) 586 sort1 = 2; 587 if (m2->ms_allocator != -1 && m2->ms_primary) 588 sort2 = 1; 589 else if (m2->ms_allocator != -1 && !m2->ms_primary) 590 sort2 = 2; 591 592 /* 593 * Sort inactive metaslabs first, then primaries, then secondaries. When 594 * selecting a metaslab to allocate from, an allocator first tries its 595 * primary, then secondary active metaslab. If it doesn't have active 596 * metaslabs, or can't allocate from them, it searches for an inactive 597 * metaslab to activate. If it can't find a suitable one, it will steal 598 * a primary or secondary metaslab from another allocator. 599 */ 600 if (sort1 < sort2) 601 return (-1); 602 if (sort1 > sort2) 603 return (1); 604 605 int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); 606 if (likely(cmp)) 607 return (cmp); 608 609 IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); 610 611 return (AVL_CMP(m1->ms_start, m2->ms_start)); 612 } 613 614 /* 615 * ========================================================================== 616 * Metaslab groups 617 * ========================================================================== 618 */ 619 /* 620 * Update the allocatable flag and the metaslab group's capacity. 621 * The allocatable flag is set to true if the free capacity is greater 622 * than zfs_mg_noalloc_threshold and the fragmentation metric is at or 623 * below zfs_mg_fragmentation_threshold (or invalid). If a metaslab group 624 * transitions from allocatable to non-allocatable or vice versa then the 625 * metaslab group's class is updated to reflect the transition. 626 */ 627 static void 628 metaslab_group_alloc_update(metaslab_group_t *mg) 629 { 630 vdev_t *vd = mg->mg_vd; 631 metaslab_class_t *mc = mg->mg_class; 632 vdev_stat_t *vs = &vd->vdev_stat; 633 boolean_t was_allocatable; 634 boolean_t was_initialized; 635 636 ASSERT(vd == vd->vdev_top); 637 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, 638 SCL_ALLOC); 639 640 mutex_enter(&mg->mg_lock); 641 was_allocatable = mg->mg_allocatable; 642 was_initialized = mg->mg_initialized; 643 644 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 645 (vs->vs_space + 1); 646 647 mutex_enter(&mc->mc_lock); 648 649 /* 650 * If the metaslab group was just added then it won't 651 * have any space until we finish syncing out this txg. 652 * At that point we will consider it initialized and available 653 * for allocations. We also don't consider non-activated 654 * metaslab groups (e.g. vdevs that are in the middle of being removed) 655 * to be initialized, because they can't be used for allocation. 656 */ 657 mg->mg_initialized = metaslab_group_initialized(mg); 658 if (!was_initialized && mg->mg_initialized) { 659 mc->mc_groups++; 660 } else if (was_initialized && !mg->mg_initialized) { 661 ASSERT3U(mc->mc_groups, >, 0); 662 mc->mc_groups--; 663 } 664 if (mg->mg_initialized) 665 mg->mg_no_free_space = B_FALSE; 666 667 /* 668 * A metaslab group is considered allocatable if it has plenty 669 * of free space or is not heavily fragmented.
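 * For instance, a group with 800 GB free out of 1 TB of space has an
 * mg_free_capacity of roughly 80 and stays allocatable under the default
 * zfs_mg_noalloc_threshold of 0.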
We only take 670 * fragmentation into account if the metaslab group has a valid 671 * fragmentation metric (i.e. a value between 0 and 100). 672 */ 673 mg->mg_allocatable = (mg->mg_activation_count > 0 && 674 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 675 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 676 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 677 678 /* 679 * The mc_alloc_groups maintains a count of the number of 680 * groups in this metaslab class that are still above the 681 * zfs_mg_noalloc_threshold. This is used by the allocating 682 * threads to determine if they should avoid allocations to 683 * a given group. The allocator will avoid allocations to a group 684 * if that group has reached or is below the zfs_mg_noalloc_threshold 685 * and there are still other groups that are above the threshold. 686 * When a group transitions from allocatable to non-allocatable or 687 * vice versa we update the metaslab class to reflect that change. 688 * When the mc_alloc_groups value drops to 0 that means that all 689 * groups have reached the zfs_mg_noalloc_threshold making all groups 690 * eligible for allocations. This effectively means that all devices 691 * are balanced again. 692 */ 693 if (was_allocatable && !mg->mg_allocatable) 694 mc->mc_alloc_groups--; 695 else if (!was_allocatable && mg->mg_allocatable) 696 mc->mc_alloc_groups++; 697 mutex_exit(&mc->mc_lock); 698 699 mutex_exit(&mg->mg_lock); 700 } 701 702 int 703 metaslab_sort_by_flushed(const void *va, const void *vb) 704 { 705 const metaslab_t *a = va; 706 const metaslab_t *b = vb; 707 708 int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); 709 if (likely(cmp)) 710 return (cmp); 711 712 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; 713 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; 714 cmp = AVL_CMP(a_vdev_id, b_vdev_id); 715 if (cmp) 716 return (cmp); 717 718 return (AVL_CMP(a->ms_id, b->ms_id)); 719 } 720 721 metaslab_group_t * 722 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 723 { 724 metaslab_group_t *mg; 725 726 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 727 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 728 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); 729 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); 730 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 731 KM_SLEEP); 732 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 733 KM_SLEEP); 734 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 735 sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); 736 mg->mg_vd = vd; 737 mg->mg_class = mc; 738 mg->mg_activation_count = 0; 739 mg->mg_initialized = B_FALSE; 740 mg->mg_no_free_space = B_TRUE; 741 mg->mg_allocators = allocators; 742 743 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * 744 sizeof (zfs_refcount_t), KM_SLEEP); 745 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * 746 sizeof (uint64_t), KM_SLEEP); 747 for (int i = 0; i < allocators; i++) { 748 zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); 749 mg->mg_cur_max_alloc_queue_depth[i] = 0; 750 } 751 752 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 753 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 754 755 return (mg); 756 } 757 758 void 759 metaslab_group_destroy(metaslab_group_t *mg) 760 { 761 ASSERT(mg->mg_prev == NULL); 762 ASSERT(mg->mg_next == NULL); 763 /* 764 * We may have gone below zero with the activation count 765 * either because we never activated in 
the first place or 766 * because we're done, and possibly removing the vdev. 767 */ 768 ASSERT(mg->mg_activation_count <= 0); 769 770 taskq_destroy(mg->mg_taskq); 771 avl_destroy(&mg->mg_metaslab_tree); 772 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); 773 kmem_free(mg->mg_secondaries, mg->mg_allocators * 774 sizeof (metaslab_t *)); 775 mutex_destroy(&mg->mg_lock); 776 mutex_destroy(&mg->mg_ms_disabled_lock); 777 cv_destroy(&mg->mg_ms_disabled_cv); 778 779 for (int i = 0; i < mg->mg_allocators; i++) { 780 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); 781 mg->mg_cur_max_alloc_queue_depth[i] = 0; 782 } 783 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * 784 sizeof (zfs_refcount_t)); 785 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * 786 sizeof (uint64_t)); 787 788 kmem_free(mg, sizeof (metaslab_group_t)); 789 } 790 791 void 792 metaslab_group_activate(metaslab_group_t *mg) 793 { 794 metaslab_class_t *mc = mg->mg_class; 795 metaslab_group_t *mgprev, *mgnext; 796 797 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 798 799 ASSERT(mc->mc_rotor != mg); 800 ASSERT(mg->mg_prev == NULL); 801 ASSERT(mg->mg_next == NULL); 802 ASSERT(mg->mg_activation_count <= 0); 803 804 if (++mg->mg_activation_count <= 0) 805 return; 806 807 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 808 metaslab_group_alloc_update(mg); 809 810 if ((mgprev = mc->mc_rotor) == NULL) { 811 mg->mg_prev = mg; 812 mg->mg_next = mg; 813 } else { 814 mgnext = mgprev->mg_next; 815 mg->mg_prev = mgprev; 816 mg->mg_next = mgnext; 817 mgprev->mg_next = mg; 818 mgnext->mg_prev = mg; 819 } 820 mc->mc_rotor = mg; 821 } 822 823 /* 824 * Passivate a metaslab group and remove it from the allocation rotor. 825 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 826 * a metaslab group. This function will momentarily drop spa_config_locks 827 * that are lower than the SCL_ALLOC lock (see comment below). 828 */ 829 void 830 metaslab_group_passivate(metaslab_group_t *mg) 831 { 832 metaslab_class_t *mc = mg->mg_class; 833 spa_t *spa = mc->mc_spa; 834 metaslab_group_t *mgprev, *mgnext; 835 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 836 837 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 838 (SCL_ALLOC | SCL_ZIO)); 839 840 if (--mg->mg_activation_count != 0) { 841 ASSERT(mc->mc_rotor != mg); 842 ASSERT(mg->mg_prev == NULL); 843 ASSERT(mg->mg_next == NULL); 844 ASSERT(mg->mg_activation_count < 0); 845 return; 846 } 847 848 /* 849 * The spa_config_lock is an array of rwlocks, ordered as 850 * follows (from highest to lowest): 851 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 852 * SCL_ZIO > SCL_FREE > SCL_VDEV 853 * (For more information about the spa_config_lock see spa_misc.c) 854 * The higher the lock, the broader its coverage. When we passivate 855 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 856 * config locks. However, the metaslab group's taskq might be trying 857 * to preload metaslabs so we must drop the SCL_ZIO lock and any 858 * lower locks to allow the I/O to complete. At a minimum, 859 * we continue to hold the SCL_ALLOC lock, which prevents any future 860 * allocations from taking place and any changes to the vdev tree. 
861 */ 862 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 863 taskq_wait(mg->mg_taskq); 864 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 865 metaslab_group_alloc_update(mg); 866 for (int i = 0; i < mg->mg_allocators; i++) { 867 metaslab_t *msp = mg->mg_primaries[i]; 868 if (msp != NULL) { 869 mutex_enter(&msp->ms_lock); 870 metaslab_passivate(msp, 871 metaslab_weight_from_range_tree(msp)); 872 mutex_exit(&msp->ms_lock); 873 } 874 msp = mg->mg_secondaries[i]; 875 if (msp != NULL) { 876 mutex_enter(&msp->ms_lock); 877 metaslab_passivate(msp, 878 metaslab_weight_from_range_tree(msp)); 879 mutex_exit(&msp->ms_lock); 880 } 881 } 882 883 mgprev = mg->mg_prev; 884 mgnext = mg->mg_next; 885 886 if (mg == mgnext) { 887 mc->mc_rotor = NULL; 888 } else { 889 mc->mc_rotor = mgnext; 890 mgprev->mg_next = mgnext; 891 mgnext->mg_prev = mgprev; 892 } 893 894 mg->mg_prev = NULL; 895 mg->mg_next = NULL; 896 } 897 898 boolean_t 899 metaslab_group_initialized(metaslab_group_t *mg) 900 { 901 vdev_t *vd = mg->mg_vd; 902 vdev_stat_t *vs = &vd->vdev_stat; 903 904 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 905 } 906 907 uint64_t 908 metaslab_group_get_space(metaslab_group_t *mg) 909 { 910 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 911 } 912 913 void 914 metaslab_group_histogram_verify(metaslab_group_t *mg) 915 { 916 uint64_t *mg_hist; 917 vdev_t *vd = mg->mg_vd; 918 uint64_t ashift = vd->vdev_ashift; 919 int i; 920 921 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 922 return; 923 924 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 925 KM_SLEEP); 926 927 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 928 SPACE_MAP_HISTOGRAM_SIZE + ashift); 929 930 for (int m = 0; m < vd->vdev_ms_count; m++) { 931 metaslab_t *msp = vd->vdev_ms[m]; 932 933 /* skip if not active or not a member */ 934 if (msp->ms_sm == NULL || msp->ms_group != mg) 935 continue; 936 937 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 938 mg_hist[i + ashift] += 939 msp->ms_sm->sm_phys->smp_histogram[i]; 940 } 941 942 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 943 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 944 945 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 946 } 947 948 static void 949 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 950 { 951 metaslab_class_t *mc = mg->mg_class; 952 uint64_t ashift = mg->mg_vd->vdev_ashift; 953 954 ASSERT(MUTEX_HELD(&msp->ms_lock)); 955 if (msp->ms_sm == NULL) 956 return; 957 958 mutex_enter(&mg->mg_lock); 959 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 960 mg->mg_histogram[i + ashift] += 961 msp->ms_sm->sm_phys->smp_histogram[i]; 962 mc->mc_histogram[i + ashift] += 963 msp->ms_sm->sm_phys->smp_histogram[i]; 964 } 965 mutex_exit(&mg->mg_lock); 966 } 967 968 void 969 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 970 { 971 metaslab_class_t *mc = mg->mg_class; 972 uint64_t ashift = mg->mg_vd->vdev_ashift; 973 974 ASSERT(MUTEX_HELD(&msp->ms_lock)); 975 if (msp->ms_sm == NULL) 976 return; 977 978 mutex_enter(&mg->mg_lock); 979 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 980 ASSERT3U(mg->mg_histogram[i + ashift], >=, 981 msp->ms_sm->sm_phys->smp_histogram[i]); 982 ASSERT3U(mc->mc_histogram[i + ashift], >=, 983 msp->ms_sm->sm_phys->smp_histogram[i]); 984 985 mg->mg_histogram[i + ashift] -= 986 msp->ms_sm->sm_phys->smp_histogram[i]; 987 mc->mc_histogram[i + ashift] -= 988 msp->ms_sm->sm_phys->smp_histogram[i]; 989 } 990 mutex_exit(&mg->mg_lock); 991 
} 992 993 static void 994 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 995 { 996 ASSERT(msp->ms_group == NULL); 997 mutex_enter(&mg->mg_lock); 998 msp->ms_group = mg; 999 msp->ms_weight = 0; 1000 avl_add(&mg->mg_metaslab_tree, msp); 1001 mutex_exit(&mg->mg_lock); 1002 1003 mutex_enter(&msp->ms_lock); 1004 metaslab_group_histogram_add(mg, msp); 1005 mutex_exit(&msp->ms_lock); 1006 } 1007 1008 static void 1009 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 1010 { 1011 mutex_enter(&msp->ms_lock); 1012 metaslab_group_histogram_remove(mg, msp); 1013 mutex_exit(&msp->ms_lock); 1014 1015 mutex_enter(&mg->mg_lock); 1016 ASSERT(msp->ms_group == mg); 1017 avl_remove(&mg->mg_metaslab_tree, msp); 1018 1019 metaslab_class_t *mc = msp->ms_group->mg_class; 1020 multilist_sublist_t *mls = 1021 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 1022 if (multilist_link_active(&msp->ms_class_txg_node)) 1023 multilist_sublist_remove(mls, msp); 1024 multilist_sublist_unlock(mls); 1025 1026 msp->ms_group = NULL; 1027 mutex_exit(&mg->mg_lock); 1028 } 1029 1030 static void 1031 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1032 { 1033 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1034 ASSERT(MUTEX_HELD(&mg->mg_lock)); 1035 ASSERT(msp->ms_group == mg); 1036 1037 avl_remove(&mg->mg_metaslab_tree, msp); 1038 msp->ms_weight = weight; 1039 avl_add(&mg->mg_metaslab_tree, msp); 1040 1041 } 1042 1043 static void 1044 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1045 { 1046 /* 1047 * Although in principle the weight can be any value, in 1048 * practice we do not use values in the range [1, 511]. 1049 */ 1050 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 1051 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1052 1053 mutex_enter(&mg->mg_lock); 1054 metaslab_group_sort_impl(mg, msp, weight); 1055 mutex_exit(&mg->mg_lock); 1056 } 1057 1058 /* 1059 * Calculate the fragmentation for a given metaslab group. We can use 1060 * a simple average here since all metaslabs within the group must have 1061 * the same size. The return value will be a value between 0 and 100 1062 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 1063 * group have a fragmentation metric. 1064 */ 1065 uint64_t 1066 metaslab_group_fragmentation(metaslab_group_t *mg) 1067 { 1068 vdev_t *vd = mg->mg_vd; 1069 uint64_t fragmentation = 0; 1070 uint64_t valid_ms = 0; 1071 1072 for (int m = 0; m < vd->vdev_ms_count; m++) { 1073 metaslab_t *msp = vd->vdev_ms[m]; 1074 1075 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 1076 continue; 1077 if (msp->ms_group != mg) 1078 continue; 1079 1080 valid_ms++; 1081 fragmentation += msp->ms_fragmentation; 1082 } 1083 1084 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) 1085 return (ZFS_FRAG_INVALID); 1086 1087 fragmentation /= valid_ms; 1088 ASSERT3U(fragmentation, <=, 100); 1089 return (fragmentation); 1090 } 1091 1092 /* 1093 * Determine if a given metaslab group should skip allocations. A metaslab 1094 * group should avoid allocations if its free capacity is less than the 1095 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 1096 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1097 * that can still handle allocations. If the allocation throttle is enabled 1098 * then we skip allocations to devices that have reached their maximum 1099 * allocation queue depth unless the selected metaslab group is the only 1100 * eligible group remaining. 
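 * Note that the queue-depth limit is relaxed for ditto blocks: qmax is
 * scaled by (4 + d) / 4, so later DVAs of a block tolerate a somewhat
 * deeper queue (e.g. 50% deeper for d = 2).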
1101 */ 1102 static boolean_t 1103 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 1104 uint64_t psize, int allocator, int d) 1105 { 1106 spa_t *spa = mg->mg_vd->vdev_spa; 1107 metaslab_class_t *mc = mg->mg_class; 1108 1109 /* 1110 * We can only consider skipping this metaslab group if it's 1111 * in the normal metaslab class and there are other metaslab 1112 * groups to select from. Otherwise, we always consider it eligible 1113 * for allocations. 1114 */ 1115 if ((mc != spa_normal_class(spa) && 1116 mc != spa_special_class(spa) && 1117 mc != spa_dedup_class(spa)) || 1118 mc->mc_groups <= 1) 1119 return (B_TRUE); 1120 1121 /* 1122 * If the metaslab group's mg_allocatable flag is set (see comments 1123 * in metaslab_group_alloc_update() for more information) and 1124 * the allocation throttle is disabled then allow allocations to this 1125 * device. However, if the allocation throttle is enabled then 1126 * check if we have reached our allocation limit (mg_alloc_queue_depth) 1127 * to determine if we should allow allocations to this metaslab group. 1128 * If all metaslab groups are no longer considered allocatable 1129 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 1130 * gang block size then we allow allocations on this metaslab group 1131 * regardless of the mg_allocatable or throttle settings. 1132 */ 1133 if (mg->mg_allocatable) { 1134 metaslab_group_t *mgp; 1135 int64_t qdepth; 1136 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; 1137 1138 if (!mc->mc_alloc_throttle_enabled) 1139 return (B_TRUE); 1140 1141 /* 1142 * If this metaslab group does not have any free space, then 1143 * there is no point in looking further. 1144 */ 1145 if (mg->mg_no_free_space) 1146 return (B_FALSE); 1147 1148 /* 1149 * Relax allocation throttling for ditto blocks. Due to 1150 * random imbalances in allocation it tends to push copies 1151 * to one vdev, that looks a bit better at the moment. 1152 */ 1153 qmax = qmax * (4 + d) / 4; 1154 1155 qdepth = zfs_refcount_count( 1156 &mg->mg_alloc_queue_depth[allocator]); 1157 1158 /* 1159 * If this metaslab group is below its qmax or it's 1160 * the only allocatable metasable group, then attempt 1161 * to allocate from it. 1162 */ 1163 if (qdepth < qmax || mc->mc_alloc_groups == 1) 1164 return (B_TRUE); 1165 ASSERT3U(mc->mc_alloc_groups, >, 1); 1166 1167 /* 1168 * Since this metaslab group is at or over its qmax, we 1169 * need to determine if there are metaslab groups after this 1170 * one that might be able to handle this allocation. This is 1171 * racy since we can't hold the locks for all metaslab 1172 * groups at the same time when we make this check. 1173 */ 1174 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { 1175 qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; 1176 qmax = qmax * (4 + d) / 4; 1177 qdepth = zfs_refcount_count( 1178 &mgp->mg_alloc_queue_depth[allocator]); 1179 1180 /* 1181 * If there is another metaslab group that 1182 * might be able to handle the allocation, then 1183 * we return false so that we skip this group. 1184 */ 1185 if (qdepth < qmax && !mgp->mg_no_free_space) 1186 return (B_FALSE); 1187 } 1188 1189 /* 1190 * We didn't find another group to handle the allocation 1191 * so we can't skip this metaslab group even though 1192 * we are at or over our qmax. 
1193 */ 1194 return (B_TRUE); 1195 1196 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 1197 return (B_TRUE); 1198 } 1199 return (B_FALSE); 1200 } 1201 1202 /* 1203 * ========================================================================== 1204 * Range tree callbacks 1205 * ========================================================================== 1206 */ 1207 1208 /* 1209 * Comparison function for the private size-ordered tree. Tree is sorted 1210 * by size, larger sizes at the end of the tree. 1211 */ 1212 static int 1213 metaslab_rangesize_compare(const void *x1, const void *x2) 1214 { 1215 const range_seg_t *r1 = x1; 1216 const range_seg_t *r2 = x2; 1217 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1218 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1219 1220 int cmp = AVL_CMP(rs_size1, rs_size2); 1221 if (likely(cmp)) 1222 return (cmp); 1223 1224 return (AVL_CMP(r1->rs_start, r2->rs_start)); 1225 } 1226 1227 /* 1228 * ========================================================================== 1229 * Common allocator routines 1230 * ========================================================================== 1231 */ 1232 1233 /* 1234 * Return the maximum contiguous segment within the metaslab. 1235 */ 1236 uint64_t 1237 metaslab_largest_allocatable(metaslab_t *msp) 1238 { 1239 avl_tree_t *t = &msp->ms_allocatable_by_size; 1240 range_seg_t *rs; 1241 1242 if (t == NULL) 1243 return (0); 1244 rs = avl_last(t); 1245 if (rs == NULL) 1246 return (0); 1247 1248 return (rs->rs_end - rs->rs_start); 1249 } 1250 1251 /* 1252 * Return the maximum contiguous segment within the unflushed frees of this 1253 * metaslab. 1254 */ 1255 uint64_t 1256 metaslab_largest_unflushed_free(metaslab_t *msp) 1257 { 1258 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1259 1260 if (msp->ms_unflushed_frees == NULL) 1261 return (0); 1262 1263 range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); 1264 if (rs == NULL) 1265 return (0); 1266 1267 /* 1268 * When a range is freed from the metaslab, that range is added to 1269 * both the unflushed frees and the deferred frees. While the block 1270 * will eventually be usable, if the metaslab were loaded the range 1271 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE 1272 * txgs had passed. As a result, when attempting to estimate an upper 1273 * bound for the largest currently-usable free segment in the 1274 * metaslab, we need to not consider any ranges currently in the defer 1275 * trees. This algorithm approximates the largest available chunk in 1276 * the largest range in the unflushed_frees tree by taking the first 1277 * chunk. While this may be a poor estimate, it should only remain so 1278 * briefly and should eventually self-correct as frees are no longer 1279 * deferred. Similar logic applies to the ms_freed tree. See 1280 * metaslab_load() for more details. 1281 * 1282 * There are two primary sources of innacuracy in this estimate. Both 1283 * are tolerated for performance reasons. The first source is that we 1284 * only check the largest segment for overlaps. Smaller segments may 1285 * have more favorable overlaps with the other trees, resulting in 1286 * larger usable chunks. Second, we only look at the first chunk in 1287 * the largest segment; there may be other usable chunks in the 1288 * largest segment, but we ignore them. 
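 * For example, if the largest unflushed-free segment is [100M, 130M) and
 * a deferred free overlaps it starting at offset 120M, the estimate below
 * is trimmed to 20M; if the overlap begins right at 100M, we report 0.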
1289 */ 1290 uint64_t rstart = rs->rs_start; 1291 uint64_t rsize = rs->rs_end - rstart; 1292 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1293 uint64_t start = 0; 1294 uint64_t size = 0; 1295 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, 1296 rsize, &start, &size); 1297 if (found) { 1298 if (rstart == start) 1299 return (0); 1300 rsize = start - rstart; 1301 } 1302 } 1303 1304 uint64_t start = 0; 1305 uint64_t size = 0; 1306 boolean_t found = range_tree_find_in(msp->ms_freed, rstart, 1307 rsize, &start, &size); 1308 if (found) 1309 rsize = start - rstart; 1310 1311 return (rsize); 1312 } 1313 1314 static range_seg_t * 1315 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) 1316 { 1317 range_seg_t *rs, rsearch; 1318 avl_index_t where; 1319 1320 rsearch.rs_start = start; 1321 rsearch.rs_end = start + size; 1322 1323 rs = avl_find(t, &rsearch, &where); 1324 if (rs == NULL) { 1325 rs = avl_nearest(t, where, AVL_AFTER); 1326 } 1327 1328 return (rs); 1329 } 1330 1331 /* 1332 * This is a helper function that can be used by the allocator to find 1333 * a suitable block to allocate. This will search the specified AVL 1334 * tree looking for a block that matches the specified criteria. 1335 */ 1336 static uint64_t 1337 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1338 uint64_t max_search) 1339 { 1340 range_seg_t *rs = metaslab_block_find(t, *cursor, size); 1341 uint64_t first_found; 1342 1343 if (rs != NULL) 1344 first_found = rs->rs_start; 1345 1346 while (rs != NULL && rs->rs_start - first_found <= max_search) { 1347 uint64_t offset = rs->rs_start; 1348 if (offset + size <= rs->rs_end) { 1349 *cursor = offset + size; 1350 return (offset); 1351 } 1352 rs = AVL_NEXT(t, rs); 1353 } 1354 1355 *cursor = 0; 1356 return (-1ULL); 1357 } 1358 1359 /* 1360 * ========================================================================== 1361 * Dynamic Fit (df) block allocator 1362 * 1363 * Search for a free chunk of at least this size, starting from the last 1364 * offset (for this alignment of block) looking for up to 1365 * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not 1366 * found within 16MB, then return a free chunk of exactly the requested size (or 1367 * larger). 1368 * 1369 * If it seems like searching from the last offset will be unproductive, skip 1370 * that and just return a free chunk of exactly the requested size (or larger). 1371 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This 1372 * mechanism is probably not very useful and may be removed in the future. 1373 * 1374 * The behavior when not searching can be changed to return the largest free 1375 * chunk, instead of a free chunk of exactly the requested size, by setting 1376 * metaslab_df_use_largest_segment. 1377 * ========================================================================== 1378 */ 1379 static uint64_t 1380 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1381 { 1382 /* 1383 * Find the largest power of 2 block size that evenly divides the 1384 * requested size. This is used to try to allocate blocks with similar 1385 * alignment from the same area of the metaslab (i.e. same cursor 1386 * bucket) but it does not guarantee that other allocations sizes 1387 * may exist in the same region. 
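 * For example, a 12K request has align = size & -size = 4K, so it uses
 * the 4K cursor bucket, as does any other size that is an odd multiple
 * of 4K.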
1388 */ 1389 uint64_t align = size & -size; 1390 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1391 range_tree_t *rt = msp->ms_allocatable; 1392 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1393 uint64_t offset; 1394 1395 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1396 ASSERT3U(avl_numnodes(&rt->rt_root), ==, 1397 avl_numnodes(&msp->ms_allocatable_by_size)); 1398 1399 /* 1400 * If we're running low on space, find a segment based on size, 1401 * rather than iterating based on offset. 1402 */ 1403 if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || 1404 free_pct < metaslab_df_free_pct) { 1405 offset = -1; 1406 } else { 1407 offset = metaslab_block_picker(&rt->rt_root, 1408 cursor, size, metaslab_df_max_search); 1409 } 1410 1411 if (offset == -1) { 1412 range_seg_t *rs; 1413 if (metaslab_df_use_largest_segment) { 1414 /* use largest free segment */ 1415 rs = avl_last(&msp->ms_allocatable_by_size); 1416 } else { 1417 /* use segment of this size, or next largest */ 1418 rs = metaslab_block_find(&msp->ms_allocatable_by_size, 1419 0, size); 1420 } 1421 if (rs != NULL && rs->rs_start + size <= rs->rs_end) { 1422 offset = rs->rs_start; 1423 *cursor = offset + size; 1424 } 1425 } 1426 1427 return (offset); 1428 } 1429 1430 static metaslab_ops_t metaslab_df_ops = { 1431 metaslab_df_alloc 1432 }; 1433 1434 /* 1435 * ========================================================================== 1436 * Cursor fit block allocator - 1437 * Select the largest region in the metaslab, set the cursor to the beginning 1438 * of the range and the cursor_end to the end of the range. As allocations 1439 * are made advance the cursor. Continue allocating from the cursor until 1440 * the range is exhausted and then find a new range. 1441 * ========================================================================== 1442 */ 1443 static uint64_t 1444 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1445 { 1446 range_tree_t *rt = msp->ms_allocatable; 1447 avl_tree_t *t = &msp->ms_allocatable_by_size; 1448 uint64_t *cursor = &msp->ms_lbas[0]; 1449 uint64_t *cursor_end = &msp->ms_lbas[1]; 1450 uint64_t offset = 0; 1451 1452 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1453 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1454 1455 ASSERT3U(*cursor_end, >=, *cursor); 1456 1457 if ((*cursor + size) > *cursor_end) { 1458 range_seg_t *rs; 1459 1460 rs = avl_last(&msp->ms_allocatable_by_size); 1461 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1462 return (-1ULL); 1463 1464 *cursor = rs->rs_start; 1465 *cursor_end = rs->rs_end; 1466 } 1467 1468 offset = *cursor; 1469 *cursor += size; 1470 1471 return (offset); 1472 } 1473 1474 static metaslab_ops_t metaslab_cf_ops = { 1475 metaslab_cf_alloc 1476 }; 1477 1478 /* 1479 * ========================================================================== 1480 * New dynamic fit allocator - 1481 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1482 * contiguous blocks. If no region is found then just use the largest segment 1483 * that remains. 1484 * ========================================================================== 1485 */ 1486 1487 /* 1488 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1489 * to request from the allocator. 
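 * With the default shift of 4, the fallback search in metaslab_ndf_alloc()
 * looks for a free region of at least MIN(max_size,
 * 1ULL << (highbit64(size) + metaslab_ndf_clump_shift)), e.g. 256K for an
 * 8K request.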
1490 */ 1491 uint64_t metaslab_ndf_clump_shift = 4; 1492 1493 static uint64_t 1494 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1495 { 1496 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1497 avl_index_t where; 1498 range_seg_t *rs, rsearch; 1499 uint64_t hbit = highbit64(size); 1500 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1501 uint64_t max_size = metaslab_largest_allocatable(msp); 1502 1503 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1504 ASSERT3U(avl_numnodes(t), ==, 1505 avl_numnodes(&msp->ms_allocatable_by_size)); 1506 1507 if (max_size < size) 1508 return (-1ULL); 1509 1510 rsearch.rs_start = *cursor; 1511 rsearch.rs_end = *cursor + size; 1512 1513 rs = avl_find(t, &rsearch, &where); 1514 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1515 t = &msp->ms_allocatable_by_size; 1516 1517 rsearch.rs_start = 0; 1518 rsearch.rs_end = MIN(max_size, 1519 1ULL << (hbit + metaslab_ndf_clump_shift)); 1520 rs = avl_find(t, &rsearch, &where); 1521 if (rs == NULL) 1522 rs = avl_nearest(t, where, AVL_AFTER); 1523 ASSERT(rs != NULL); 1524 } 1525 1526 if ((rs->rs_end - rs->rs_start) >= size) { 1527 *cursor = rs->rs_start + size; 1528 return (rs->rs_start); 1529 } 1530 return (-1ULL); 1531 } 1532 1533 static metaslab_ops_t metaslab_ndf_ops = { 1534 metaslab_ndf_alloc 1535 }; 1536 1537 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1538 1539 /* 1540 * ========================================================================== 1541 * Metaslabs 1542 * ========================================================================== 1543 */ 1544 1545 /* 1546 * Wait for any in-progress metaslab loads to complete. 1547 */ 1548 void 1549 metaslab_load_wait(metaslab_t *msp) 1550 { 1551 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1552 1553 while (msp->ms_loading) { 1554 ASSERT(!msp->ms_loaded); 1555 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1556 } 1557 } 1558 1559 /* 1560 * Wait for any in-progress flushing to complete. 1561 */ 1562 void 1563 metaslab_flush_wait(metaslab_t *msp) 1564 { 1565 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1566 1567 while (msp->ms_flushing) 1568 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); 1569 } 1570 1571 static unsigned int 1572 metaslab_idx_func(multilist_t *ml, void *arg) 1573 { 1574 metaslab_t *msp = arg; 1575 return (msp->ms_id % multilist_get_num_sublists(ml)); 1576 } 1577 1578 uint64_t 1579 metaslab_allocated_space(metaslab_t *msp) 1580 { 1581 return (msp->ms_allocated_space); 1582 } 1583 1584 /* 1585 * Verify that the space accounting on disk matches the in-core range_trees. 1586 */ 1587 static void 1588 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 1589 { 1590 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1591 uint64_t allocating = 0; 1592 uint64_t sm_free_space, msp_free_space; 1593 1594 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1595 ASSERT(!msp->ms_condensing); 1596 1597 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1598 return; 1599 1600 /* 1601 * We can only verify the metaslab space when we're called 1602 * from syncing context with a loaded metaslab that has an 1603 * allocated space map. Calling this in non-syncing context 1604 * does not provide a consistent view of the metaslab since 1605 * we're performing allocations in the future. 1606 */ 1607 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 1608 !msp->ms_loaded) 1609 return; 1610 1611 /* 1612 * Even though the smp_alloc field can get negative, 1613 * when it comes to a metaslab's space map, that should 1614 * never be the case. 
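 * The assertions below verify the accounting identity
 *   metaslab_allocated_space() == space_map_allocated() +
 *       ms_unflushed_allocs - ms_unflushed_frees
 * and that the remaining free space equals ms_allocatable plus the
 * still-allocating ranges, ms_deferspace, and ms_freed.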
1615 */ 1616 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); 1617 1618 ASSERT3U(space_map_allocated(msp->ms_sm), >=, 1619 range_tree_space(msp->ms_unflushed_frees)); 1620 1621 ASSERT3U(metaslab_allocated_space(msp), ==, 1622 space_map_allocated(msp->ms_sm) + 1623 range_tree_space(msp->ms_unflushed_allocs) - 1624 range_tree_space(msp->ms_unflushed_frees)); 1625 1626 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); 1627 1628 /* 1629 * Account for future allocations since we would have 1630 * already deducted that space from the ms_allocatable. 1631 */ 1632 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 1633 allocating += 1634 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 1635 } 1636 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, 1637 msp->ms_allocating_total); 1638 1639 ASSERT3U(msp->ms_deferspace, ==, 1640 range_tree_space(msp->ms_defer[0]) + 1641 range_tree_space(msp->ms_defer[1])); 1642 1643 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + 1644 msp->ms_deferspace + range_tree_space(msp->ms_freed); 1645 1646 VERIFY3U(sm_free_space, ==, msp_free_space); 1647 } 1648 1649 static void 1650 metaslab_aux_histograms_clear(metaslab_t *msp) 1651 { 1652 /* 1653 * Auxiliary histograms are only cleared when resetting them, 1654 * which can only happen while the metaslab is loaded. 1655 */ 1656 ASSERT(msp->ms_loaded); 1657 1658 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1659 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1660 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); 1661 } 1662 1663 static void 1664 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, 1665 range_tree_t *rt) 1666 { 1667 /* 1668 * This is modeled after space_map_histogram_add(), so refer to that 1669 * function for implementation details. We want this to work like 1670 * the space map histogram, and not the range tree histogram, as we 1671 * are essentially constructing a delta that will be later subtracted 1672 * from the space map histogram. 1673 */ 1674 int idx = 0; 1675 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 1676 ASSERT3U(i, >=, idx + shift); 1677 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); 1678 1679 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { 1680 ASSERT3U(idx + shift, ==, i); 1681 idx++; 1682 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); 1683 } 1684 } 1685 } 1686 1687 /* 1688 * Called at every sync pass that the metaslab gets synced. 1689 * 1690 * The reason is that we want our auxiliary histograms to be updated 1691 * wherever the metaslab's space map histogram is updated. This way 1692 * we stay consistent on which parts of the metaslab space map's 1693 * histogram are currently not available for allocations (e.g because 1694 * they are in the defer, freed, and freeing trees). 1695 */ 1696 static void 1697 metaslab_aux_histograms_update(metaslab_t *msp) 1698 { 1699 space_map_t *sm = msp->ms_sm; 1700 ASSERT(sm != NULL); 1701 1702 /* 1703 * This is similar to the metaslab's space map histogram updates 1704 * that take place in metaslab_sync(). The only difference is that 1705 * we only care about segments that haven't made it into the 1706 * ms_allocatable tree yet. 
1707 */ 1708 if (msp->ms_loaded) { 1709 metaslab_aux_histograms_clear(msp); 1710 1711 metaslab_aux_histogram_add(msp->ms_synchist, 1712 sm->sm_shift, msp->ms_freed); 1713 1714 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1715 metaslab_aux_histogram_add(msp->ms_deferhist[t], 1716 sm->sm_shift, msp->ms_defer[t]); 1717 } 1718 } 1719 1720 metaslab_aux_histogram_add(msp->ms_synchist, 1721 sm->sm_shift, msp->ms_freeing); 1722 } 1723 1724 /* 1725 * Called every time we are done syncing (writing to) the metaslab, 1726 * i.e. at the end of each sync pass. 1727 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] 1728 */ 1729 static void 1730 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) 1731 { 1732 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1733 space_map_t *sm = msp->ms_sm; 1734 1735 if (sm == NULL) { 1736 /* 1737 * We came here from metaslab_init() when creating/opening a 1738 * pool, looking at a metaslab that hasn't had any allocations 1739 * yet. 1740 */ 1741 return; 1742 } 1743 1744 /* 1745 * This is similar to the actions that we take for the ms_freed 1746 * and ms_defer trees in metaslab_sync_done(). 1747 */ 1748 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; 1749 if (defer_allowed) { 1750 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], 1751 sizeof (msp->ms_synchist)); 1752 } else { 1753 bzero(msp->ms_deferhist[hist_index], 1754 sizeof (msp->ms_deferhist[hist_index])); 1755 } 1756 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1757 } 1758 1759 /* 1760 * Ensure that the metaslab's weight and fragmentation are consistent 1761 * with the contents of the histogram (either the range tree's histogram 1762 * or the space map's depending whether the metaslab is loaded). 1763 */ 1764 static void 1765 metaslab_verify_weight_and_frag(metaslab_t *msp) 1766 { 1767 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1768 1769 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1770 return; 1771 1772 /* 1773 * We can end up here from vdev_remove_complete(), in which case we 1774 * cannot do these assertions because we hold spa config locks and 1775 * thus we are not allowed to read from the DMU. 1776 * 1777 * We check if the metaslab group has been removed and if that's 1778 * the case we return immediately as that would mean that we are 1779 * here from the aforementioned code path. 1780 */ 1781 if (msp->ms_group == NULL) 1782 return; 1783 1784 /* 1785 * Devices being removed always return a weight of 0 and leave 1786 * fragmentation and ms_max_size as is - there is nothing for 1787 * us to verify here. 1788 */ 1789 vdev_t *vd = msp->ms_group->mg_vd; 1790 if (vd->vdev_removing) 1791 return; 1792 1793 /* 1794 * If the metaslab is dirty it probably means that we've done 1795 * some allocations or frees that have changed our histograms 1796 * and thus the weight. 1797 */ 1798 for (int t = 0; t < TXG_SIZE; t++) { 1799 if (txg_list_member(&vd->vdev_ms_list, msp, t)) 1800 return; 1801 } 1802 1803 /* 1804 * This verification checks that our in-memory state is consistent 1805 * with what's on disk. If the pool is read-only then there aren't 1806 * any changes and we just have the initially-loaded state. 
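 * In that case nothing can have diverged from what we loaded, so there
 * is nothing useful to cross-check and we return right away.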
1807 */ 1808 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) 1809 return; 1810 1811 /* some extra verification for in-core tree if you can */ 1812 if (msp->ms_loaded) { 1813 range_tree_stat_verify(msp->ms_allocatable); 1814 VERIFY(space_map_histogram_verify(msp->ms_sm, 1815 msp->ms_allocatable)); 1816 } 1817 1818 uint64_t weight = msp->ms_weight; 1819 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1820 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); 1821 uint64_t frag = msp->ms_fragmentation; 1822 uint64_t max_segsize = msp->ms_max_size; 1823 1824 msp->ms_weight = 0; 1825 msp->ms_fragmentation = 0; 1826 1827 /* 1828 * This function is used for verification purposes. Regardless of 1829 * whether metaslab_weight() thinks this metaslab should be active or 1830 * not, we want to ensure that the actual weight (and therefore the 1831 * value of ms_weight) would be the same if it was to be recalculated 1832 * at this point. 1833 */ 1834 msp->ms_weight = metaslab_weight(msp) | was_active; 1835 1836 VERIFY3U(max_segsize, ==, msp->ms_max_size); 1837 1838 /* 1839 * If the weight type changed then there is no point in doing 1840 * verification. Revert fields to their original values. 1841 */ 1842 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || 1843 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { 1844 msp->ms_fragmentation = frag; 1845 msp->ms_weight = weight; 1846 return; 1847 } 1848 1849 VERIFY3U(msp->ms_fragmentation, ==, frag); 1850 VERIFY3U(msp->ms_weight, ==, weight); 1851 } 1852 1853 /* 1854 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from 1855 * this class that was used longest ago, and attempt to unload it. We don't 1856 * want to spend too much time in this loop to prevent performance 1857 * degredation, and we expect that most of the time this operation will 1858 * succeed. Between that and the normal unloading processing during txg sync, 1859 * we expect this to keep the metaslab memory usage under control. 1860 */ 1861 static void 1862 metaslab_potentially_evict(metaslab_class_t *mc) 1863 { 1864 #ifdef _KERNEL 1865 uint64_t allmem = arc_all_memory(); 1866 extern kmem_cache_t *range_seg_cache; 1867 uint64_t inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); 1868 uint64_t size = kmem_cache_stat(range_seg_cache, "buf_size"); 1869 int tries = 0; 1870 for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && 1871 tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; 1872 tries++) { 1873 unsigned int idx = multilist_get_random_index( 1874 mc->mc_metaslab_txg_list); 1875 multilist_sublist_t *mls = 1876 multilist_sublist_lock(mc->mc_metaslab_txg_list, idx); 1877 metaslab_t *msp = multilist_sublist_head(mls); 1878 multilist_sublist_unlock(mls); 1879 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < 1880 inuse * size) { 1881 VERIFY3P(mls, ==, multilist_sublist_lock( 1882 mc->mc_metaslab_txg_list, idx)); 1883 ASSERT3U(idx, ==, 1884 metaslab_idx_func(mc->mc_metaslab_txg_list, msp)); 1885 1886 if (!multilist_link_active(&msp->ms_class_txg_node)) { 1887 multilist_sublist_unlock(mls); 1888 break; 1889 } 1890 metaslab_t *next_msp = multilist_sublist_next(mls, msp); 1891 multilist_sublist_unlock(mls); 1892 /* 1893 * If the metaslab is currently loading there are two 1894 * cases. If it's the metaslab we're evicting, we 1895 * can't continue on or we'll panic when we attempt to 1896 * recursively lock the mutex. 
If it's another 1897 * metaslab that's loading, it can be safely skipped, 1898 * since we know it's very new and therefore not a 1899 * good eviction candidate. We check later once the 1900 * lock is held that the metaslab is fully loaded 1901 * before actually unloading it. 1902 */ 1903 if (msp->ms_loading) { 1904 msp = next_msp; 1905 inuse = kmem_cache_stat(range_seg_cache, 1906 "buf_inuse"); 1907 continue; 1908 } 1909 /* 1910 * We can't unload metaslabs with no spacemap because 1911 * they're not ready to be unloaded yet. We can't 1912 * unload metaslabs with outstanding allocations 1913 * because doing so could cause the metaslab's weight 1914 * to decrease while it's unloaded, which violates an 1915 * invariant that we use to prevent unnecessary 1916 * loading. We also don't unload metaslabs that are 1917 * currently active because they are high-weight 1918 * metaslabs that are likely to be used in the near 1919 * future. 1920 */ 1921 mutex_enter(&msp->ms_lock); 1922 if (msp->ms_allocator == -1 && msp->ms_sm != NULL && 1923 msp->ms_allocating_total == 0) { 1924 metaslab_unload(msp); 1925 } 1926 mutex_exit(&msp->ms_lock); 1927 msp = next_msp; 1928 inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); 1929 } 1930 } 1931 #endif 1932 } 1933 1934 static int 1935 metaslab_load_impl(metaslab_t *msp) 1936 { 1937 int error = 0; 1938 1939 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1940 ASSERT(msp->ms_loading); 1941 ASSERT(!msp->ms_condensing); 1942 1943 /* 1944 * We temporarily drop the lock to unblock other operations while we 1945 * are reading the space map. Therefore, metaslab_sync() and 1946 * metaslab_sync_done() can run at the same time as we do. 1947 * 1948 * If we are using the log space maps, metaslab_sync() can't write to 1949 * the metaslab's space map while we are loading as we only write to 1950 * it when we are flushing the metaslab, and that can't happen while 1951 * we are loading it. 1952 * 1953 * If we are not using log space maps though, metaslab_sync() can 1954 * append to the space map while we are loading. Therefore we load 1955 * only entries that existed when we started the load. Additionally, 1956 * metaslab_sync_done() has to wait for the load to complete because 1957 * there are potential races like metaslab_load() loading parts of the 1958 * space map that are currently being appended by metaslab_sync(). If 1959 * we didn't, the ms_allocatable would have entries that 1960 * metaslab_sync_done() would try to re-add later. 1961 * 1962 * That's why before dropping the lock we remember the synced length 1963 * of the metaslab and read up to that point of the space map, 1964 * ignoring entries appended by metaslab_sync() that happen after we 1965 * drop the lock. 1966 */ 1967 uint64_t length = msp->ms_synced_length; 1968 mutex_exit(&msp->ms_lock); 1969 1970 hrtime_t load_start = gethrtime(); 1971 if (msp->ms_sm != NULL) { 1972 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, 1973 SM_FREE, length); 1974 } else { 1975 /* 1976 * The space map has not been allocated yet, so treat 1977 * all the space in the metaslab as free and add it to the 1978 * ms_allocatable tree. 1979 */ 1980 range_tree_add(msp->ms_allocatable, 1981 msp->ms_start, msp->ms_size); 1982 1983 if (msp->ms_freed != NULL) { 1984 /* 1985 * If the ms_sm doesn't exist, this means that this 1986 * metaslab hasn't gone through metaslab_sync() and 1987 * thus has never been dirtied. So we shouldn't 1988 * expect any unflushed allocs or frees from previous 1989 * TXGs. 
1990 * 1991 * Note: ms_freed and all the other trees except for 1992 * the ms_allocatable, can be NULL at this point only 1993 * if this is a new metaslab of a vdev that just got 1994 * expanded. 1995 */ 1996 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 1997 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 1998 } 1999 } 2000 2001 /* 2002 * We need to grab the ms_sync_lock to prevent metaslab_sync() from 2003 * changing the ms_sm (or log_sm) and the metaslab's range trees 2004 * while we are about to use them and populate the ms_allocatable. 2005 * The ms_lock is insufficient for this because metaslab_sync() doesn't 2006 * hold the ms_lock while writing the ms_checkpointing tree to disk. 2007 */ 2008 mutex_enter(&msp->ms_sync_lock); 2009 mutex_enter(&msp->ms_lock); 2010 2011 ASSERT(!msp->ms_condensing); 2012 ASSERT(!msp->ms_flushing); 2013 2014 if (error != 0) { 2015 mutex_exit(&msp->ms_sync_lock); 2016 return (error); 2017 } 2018 2019 ASSERT3P(msp->ms_group, !=, NULL); 2020 msp->ms_loaded = B_TRUE; 2021 2022 /* 2023 * Apply all the unflushed changes to ms_allocatable right 2024 * away so any manipulations we do below have a clear view 2025 * of what is allocated and what is free. 2026 */ 2027 range_tree_walk(msp->ms_unflushed_allocs, 2028 range_tree_remove, msp->ms_allocatable); 2029 range_tree_walk(msp->ms_unflushed_frees, 2030 range_tree_add, msp->ms_allocatable); 2031 2032 msp->ms_loaded = B_TRUE; 2033 2034 ASSERT3P(msp->ms_group, !=, NULL); 2035 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2036 if (spa_syncing_log_sm(spa) != NULL) { 2037 ASSERT(spa_feature_is_enabled(spa, 2038 SPA_FEATURE_LOG_SPACEMAP)); 2039 2040 /* 2041 * If we use a log space map we add all the segments 2042 * that are in ms_unflushed_frees so they are available 2043 * for allocation. 2044 * 2045 * ms_allocatable needs to contain all free segments 2046 * that are ready for allocations (thus not segments 2047 * from ms_freeing, ms_freed, and the ms_defer trees). 2048 * But if we grab the lock in this code path at a sync 2049 * pass later that 1, then it also contains the 2050 * segments of ms_freed (they were added to it earlier 2051 * in this path through ms_unflushed_frees). So we 2052 * need to remove all the segments that exist in 2053 * ms_freed from ms_allocatable as they will be added 2054 * later in metaslab_sync_done(). 2055 * 2056 * When there's no log space map, the ms_allocatable 2057 * correctly doesn't contain any segments that exist 2058 * in ms_freed [see ms_synced_length]. 2059 */ 2060 range_tree_walk(msp->ms_freed, 2061 range_tree_remove, msp->ms_allocatable); 2062 } 2063 2064 /* 2065 * If we are not using the log space map, ms_allocatable 2066 * contains the segments that exist in the ms_defer trees 2067 * [see ms_synced_length]. Thus we need to remove them 2068 * from ms_allocatable as they will be added again in 2069 * metaslab_sync_done(). 2070 * 2071 * If we are using the log space map, ms_allocatable still 2072 * contains the segments that exist in the ms_defer trees. 2073 * Not because it read them through the ms_sm though. But 2074 * because these segments are part of ms_unflushed_frees 2075 * whose segments we add to ms_allocatable earlier in this 2076 * code path. 2077 */ 2078 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2079 range_tree_walk(msp->ms_defer[t], 2080 range_tree_remove, msp->ms_allocatable); 2081 } 2082 2083 /* 2084 * Call metaslab_recalculate_weight_and_sort() now that the 2085 * metaslab is loaded so we get the metaslab's real weight. 
2086 * 2087 * Unless this metaslab was created with older software and 2088 * has not yet been converted to use segment-based weight, we 2089 * expect the new weight to be better or equal to the weight 2090 * that the metaslab had while it was not loaded. This is 2091 * because the old weight does not take into account the 2092 * consolidation of adjacent segments between TXGs. [see 2093 * comment for ms_synchist and ms_deferhist[] for more info] 2094 */ 2095 uint64_t weight = msp->ms_weight; 2096 uint64_t max_size = msp->ms_max_size; 2097 metaslab_recalculate_weight_and_sort(msp); 2098 if (!WEIGHT_IS_SPACEBASED(weight)) 2099 ASSERT3U(weight, <=, msp->ms_weight); 2100 msp->ms_max_size = metaslab_largest_allocatable(msp); 2101 ASSERT3U(max_size, <=, msp->ms_max_size); 2102 hrtime_t load_end = gethrtime(); 2103 msp->ms_load_time = load_end; 2104 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 2105 zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " 2106 "ms_id %llu, smp_length %llu, " 2107 "unflushed_allocs %llu, unflushed_frees %llu, " 2108 "freed %llu, defer %llu + %llu, " 2109 "loading_time %lld ms, ms_max_size %llu, " 2110 "max size error %llu", 2111 spa_syncing_txg(spa), spa_name(spa), 2112 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 2113 space_map_length(msp->ms_sm), 2114 range_tree_space(msp->ms_unflushed_allocs), 2115 range_tree_space(msp->ms_unflushed_frees), 2116 range_tree_space(msp->ms_freed), 2117 range_tree_space(msp->ms_defer[0]), 2118 range_tree_space(msp->ms_defer[1]), 2119 (longlong_t)((load_end - load_start) / 1000000), 2120 msp->ms_max_size, msp->ms_max_size - max_size); 2121 } 2122 2123 metaslab_verify_space(msp, spa_syncing_txg(spa)); 2124 mutex_exit(&msp->ms_sync_lock); 2125 return (0); 2126 } 2127 2128 int 2129 metaslab_load(metaslab_t *msp) 2130 { 2131 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2132 2133 /* 2134 * There may be another thread loading the same metaslab, if that's 2135 * the case just wait until the other thread is done and return. 2136 */ 2137 metaslab_load_wait(msp); 2138 if (msp->ms_loaded) 2139 return (0); 2140 VERIFY(!msp->ms_loading); 2141 ASSERT(!msp->ms_condensing); 2142 2143 /* 2144 * We set the loading flag BEFORE potentially dropping the lock to 2145 * wait for an ongoing flush (see ms_flushing below). This way other 2146 * threads know that there is already a thread that is loading this 2147 * metaslab. 2148 */ 2149 msp->ms_loading = B_TRUE; 2150 2151 /* 2152 * Wait for any in-progress flushing to finish as we drop the ms_lock 2153 * both here (during space_map_load()) and in metaslab_flush() (when 2154 * we flush our changes to the ms_sm). 2155 */ 2156 if (msp->ms_flushing) 2157 metaslab_flush_wait(msp); 2158 2159 /* 2160 * In the possibility that we were waiting for the metaslab to be 2161 * flushed (where we temporarily dropped the ms_lock), ensure that 2162 * no one else loaded the metaslab somehow. 2163 */ 2164 ASSERT(!msp->ms_loaded); 2165 2166 /* 2167 * If we're loading a metaslab in the normal class, consider evicting 2168 * another one to keep our memory usage under the limit defined by the 2169 * zfs_metaslab_mem_limit tunable. 
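 * metaslab_potentially_evict() walks the class's mc_metaslab_txg_list,
 * starting from the metaslabs selected longest ago, and unloads the
 * ones that are inactive and have no outstanding allocations until the
 * memory consumed by the range_seg_cache drops back under that limit.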
2170 */ 2171 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == 2172 msp->ms_group->mg_class) { 2173 metaslab_potentially_evict(msp->ms_group->mg_class); 2174 } 2175 2176 int error = metaslab_load_impl(msp); 2177 2178 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2179 msp->ms_loading = B_FALSE; 2180 cv_broadcast(&msp->ms_load_cv); 2181 2182 return (error); 2183 } 2184 2185 void 2186 metaslab_unload(metaslab_t *msp) 2187 { 2188 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2189 2190 /* 2191 * This can happen if a metaslab is selected for eviction (in 2192 * metaslab_potentially_evict) and then unloaded during spa_sync (via 2193 * metaslab_class_evict_old). 2194 */ 2195 if (!msp->ms_loaded) 2196 return; 2197 2198 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 2199 msp->ms_loaded = B_FALSE; 2200 msp->ms_unload_time = gethrtime(); 2201 2202 msp->ms_activation_weight = 0; 2203 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 2204 2205 if (msp->ms_group != NULL) { 2206 metaslab_class_t *mc = msp->ms_group->mg_class; 2207 multilist_sublist_t *mls = 2208 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 2209 if (multilist_link_active(&msp->ms_class_txg_node)) 2210 multilist_sublist_remove(mls, msp); 2211 multilist_sublist_unlock(mls); 2212 } 2213 2214 /* 2215 * We explicitly recalculate the metaslab's weight based on its space 2216 * map (as it is now not loaded). We want unloaded metaslabs to always 2217 * have their weights calculated from the space map histograms, while 2218 * loaded ones have theirs calculated from their in-core range tree 2219 * [see metaslab_load()]. This way, the weight reflects the information 2220 * available in-core, whether the metaslab is loaded or not. 2221 * 2222 * If ms_group == NULL, it means that we came here from metaslab_fini(), 2223 * at which point it doesn't make sense for us to do the recalculation 2224 * and the sorting.
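 * (In the metaslab_fini() path the metaslab has already been removed
 * from its group via metaslab_group_remove(), so there is no group
 * tree left to sort it back into.)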
2225 */ 2226 if (msp->ms_group != NULL) 2227 metaslab_recalculate_weight_and_sort(msp); 2228 } 2229 2230 void 2231 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) 2232 { 2233 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2234 metaslab_class_t *mc = msp->ms_group->mg_class; 2235 multilist_sublist_t *mls = 2236 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); 2237 if (multilist_link_active(&msp->ms_class_txg_node)) 2238 multilist_sublist_remove(mls, msp); 2239 msp->ms_selected_txg = txg; 2240 msp->ms_selected_time = gethrtime(); 2241 multilist_sublist_insert_tail(mls, msp); 2242 multilist_sublist_unlock(mls); 2243 } 2244 2245 void 2246 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 2247 int64_t defer_delta, int64_t space_delta) 2248 { 2249 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 2250 2251 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 2252 ASSERT(vd->vdev_ms_count != 0); 2253 2254 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 2255 vdev_deflated_space(vd, space_delta)); 2256 } 2257 2258 int 2259 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, 2260 uint64_t txg, metaslab_t **msp) 2261 { 2262 vdev_t *vd = mg->mg_vd; 2263 spa_t *spa = vd->vdev_spa; 2264 objset_t *mos = spa->spa_meta_objset; 2265 metaslab_t *ms; 2266 int error; 2267 2268 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 2269 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 2270 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 2271 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 2272 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); 2273 multilist_link_init(&ms->ms_class_txg_node); 2274 2275 ms->ms_id = id; 2276 ms->ms_start = id << vd->vdev_ms_shift; 2277 ms->ms_size = 1ULL << vd->vdev_ms_shift; 2278 ms->ms_allocator = -1; 2279 ms->ms_new = B_TRUE; 2280 2281 /* 2282 * We only open space map objects that already exist. All others 2283 * will be opened when we finally allocate an object for it. 2284 * 2285 * Note: 2286 * When called from vdev_expand(), we can't call into the DMU as 2287 * we are holding the spa_config_lock as a writer and we would 2288 * deadlock [see relevant comment in vdev_metaslab_init()]. in 2289 * that case, the object parameter is zero though, so we won't 2290 * call into the DMU. 2291 */ 2292 if (object != 0) { 2293 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 2294 ms->ms_size, vd->vdev_ashift); 2295 2296 if (error != 0) { 2297 kmem_free(ms, sizeof (metaslab_t)); 2298 return (error); 2299 } 2300 2301 ASSERT(ms->ms_sm != NULL); 2302 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); 2303 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 2304 } 2305 2306 /* 2307 * We create the ms_allocatable here, but we don't create the 2308 * other range trees until metaslab_sync_done(). This serves 2309 * two purposes: it allows metaslab_sync_done() to detect the 2310 * addition of new space; and for debugging, it ensures that 2311 * we'd data fault on any attempt to use this metaslab before 2312 * it's ready. 2313 */ 2314 ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, 2315 &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); 2316 2317 ms->ms_trim = range_tree_create(NULL, NULL); 2318 2319 metaslab_group_add(mg, ms); 2320 metaslab_set_fragmentation(ms); 2321 2322 /* 2323 * If we're opening an existing pool (txg == 0) or creating 2324 * a new one (txg == TXG_INITIAL), all space is available now. 
2325 * If we're adding space to an existing pool, the new space 2326 * does not become available until after this txg has synced. 2327 * The metaslab's weight will also be initialized when we sync 2328 * out this txg. This ensures that we don't attempt to allocate 2329 * from it before we have initialized it completely. 2330 */ 2331 if (txg <= TXG_INITIAL) { 2332 metaslab_sync_done(ms, 0); 2333 metaslab_space_update(vd, mg->mg_class, 2334 metaslab_allocated_space(ms), 0, 0); 2335 } 2336 2337 if (txg != 0) { 2338 vdev_dirty(vd, 0, NULL, txg); 2339 vdev_dirty(vd, VDD_METASLAB, ms, txg); 2340 } 2341 2342 *msp = ms; 2343 2344 return (0); 2345 } 2346 2347 static void 2348 metaslab_fini_flush_data(metaslab_t *msp) 2349 { 2350 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2351 2352 if (metaslab_unflushed_txg(msp) == 0) { 2353 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), 2354 ==, NULL); 2355 return; 2356 } 2357 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 2358 2359 mutex_enter(&spa->spa_flushed_ms_lock); 2360 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 2361 mutex_exit(&spa->spa_flushed_ms_lock); 2362 2363 spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2364 spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2365 } 2366 2367 uint64_t 2368 metaslab_unflushed_changes_memused(metaslab_t *ms) 2369 { 2370 return ((range_tree_numsegs(ms->ms_unflushed_allocs) + 2371 range_tree_numsegs(ms->ms_unflushed_frees)) * 2372 sizeof (range_seg_t)); 2373 } 2374 2375 void 2376 metaslab_fini(metaslab_t *msp) 2377 { 2378 metaslab_group_t *mg = msp->ms_group; 2379 vdev_t *vd = mg->mg_vd; 2380 spa_t *spa = vd->vdev_spa; 2381 2382 metaslab_fini_flush_data(msp); 2383 2384 metaslab_group_remove(mg, msp); 2385 2386 mutex_enter(&msp->ms_lock); 2387 VERIFY(msp->ms_group == NULL); 2388 metaslab_space_update(vd, mg->mg_class, 2389 -metaslab_allocated_space(msp), 0, -msp->ms_size); 2390 2391 space_map_close(msp->ms_sm); 2392 msp->ms_sm = NULL; 2393 2394 metaslab_unload(msp); 2395 range_tree_destroy(msp->ms_allocatable); 2396 range_tree_destroy(msp->ms_freeing); 2397 range_tree_destroy(msp->ms_freed); 2398 2399 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 2400 metaslab_unflushed_changes_memused(msp)); 2401 spa->spa_unflushed_stats.sus_memused -= 2402 metaslab_unflushed_changes_memused(msp); 2403 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 2404 range_tree_destroy(msp->ms_unflushed_allocs); 2405 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 2406 range_tree_destroy(msp->ms_unflushed_frees); 2407 2408 for (int t = 0; t < TXG_SIZE; t++) { 2409 range_tree_destroy(msp->ms_allocating[t]); 2410 } 2411 2412 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2413 range_tree_destroy(msp->ms_defer[t]); 2414 } 2415 ASSERT0(msp->ms_deferspace); 2416 2417 range_tree_destroy(msp->ms_checkpointing); 2418 2419 for (int t = 0; t < TXG_SIZE; t++) 2420 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 2421 2422 range_tree_vacate(msp->ms_trim, NULL, NULL); 2423 range_tree_destroy(msp->ms_trim); 2424 2425 mutex_exit(&msp->ms_lock); 2426 cv_destroy(&msp->ms_load_cv); 2427 cv_destroy(&msp->ms_flush_cv); 2428 mutex_destroy(&msp->ms_lock); 2429 mutex_destroy(&msp->ms_sync_lock); 2430 ASSERT3U(msp->ms_allocator, ==, -1); 2431 2432 kmem_free(msp, sizeof (metaslab_t)); 2433 } 2434 2435 #define FRAGMENTATION_TABLE_SIZE 17 2436 2437 /* 2438 * This table defines a segment size based fragmentation metric that will 2439 * allow each metaslab to derive its own 
fragmentation value. This is done 2440 * by calculating the space in each bucket of the spacemap histogram and 2441 * multiplying that by the fragmentation metric in this table. Doing 2442 * this for all buckets and dividing it by the total amount of free 2443 * space in this metaslab (i.e. the total free space in all buckets) gives 2444 * us the fragmentation metric. This means that a high fragmentation metric 2445 * equates to most of the free space being comprised of small segments. 2446 * Conversely, if the metric is low, then most of the free space is in 2447 * large segments. A 10% change in fragmentation equates to approximately 2448 * double the number of segments. 2449 * 2450 * This table defines 0% fragmented space using 16MB segments. Testing has 2451 * shown that segments that are greater than or equal to 16MB do not suffer 2452 * from drastic performance problems. Using this value, we derive the rest 2453 * of the table. Since the fragmentation value is never stored on disk, it 2454 * is possible to change these calculations in the future. 2455 */ 2456 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 2457 100, /* 512B */ 2458 100, /* 1K */ 2459 98, /* 2K */ 2460 95, /* 4K */ 2461 90, /* 8K */ 2462 80, /* 16K */ 2463 70, /* 32K */ 2464 60, /* 64K */ 2465 50, /* 128K */ 2466 40, /* 256K */ 2467 30, /* 512K */ 2468 20, /* 1M */ 2469 15, /* 2M */ 2470 10, /* 4M */ 2471 5, /* 8M */ 2472 0 /* 16M */ 2473 }; 2474 2475 /* 2476 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 2477 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 2478 * been upgraded and does not support this metric. Otherwise, the return 2479 * value should be in the range [0, 100]. 2480 */ 2481 static void 2482 metaslab_set_fragmentation(metaslab_t *msp) 2483 { 2484 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2485 uint64_t fragmentation = 0; 2486 uint64_t total = 0; 2487 boolean_t feature_enabled = spa_feature_is_enabled(spa, 2488 SPA_FEATURE_SPACEMAP_HISTOGRAM); 2489 2490 if (!feature_enabled) { 2491 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2492 return; 2493 } 2494 2495 /* 2496 * A null space map means that the entire metaslab is free 2497 * and thus is not fragmented. 2498 */ 2499 if (msp->ms_sm == NULL) { 2500 msp->ms_fragmentation = 0; 2501 return; 2502 } 2503 2504 /* 2505 * If this metaslab's space map has not been upgraded, flag it 2506 * so that we upgrade next time we encounter it. 2507 */ 2508 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 2509 uint64_t txg = spa_syncing_txg(spa); 2510 vdev_t *vd = msp->ms_group->mg_vd; 2511 2512 /* 2513 * If we've reached the final dirty txg, then we must 2514 * be shutting down the pool. We don't want to dirty 2515 * any data past this point so skip setting the condense 2516 * flag. We can retry this action the next time the pool 2517 * is imported. 
2518 */ 2519 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2520 msp->ms_condense_wanted = B_TRUE; 2521 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2522 zfs_dbgmsg("txg %llu, requesting force condense: " 2523 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 2524 vd->vdev_id); 2525 } 2526 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2527 return; 2528 } 2529 2530 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2531 uint64_t space = 0; 2532 uint8_t shift = msp->ms_sm->sm_shift; 2533 2534 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2535 FRAGMENTATION_TABLE_SIZE - 1); 2536 2537 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2538 continue; 2539 2540 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2541 total += space; 2542 2543 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2544 fragmentation += space * zfs_frag_table[idx]; 2545 } 2546 2547 if (total > 0) 2548 fragmentation /= total; 2549 ASSERT3U(fragmentation, <=, 100); 2550 2551 msp->ms_fragmentation = fragmentation; 2552 } 2553 2554 /* 2555 * Compute a weight -- a selection preference value -- for the given metaslab. 2556 * This is based on the amount of free space, the level of fragmentation, 2557 * the LBA range, and whether the metaslab is loaded. 2558 */ 2559 static uint64_t 2560 metaslab_space_weight(metaslab_t *msp) 2561 { 2562 metaslab_group_t *mg = msp->ms_group; 2563 vdev_t *vd = mg->mg_vd; 2564 uint64_t weight, space; 2565 2566 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2567 2568 /* 2569 * The baseline weight is the metaslab's free space. 2570 */ 2571 space = msp->ms_size - metaslab_allocated_space(msp); 2572 2573 if (metaslab_fragmentation_factor_enabled && 2574 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 2575 /* 2576 * Use the fragmentation information to inversely scale 2577 * down the baseline weight. We need to ensure that we 2578 * don't exclude this metaslab completely when it's 100% 2579 * fragmented. To avoid this we reduce the fragmented value 2580 * by 1. 2581 */ 2582 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 2583 2584 /* 2585 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 2586 * this metaslab again. The fragmentation metric may have 2587 * decreased the space to something smaller than 2588 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 2589 * so that we can consume any remaining space. 2590 */ 2591 if (space > 0 && space < SPA_MINBLOCKSIZE) 2592 space = SPA_MINBLOCKSIZE; 2593 } 2594 weight = space; 2595 2596 /* 2597 * Modern disks have uniform bit density and constant angular velocity. 2598 * Therefore, the outer recording zones are faster (higher bandwidth) 2599 * than the inner zones by the ratio of outer to inner track diameter, 2600 * which is typically around 2:1. We account for this by assigning 2601 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 2602 * In effect, this means that we'll select the metaslab with the most 2603 * free bandwidth rather than simply the one with the most free space. 2604 */ 2605 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 2606 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 2607 ASSERT(weight >= space && weight <= 2 * space); 2608 } 2609 2610 /* 2611 * If this metaslab is one we're actively using, adjust its 2612 * weight to make it preferable to any inactive metaslab so 2613 * we'll polish it off. If the fragmentation on this metaslab 2614 * has exceed our threshold, then don't mark it active. 
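 * Carrying over the METASLAB_ACTIVE_MASK bits is enough to achieve
 * this: they live in the high-order bits of the weight, above any
 * space value we can compute here, so an active metaslab always sorts
 * ahead of an inactive one.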
2615 */ 2616 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 2617 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 2618 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 2619 } 2620 2621 WEIGHT_SET_SPACEBASED(weight); 2622 return (weight); 2623 } 2624 2625 /* 2626 * Return the weight of the specified metaslab, according to the segment-based 2627 * weighting algorithm. The metaslab must be loaded. This function can 2628 * be called within a sync pass since it relies only on the metaslab's 2629 * range tree which is always accurate when the metaslab is loaded. 2630 */ 2631 static uint64_t 2632 metaslab_weight_from_range_tree(metaslab_t *msp) 2633 { 2634 uint64_t weight = 0; 2635 uint32_t segments = 0; 2636 2637 ASSERT(msp->ms_loaded); 2638 2639 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 2640 i--) { 2641 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 2642 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2643 2644 segments <<= 1; 2645 segments += msp->ms_allocatable->rt_histogram[i]; 2646 2647 /* 2648 * The range tree provides more precision than the space map 2649 * and must be downgraded so that all values fit within the 2650 * space map's histogram. This allows us to compare loaded 2651 * vs. unloaded metaslabs to determine which metaslab is 2652 * considered "best". 2653 */ 2654 if (i > max_idx) 2655 continue; 2656 2657 if (segments != 0) { 2658 WEIGHT_SET_COUNT(weight, segments); 2659 WEIGHT_SET_INDEX(weight, i); 2660 WEIGHT_SET_ACTIVE(weight, 0); 2661 break; 2662 } 2663 } 2664 return (weight); 2665 } 2666 2667 /* 2668 * Calculate the weight based on the on-disk histogram. Should be applied 2669 * only to unloaded metaslabs (i.e no incoming allocations) in-order to 2670 * give results consistent with the on-disk state 2671 */ 2672 static uint64_t 2673 metaslab_weight_from_spacemap(metaslab_t *msp) 2674 { 2675 space_map_t *sm = msp->ms_sm; 2676 ASSERT(!msp->ms_loaded); 2677 ASSERT(sm != NULL); 2678 ASSERT3U(space_map_object(sm), !=, 0); 2679 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2680 2681 /* 2682 * Create a joint histogram from all the segments that have made 2683 * it to the metaslab's space map histogram, that are not yet 2684 * available for allocation because they are still in the freeing 2685 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 2686 * these segments from the space map's histogram to get a more 2687 * accurate weight. 2688 */ 2689 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 2690 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 2691 deferspace_histogram[i] += msp->ms_synchist[i]; 2692 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2693 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2694 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 2695 } 2696 } 2697 2698 uint64_t weight = 0; 2699 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 2700 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 2701 deferspace_histogram[i]); 2702 uint64_t count = 2703 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 2704 if (count != 0) { 2705 WEIGHT_SET_COUNT(weight, count); 2706 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 2707 WEIGHT_SET_ACTIVE(weight, 0); 2708 break; 2709 } 2710 } 2711 return (weight); 2712 } 2713 2714 /* 2715 * Compute a segment-based weight for the specified metaslab. The weight 2716 * is determined by highest bucket in the histogram. The information 2717 * for the highest bucket is encoded into the weight value. 
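 * The bucket index is stored in more significant bits of the weight
 * than the segment count, so when two weights are compared a metaslab
 * with even a single free segment in a larger bucket is preferred over
 * one whose largest free segments are smaller, regardless of how many
 * of those smaller segments it has.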
2718 */ 2719 static uint64_t 2720 metaslab_segment_weight(metaslab_t *msp) 2721 { 2722 metaslab_group_t *mg = msp->ms_group; 2723 uint64_t weight = 0; 2724 uint8_t shift = mg->mg_vd->vdev_ashift; 2725 2726 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2727 2728 /* 2729 * The metaslab is completely free. 2730 */ 2731 if (metaslab_allocated_space(msp) == 0) { 2732 int idx = highbit64(msp->ms_size) - 1; 2733 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2734 2735 if (idx < max_idx) { 2736 WEIGHT_SET_COUNT(weight, 1ULL); 2737 WEIGHT_SET_INDEX(weight, idx); 2738 } else { 2739 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 2740 WEIGHT_SET_INDEX(weight, max_idx); 2741 } 2742 WEIGHT_SET_ACTIVE(weight, 0); 2743 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 2744 return (weight); 2745 } 2746 2747 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2748 2749 /* 2750 * If the metaslab is fully allocated then just make the weight 0. 2751 */ 2752 if (metaslab_allocated_space(msp) == msp->ms_size) 2753 return (0); 2754 /* 2755 * If the metaslab is already loaded, then use the range tree to 2756 * determine the weight. Otherwise, we rely on the space map information 2757 * to generate the weight. 2758 */ 2759 if (msp->ms_loaded) { 2760 weight = metaslab_weight_from_range_tree(msp); 2761 } else { 2762 weight = metaslab_weight_from_spacemap(msp); 2763 } 2764 2765 /* 2766 * If the metaslab was active the last time we calculated its weight 2767 * then keep it active. We want to consume the entire region that 2768 * is associated with this weight. 2769 */ 2770 if (msp->ms_activation_weight != 0 && weight != 0) 2771 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 2772 return (weight); 2773 } 2774 2775 /* 2776 * Determine if we should attempt to allocate from this metaslab. If the 2777 * metaslab is loaded, then we can determine if the desired allocation 2778 * can be satisfied by looking at the size of the maximum free segment 2779 * on that metaslab. Otherwise, we make our decision based on the metaslab's 2780 * weight. For segment-based weighting we can determine the maximum 2781 * allocation based on the index encoded in its value. For space-based 2782 * weights we rely on the entire weight (excluding the weight-type bit). 2783 */ 2784 boolean_t 2785 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) 2786 { 2787 /* 2788 * If the metaslab is loaded, ms_max_size is definitive and we can use 2789 * the fast check. If it's not, the ms_max_size is a lower bound (once 2790 * set), and we should use the fast check as long as we're not in 2791 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec 2792 * seconds since the metaslab was unloaded. 2793 */ 2794 if (msp->ms_loaded || 2795 (msp->ms_max_size != 0 && !try_hard && gethrtime() < 2796 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) 2797 return (msp->ms_max_size >= asize); 2798 2799 boolean_t should_allocate; 2800 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2801 /* 2802 * The metaslab segment weight indicates segments in the 2803 * range [2^i, 2^(i+1)), where i is the index in the weight. 2804 * Since the asize might be in the middle of the range, we 2805 * should attempt the allocation if asize < 2^(i+1). 
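 * For example, a weight with index 17 tells us the metaslab has free
 * segments in the [128K, 256K) range, so a 200K allocation is worth
 * attempting while a 256K allocation is not.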
2806 */ 2807 should_allocate = (asize < 2808 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2809 } else { 2810 should_allocate = (asize <= 2811 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2812 } 2813 2814 return (should_allocate); 2815 } 2816 2817 static uint64_t 2818 metaslab_weight(metaslab_t *msp) 2819 { 2820 vdev_t *vd = msp->ms_group->mg_vd; 2821 spa_t *spa = vd->vdev_spa; 2822 uint64_t weight; 2823 2824 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2825 2826 metaslab_set_fragmentation(msp); 2827 2828 /* 2829 * Update the maximum size. If the metaslab is loaded, this will 2830 * ensure that we get an accurate maximum size if newly freed space 2831 * has been added back into the free tree. If the metaslab is 2832 * unloaded, we check if there's a larger free segment in the 2833 * unflushed frees. This is a lower bound on the largest allocatable 2834 * segment size. Coalescing of adjacent entries may reveal larger 2835 * allocatable segments, but we aren't aware of those until loading 2836 * the space map into a range tree. 2837 */ 2838 if (msp->ms_loaded) { 2839 msp->ms_max_size = metaslab_largest_allocatable(msp); 2840 } else { 2841 msp->ms_max_size = MAX(msp->ms_max_size, 2842 metaslab_largest_unflushed_free(msp)); 2843 } 2844 2845 /* 2846 * Segment-based weighting requires space map histogram support. 2847 */ 2848 if (zfs_metaslab_segment_weight_enabled && 2849 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2850 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2851 sizeof (space_map_phys_t))) { 2852 weight = metaslab_segment_weight(msp); 2853 } else { 2854 weight = metaslab_space_weight(msp); 2855 } 2856 return (weight); 2857 } 2858 2859 void 2860 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 2861 { 2862 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2863 2864 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 2865 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2866 metaslab_group_sort(msp->ms_group, msp, 2867 metaslab_weight(msp) | was_active); 2868 } 2869 2870 static int 2871 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2872 int allocator, uint64_t activation_weight) 2873 { 2874 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2875 2876 /* 2877 * If we're activating for the claim code, we don't want to actually 2878 * set the metaslab up for a specific allocator. 2879 */ 2880 if (activation_weight == METASLAB_WEIGHT_CLAIM) { 2881 ASSERT0(msp->ms_activation_weight); 2882 msp->ms_activation_weight = msp->ms_weight; 2883 metaslab_group_sort(mg, msp, msp->ms_weight | 2884 activation_weight); 2885 return (0); 2886 } 2887 2888 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2889 mg->mg_primaries : mg->mg_secondaries); 2890 2891 mutex_enter(&mg->mg_lock); 2892 if (arr[allocator] != NULL) { 2893 mutex_exit(&mg->mg_lock); 2894 return (EEXIST); 2895 } 2896 2897 arr[allocator] = msp; 2898 ASSERT3S(msp->ms_allocator, ==, -1); 2899 msp->ms_allocator = allocator; 2900 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2901 2902 ASSERT0(msp->ms_activation_weight); 2903 msp->ms_activation_weight = msp->ms_weight; 2904 metaslab_group_sort_impl(mg, msp, 2905 msp->ms_weight | activation_weight); 2906 2907 mutex_exit(&mg->mg_lock); 2908 2909 return (0); 2910 } 2911 2912 static int 2913 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2914 { 2915 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2916 2917 /* 2918 * The current metaslab is already activated for us so there 2919 * is nothing to do. 
Being already activated, though, doesn't mean 2920 * that this metaslab is activated for our allocator or with our 2921 * requested activation weight. The metaslab could have started 2922 * as an active one for our allocator but changed allocators 2923 * while we were waiting to grab its ms_lock, or we stole it 2924 * [see find_valid_metaslab()]. This means that there is a 2925 * possibility that this thread passivates a metaslab that belongs 2926 * to another allocator or was activated with a different activation mask. 2927 */ 2928 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2929 ASSERT(msp->ms_loaded); 2930 return (0); 2931 } 2932 2933 int error = metaslab_load(msp); 2934 if (error != 0) { 2935 metaslab_group_sort(msp->ms_group, msp, 0); 2936 return (error); 2937 } 2938 2939 /* 2940 * When entering metaslab_load() we may have dropped the 2941 * ms_lock because we were loading this metaslab, or we 2942 * were waiting for another thread to load it for us. In 2943 * that scenario, we recheck the weight of the metaslab 2944 * to see if it was activated by another thread. 2945 * 2946 * If the metaslab was activated for another allocator or 2947 * it was activated with a different activation weight (e.g. 2948 * we wanted to make it a primary but it was activated as 2949 * a secondary), we return EBUSY. 2950 * 2951 * If the metaslab was activated for the same allocator 2952 * and requested activation mask, skip activating it. 2953 */ 2954 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2955 if (msp->ms_allocator != allocator) 2956 return (EBUSY); 2957 2958 if ((msp->ms_weight & activation_weight) == 0) 2959 return (EBUSY); 2960 2961 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), 2962 msp->ms_primary); 2963 return (0); 2964 } 2965 2966 /* 2967 * If the metaslab has literally 0 space, it will have weight 0. In 2968 * that case, don't bother activating it. This can happen if the 2969 * metaslab had space during find_valid_metaslab, but another thread 2970 * loaded it and used all that space while we were waiting to grab the 2971 * lock.
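 * ms_allocatable must be empty in that case (asserted below), and
 * returning ENOSPC lets the caller go look for a different metaslab
 * rather than activating one it can never allocate from.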
2972 */ 2973 if (msp->ms_weight == 0) { 2974 ASSERT0(range_tree_space(msp->ms_allocatable)); 2975 return (SET_ERROR(ENOSPC)); 2976 } 2977 2978 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2979 allocator, activation_weight)) != 0) { 2980 return (error); 2981 } 2982 2983 ASSERT(msp->ms_loaded); 2984 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2985 2986 return (0); 2987 } 2988 2989 static void 2990 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2991 uint64_t weight) 2992 { 2993 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2994 ASSERT(msp->ms_loaded); 2995 2996 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2997 metaslab_group_sort(mg, msp, weight); 2998 return; 2999 } 3000 3001 mutex_enter(&mg->mg_lock); 3002 ASSERT3P(msp->ms_group, ==, mg); 3003 ASSERT3S(0, <=, msp->ms_allocator); 3004 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 3005 3006 if (msp->ms_primary) { 3007 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 3008 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 3009 mg->mg_primaries[msp->ms_allocator] = NULL; 3010 } else { 3011 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 3012 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 3013 mg->mg_secondaries[msp->ms_allocator] = NULL; 3014 } 3015 msp->ms_allocator = -1; 3016 metaslab_group_sort_impl(mg, msp, weight); 3017 mutex_exit(&mg->mg_lock); 3018 } 3019 3020 static void 3021 metaslab_passivate(metaslab_t *msp, uint64_t weight) 3022 { 3023 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 3024 3025 /* 3026 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 3027 * this metaslab again. In that case, it had better be empty, 3028 * or we would be leaving space on the table. 3029 */ 3030 ASSERT(size >= SPA_MINBLOCKSIZE || 3031 range_tree_is_empty(msp->ms_allocatable)); 3032 ASSERT0(weight & METASLAB_ACTIVE_MASK); 3033 3034 ASSERT(msp->ms_activation_weight != 0); 3035 msp->ms_activation_weight = 0; 3036 metaslab_passivate_allocator(msp->ms_group, msp, weight); 3037 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); 3038 } 3039 3040 /* 3041 * Segment-based metaslabs are activated once and remain active until 3042 * we either fail an allocation attempt (similar to space-based metaslabs) 3043 * or have exhausted the free space in zfs_metaslab_switch_threshold 3044 * buckets since the metaslab was activated. This function checks to see 3045 * if we've exhaused the zfs_metaslab_switch_threshold buckets in the 3046 * metaslab and passivates it proactively. This will allow us to select a 3047 * metaslabs with larger contiguous region if any remaining within this 3048 * metaslab group. If we're in sync pass > 1, then we continue using this 3049 * metaslab so that we don't dirty more block and cause more sync passes. 3050 */ 3051 void 3052 metaslab_segment_may_passivate(metaslab_t *msp) 3053 { 3054 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3055 3056 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 3057 return; 3058 3059 /* 3060 * Since we are in the middle of a sync pass, the most accurate 3061 * information that is accessible to us is the in-core range tree 3062 * histogram; calculate the new weight based on that information. 
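 * For example, with zfs_metaslab_switch_threshold set to 2, a metaslab
 * activated at weight index 20 (free segments of 1M-2M) is passivated
 * once its largest remaining bucket drops to index 18 (256K-512K) or
 * below.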
3063 */ 3064 uint64_t weight = metaslab_weight_from_range_tree(msp); 3065 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 3066 int current_idx = WEIGHT_GET_INDEX(weight); 3067 3068 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 3069 metaslab_passivate(msp, weight); 3070 } 3071 3072 static void 3073 metaslab_preload(void *arg) 3074 { 3075 metaslab_t *msp = arg; 3076 metaslab_class_t *mc = msp->ms_group->mg_class; 3077 spa_t *spa = mc->mc_spa; 3078 3079 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 3080 3081 mutex_enter(&msp->ms_lock); 3082 (void) metaslab_load(msp); 3083 metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); 3084 mutex_exit(&msp->ms_lock); 3085 } 3086 3087 static void 3088 metaslab_group_preload(metaslab_group_t *mg) 3089 { 3090 spa_t *spa = mg->mg_vd->vdev_spa; 3091 metaslab_t *msp; 3092 avl_tree_t *t = &mg->mg_metaslab_tree; 3093 int m = 0; 3094 3095 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 3096 taskq_wait(mg->mg_taskq); 3097 return; 3098 } 3099 3100 mutex_enter(&mg->mg_lock); 3101 3102 /* 3103 * Load the next potential metaslabs 3104 */ 3105 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 3106 ASSERT3P(msp->ms_group, ==, mg); 3107 3108 /* 3109 * We preload only the maximum number of metaslabs specified 3110 * by metaslab_preload_limit. If a metaslab is being forced 3111 * to condense then we preload it too. This will ensure 3112 * that force condensing happens in the next txg. 3113 */ 3114 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 3115 continue; 3116 } 3117 3118 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 3119 msp, TQ_SLEEP) != TASKQID_INVALID); 3120 } 3121 mutex_exit(&mg->mg_lock); 3122 } 3123 3124 /* 3125 * Determine if the space map's on-disk footprint is past our tolerance for 3126 * inefficiency. We would like to use the following criteria to make our 3127 * decision: 3128 * 3129 * 1. Do not condense if the size of the space map object would dramatically 3130 * increase as a result of writing out the free space range tree. 3131 * 3132 * 2. Condense if the on on-disk space map representation is at least 3133 * zfs_condense_pct/100 times the size of the optimal representation 3134 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). 3135 * 3136 * 3. Do not condense if the on-disk size of the space map does not actually 3137 * decrease. 3138 * 3139 * Unfortunately, we cannot compute the on-disk size of the space map in this 3140 * context because we cannot accurately compute the effects of compression, etc. 3141 * Instead, we apply the heuristic described in the block comment for 3142 * zfs_metaslab_condense_block_threshold - we only condense if the space used 3143 * is greater than a threshold number of blocks. 3144 */ 3145 static boolean_t 3146 metaslab_should_condense(metaslab_t *msp) 3147 { 3148 space_map_t *sm = msp->ms_sm; 3149 vdev_t *vd = msp->ms_group->mg_vd; 3150 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 3151 3152 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3153 ASSERT(msp->ms_loaded); 3154 ASSERT(sm != NULL); 3155 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); 3156 3157 /* 3158 * We always condense metaslabs that are empty and metaslabs for 3159 * which a condense request has been made. 
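 * Otherwise both conditions computed below must hold. For example,
 * with the default zfs_condense_pct of 200 and a 16K record size
 * (e.g. a 16K space map block on an ashift=12 vdev), we condense only
 * once the on-disk space map has grown to at least twice its estimated
 * optimal size and is larger than 4 * 16K = 64K.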
3160 */ 3161 if (avl_is_empty(&msp->ms_allocatable_by_size) || 3162 msp->ms_condense_wanted) 3163 return (B_TRUE); 3164 3165 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); 3166 uint64_t object_size = space_map_length(sm); 3167 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 3168 msp->ms_allocatable, SM_NO_VDEVID); 3169 3170 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 3171 object_size > zfs_metaslab_condense_block_threshold * record_size); 3172 } 3173 3174 /* 3175 * Condense the on-disk space map representation to its minimized form. 3176 * The minimized form consists of a small number of allocations followed 3177 * by the entries of the free range tree (ms_allocatable). The condensed 3178 * spacemap contains all the entries of previous TXGs (including those in 3179 * the pool-wide log spacemaps; thus this is effectively a superset of 3180 * metaslab_flush()), but this TXG's entries still need to be written. 3181 */ 3182 static void 3183 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) 3184 { 3185 range_tree_t *condense_tree; 3186 space_map_t *sm = msp->ms_sm; 3187 uint64_t txg = dmu_tx_get_txg(tx); 3188 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3189 3190 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3191 ASSERT(msp->ms_loaded); 3192 ASSERT(msp->ms_sm != NULL); 3193 3194 /* 3195 * In order to condense the space map, we need to change it so it 3196 * only describes which segments are currently allocated and free. 3197 * 3198 * All the current free space resides in the ms_allocatable, all 3199 * the ms_defer trees, and all the ms_allocating trees. We ignore 3200 * ms_freed because it is empty because we're in sync pass 1. We 3201 * ignore ms_freeing because these changes are not yet reflected 3202 * in the spacemap (they will be written later this txg). 3203 * 3204 * So to truncate the space map to represent all the entries of 3205 * previous TXGs we do the following: 3206 * 3207 * 1] We create a range tree (condense tree) that is 100% allocated. 3208 * 2] We remove from it all segments found in the ms_defer trees 3209 * as those segments are marked as free in the original space 3210 * map. We do the same with the ms_allocating trees for the same 3211 * reason. Removing these segments should be a relatively 3212 * inexpensive operation since we expect these trees to have a 3213 * small number of nodes. 3214 * 3] We vacate any unflushed allocs as they should already exist 3215 * in the condense tree. Then we vacate any unflushed frees as 3216 * they should already be part of ms_allocatable. 3217 * 4] At this point, we would ideally like to remove all segments 3218 * in the ms_allocatable tree from the condense tree. This way 3219 * we would write all the entries of the condense tree as the 3220 * condensed space map, which would only contain allocated 3221 * segments with everything else assumed to be freed. 3222 * 3223 * Doing so can be prohibitively expensive as ms_allocatable can 3224 * be large, and therefore computationally expensive to subtract 3225 * from the condense_tree. Instead we first sync out the 3226 * condense_tree and then the ms_allocatable, in the condensed 3227 * space map. While this is not optimal, it is typically close to 3228 * optimal and more importantly much cheaper to compute. 3229 * 3230 * 5] Finally, as both of the unflushed trees were written to our 3231 * new and condensed metaslab space map, we basically flushed 3232 * all the unflushed changes to disk, thus we call 3233 * metaslab_flush_update(). 
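 * The net effect is that the condensed space map is written in exactly
 * two passes below: an SM_ALLOC pass for the condense tree followed by
 * an SM_FREE pass for ms_allocatable.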
3234 */ 3235 ASSERT3U(spa_sync_pass(spa), ==, 1); 3236 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ 3237 3238 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 3239 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 3240 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 3241 spa->spa_name, space_map_length(msp->ms_sm), 3242 avl_numnodes(&msp->ms_allocatable->rt_root), 3243 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 3244 3245 msp->ms_condense_wanted = B_FALSE; 3246 3247 condense_tree = range_tree_create(NULL, NULL); 3248 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 3249 3250 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3251 range_tree_walk(msp->ms_defer[t], 3252 range_tree_remove, condense_tree); 3253 } 3254 3255 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 3256 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 3257 range_tree_remove, condense_tree); 3258 } 3259 3260 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3261 metaslab_unflushed_changes_memused(msp)); 3262 spa->spa_unflushed_stats.sus_memused -= 3263 metaslab_unflushed_changes_memused(msp); 3264 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3265 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3266 3267 /* 3268 * We're about to drop the metaslab's lock thus allowing other 3269 * consumers to change its content. Set the metaslab's ms_condensing 3270 * flag to ensure that allocations on this metaslab do not occur 3271 * while we're in the middle of committing it to disk. This is only 3272 * critical for ms_allocatable as all other range trees use per TXG 3273 * views of their content. 3274 */ 3275 msp->ms_condensing = B_TRUE; 3276 3277 mutex_exit(&msp->ms_lock); 3278 uint64_t object = space_map_object(msp->ms_sm); 3279 space_map_truncate(sm, 3280 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 3281 zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); 3282 3283 /* 3284 * space_map_truncate() may have reallocated the spacemap object. 3285 * If so, update the vdev_ms_array. 3286 */ 3287 if (space_map_object(msp->ms_sm) != object) { 3288 object = space_map_object(msp->ms_sm); 3289 dmu_write(spa->spa_meta_objset, 3290 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * 3291 msp->ms_id, sizeof (uint64_t), &object, tx); 3292 } 3293 3294 /* 3295 * Note: 3296 * When the log space map feature is enabled, each space map will 3297 * always have ALLOCS followed by FREES for each sync pass. This is 3298 * typically true even when the log space map feature is disabled, 3299 * except from the case where a metaslab goes through metaslab_sync() 3300 * and gets condensed. In that case the metaslab's space map will have 3301 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS 3302 * followed by FREES (due to space_map_write() in metaslab_sync()) for 3303 * sync pass 1. 3304 */ 3305 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 3306 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 3307 3308 range_tree_vacate(condense_tree, NULL, NULL); 3309 range_tree_destroy(condense_tree); 3310 mutex_enter(&msp->ms_lock); 3311 3312 msp->ms_condensing = B_FALSE; 3313 metaslab_flush_update(msp, tx); 3314 } 3315 3316 /* 3317 * Called when the metaslab has been flushed (its own spacemap now reflects 3318 * all the contents of the pool-wide spacemap log). Updates the metaslab's 3319 * metadata and any pool-wide related log space map data (e.g. summary, 3320 * obsolete logs, etc.) to reflect that. 
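 * Specifically, we record the new synced length, move the metaslab to
 * its new position in the spa_metaslabs_by_flushed tree (which is
 * keyed by unflushed txg), adjust the per-log and summary metaslab
 * counts, and clean up any log space maps that no longer cover
 * unflushed changes.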
3321 */ 3322 static void 3323 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) 3324 { 3325 metaslab_group_t *mg = msp->ms_group; 3326 spa_t *spa = mg->mg_vd->vdev_spa; 3327 3328 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3329 3330 ASSERT3U(spa_sync_pass(spa), ==, 1); 3331 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3332 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3333 3334 /* 3335 * Just because a metaslab got flushed, that doesn't mean that 3336 * it will pass through metaslab_sync_done(). Thus, make sure to 3337 * update ms_synced_length here in case it doesn't. 3338 */ 3339 msp->ms_synced_length = space_map_length(msp->ms_sm); 3340 3341 /* 3342 * We may end up here from metaslab_condense() without the 3343 * feature being active. In that case this is a no-op. 3344 */ 3345 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 3346 return; 3347 3348 ASSERT(spa_syncing_log_sm(spa) != NULL); 3349 ASSERT(msp->ms_sm != NULL); 3350 ASSERT(metaslab_unflushed_txg(msp) != 0); 3351 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); 3352 3353 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); 3354 3355 /* update metaslab's position in our flushing tree */ 3356 uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); 3357 mutex_enter(&spa->spa_flushed_ms_lock); 3358 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 3359 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3360 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3361 mutex_exit(&spa->spa_flushed_ms_lock); 3362 3363 /* update metaslab counts of spa_log_sm_t nodes */ 3364 spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); 3365 spa_log_sm_increment_current_mscount(spa); 3366 3367 /* cleanup obsolete logs if any */ 3368 uint64_t log_blocks_before = spa_log_sm_nblocks(spa); 3369 spa_cleanup_old_sm_logs(spa, tx); 3370 uint64_t log_blocks_after = spa_log_sm_nblocks(spa); 3371 VERIFY3U(log_blocks_after, <=, log_blocks_before); 3372 3373 /* update log space map summary */ 3374 uint64_t blocks_gone = log_blocks_before - log_blocks_after; 3375 spa_log_summary_add_flushed_metaslab(spa); 3376 spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); 3377 spa_log_summary_decrement_blkcount(spa, blocks_gone); 3378 } 3379 3380 boolean_t 3381 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) 3382 { 3383 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3384 3385 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3386 ASSERT3U(spa_sync_pass(spa), ==, 1); 3387 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 3388 3389 ASSERT(msp->ms_sm != NULL); 3390 ASSERT(metaslab_unflushed_txg(msp) != 0); 3391 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); 3392 3393 /* 3394 * There is nothing wrong with flushing the same metaslab twice, as 3395 * this codepath should work on that case. However, the current 3396 * flushing scheme makes sure to avoid this situation as we would be 3397 * making all these calls without having anything meaningful to write 3398 * to disk. We assert this behavior here. 3399 */ 3400 ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); 3401 3402 /* 3403 * We can not flush while loading, because then we would 3404 * not load the ms_unflushed_{allocs,frees}. 3405 */ 3406 if (msp->ms_loading) 3407 return (B_FALSE); 3408 3409 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3410 metaslab_verify_weight_and_frag(msp); 3411 3412 /* 3413 * Metaslab condensing is effectively flushing. 
Therefore if the 3414 * metaslab can be condensed we can just condense it instead of 3415 * flushing it. 3416 * 3417 * Note that metaslab_condense() does call metaslab_flush_update() 3418 * so we can just return immediately after condensing. We also 3419 * don't need to care about setting ms_flushing or broadcasting 3420 * ms_flush_cv, even if we temporarily drop the ms_lock in 3421 * metaslab_condense(), as the metaslab is already loaded. 3422 */ 3423 if (msp->ms_loaded && metaslab_should_condense(msp)) { 3424 metaslab_group_t *mg = msp->ms_group; 3425 3426 /* 3427 * For all histogram operations below refer to the 3428 * comments of metaslab_sync() where we follow a 3429 * similar procedure. 3430 */ 3431 metaslab_group_histogram_verify(mg); 3432 metaslab_class_histogram_verify(mg->mg_class); 3433 metaslab_group_histogram_remove(mg, msp); 3434 3435 metaslab_condense(msp, tx); 3436 3437 space_map_histogram_clear(msp->ms_sm); 3438 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 3439 ASSERT(range_tree_is_empty(msp->ms_freed)); 3440 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3441 space_map_histogram_add(msp->ms_sm, 3442 msp->ms_defer[t], tx); 3443 } 3444 metaslab_aux_histograms_update(msp); 3445 3446 metaslab_group_histogram_add(mg, msp); 3447 metaslab_group_histogram_verify(mg); 3448 metaslab_class_histogram_verify(mg->mg_class); 3449 3450 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3451 3452 /* 3453 * Since we recreated the histogram (and potentially 3454 * the ms_sm too while condensing) ensure that the 3455 * weight is updated too because we are not guaranteed 3456 * that this metaslab is dirty and will go through 3457 * metaslab_sync_done(). 3458 */ 3459 metaslab_recalculate_weight_and_sort(msp); 3460 return (B_TRUE); 3461 } 3462 3463 msp->ms_flushing = B_TRUE; 3464 uint64_t sm_len_before = space_map_length(msp->ms_sm); 3465 3466 mutex_exit(&msp->ms_lock); 3467 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, 3468 SM_NO_VDEVID, tx); 3469 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, 3470 SM_NO_VDEVID, tx); 3471 mutex_enter(&msp->ms_lock); 3472 3473 uint64_t sm_len_after = space_map_length(msp->ms_sm); 3474 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 3475 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " 3476 "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " 3477 "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), 3478 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 3479 range_tree_space(msp->ms_unflushed_allocs), 3480 range_tree_space(msp->ms_unflushed_frees), 3481 (sm_len_after - sm_len_before)); 3482 } 3483 3484 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3485 metaslab_unflushed_changes_memused(msp)); 3486 spa->spa_unflushed_stats.sus_memused -= 3487 metaslab_unflushed_changes_memused(msp); 3488 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3489 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3490 3491 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3492 metaslab_verify_weight_and_frag(msp); 3493 3494 metaslab_flush_update(msp, tx); 3495 3496 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3497 metaslab_verify_weight_and_frag(msp); 3498 3499 msp->ms_flushing = B_FALSE; 3500 cv_broadcast(&msp->ms_flush_cv); 3501 return (B_TRUE); 3502 } 3503 3504 /* 3505 * Write a metaslab to disk in the context of the specified transaction group. 
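 *
 * As a rough outline of the steps below (illustrative, not exhaustive):
 *
 *	create/open ms_sm if needed and record it in vdev_ms_array;
 *	condense the space map when beneficial (sync pass 1 only);
 *	write this txg's allocs and frees either to the pool-wide log
 *	    space map or, without the feature, directly to ms_sm;
 *	refresh the on-disk and per-group histograms;
 *	in pass 1, swap ms_freeing into ms_freed for later deferral.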
3506 */ 3507 void 3508 metaslab_sync(metaslab_t *msp, uint64_t txg) 3509 { 3510 metaslab_group_t *mg = msp->ms_group; 3511 vdev_t *vd = mg->mg_vd; 3512 spa_t *spa = vd->vdev_spa; 3513 objset_t *mos = spa_meta_objset(spa); 3514 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 3515 dmu_tx_t *tx; 3516 3517 ASSERT(!vd->vdev_ishole); 3518 3519 /* 3520 * This metaslab has just been added so there's no work to do now. 3521 */ 3522 if (msp->ms_freeing == NULL) { 3523 ASSERT3P(alloctree, ==, NULL); 3524 return; 3525 } 3526 3527 ASSERT3P(alloctree, !=, NULL); 3528 ASSERT3P(msp->ms_freeing, !=, NULL); 3529 ASSERT3P(msp->ms_freed, !=, NULL); 3530 ASSERT3P(msp->ms_checkpointing, !=, NULL); 3531 ASSERT3P(msp->ms_trim, !=, NULL); 3532 3533 /* 3534 * Normally, we don't want to process a metaslab if there are no 3535 * allocations or frees to perform. However, if the metaslab is being 3536 * forced to condense, it's loaded and we're not beyond the final 3537 * dirty txg, we need to let it through. Not condensing beyond the 3538 * final dirty txg prevents an issue where metaslabs that need to be 3539 * condensed but were loaded for other reasons could cause a panic 3540 * here. By only checking the txg in that branch of the conditional, 3541 * we preserve the utility of the VERIFY statements in all other 3542 * cases. 3543 */ 3544 if (range_tree_is_empty(alloctree) && 3545 range_tree_is_empty(msp->ms_freeing) && 3546 range_tree_is_empty(msp->ms_checkpointing) && 3547 !(msp->ms_loaded && msp->ms_condense_wanted && 3548 txg <= spa_final_dirty_txg(spa))) 3549 return; 3550 3551 3552 VERIFY(txg <= spa_final_dirty_txg(spa)); 3553 3554 /* 3555 * The only state that can actually be changing concurrently 3556 * with metaslab_sync() is the metaslab's ms_allocatable. No 3557 * other thread can be modifying this txg's alloc, freeing, 3558 * freed, or space_map_phys_t. We drop ms_lock whenever we 3559 * could call into the DMU, because the DMU can call down to 3560 * us (e.g. via zio_free()) at any time. 3561 * 3562 * The spa_vdev_remove_thread() can be reading metaslab state 3563 * concurrently, and it is locked out by the ms_sync_lock. 3564 * Note that the ms_lock is insufficient for this, because it 3565 * is dropped by space_map_write(). 3566 */ 3567 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 3568 3569 /* 3570 * Generate a log space map if one doesn't exist already. 3571 */ 3572 spa_generate_syncing_log_sm(spa, tx); 3573 3574 if (msp->ms_sm == NULL) { 3575 uint64_t new_object = space_map_alloc(mos, 3576 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 
3577 zfs_metaslab_sm_blksz_with_log : 3578 zfs_metaslab_sm_blksz_no_log, tx); 3579 VERIFY3U(new_object, !=, 0); 3580 3581 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 3582 msp->ms_id, sizeof (uint64_t), &new_object, tx); 3583 3584 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 3585 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 3586 ASSERT(msp->ms_sm != NULL); 3587 3588 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3589 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3590 ASSERT0(metaslab_allocated_space(msp)); 3591 } 3592 3593 if (metaslab_unflushed_txg(msp) == 0 && 3594 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 3595 ASSERT(spa_syncing_log_sm(spa) != NULL); 3596 3597 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3598 spa_log_sm_increment_current_mscount(spa); 3599 spa_log_summary_add_flushed_metaslab(spa); 3600 3601 ASSERT(msp->ms_sm != NULL); 3602 mutex_enter(&spa->spa_flushed_ms_lock); 3603 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3604 mutex_exit(&spa->spa_flushed_ms_lock); 3605 3606 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3607 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3608 } 3609 3610 if (!range_tree_is_empty(msp->ms_checkpointing) && 3611 vd->vdev_checkpoint_sm == NULL) { 3612 ASSERT(spa_has_checkpoint(spa)); 3613 3614 uint64_t new_object = space_map_alloc(mos, 3615 zfs_vdev_standard_sm_blksz, tx); 3616 VERIFY3U(new_object, !=, 0); 3617 3618 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 3619 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 3620 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 3621 3622 /* 3623 * We save the space map object as an entry in vdev_top_zap 3624 * so it can be retrieved when the pool is reopened after an 3625 * export or through zdb. 3626 */ 3627 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 3628 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 3629 sizeof (new_object), 1, &new_object, tx)); 3630 } 3631 3632 mutex_enter(&msp->ms_sync_lock); 3633 mutex_enter(&msp->ms_lock); 3634 3635 /* 3636 * Note: metaslab_condense() clears the space map's histogram. 3637 * Therefore we must verify and remove this histogram before 3638 * condensing. 3639 */ 3640 metaslab_group_histogram_verify(mg); 3641 metaslab_class_histogram_verify(mg->mg_class); 3642 metaslab_group_histogram_remove(mg, msp); 3643 3644 if (spa->spa_sync_pass == 1 && msp->ms_loaded && 3645 metaslab_should_condense(msp)) 3646 metaslab_condense(msp, tx); 3647 3648 /* 3649 * We'll be going to disk to sync our space accounting, thus we 3650 * drop the ms_lock during that time so allocations coming from 3651 * open-context (ZIL) for future TXGs do not block. 
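 *
 * Outline of the two paths below (illustration only):
 *
 *	if (spa_syncing_log_sm(spa) != NULL)
 *		append ALLOC/FREE entries to the pool-wide log space map
 *		and fold them into ms_unflushed_{allocs,frees};
 *	else
 *		append ALLOC/FREE entries directly to this metaslab's
 *		ms_sm.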
3652 */ 3653 mutex_exit(&msp->ms_lock); 3654 space_map_t *log_sm = spa_syncing_log_sm(spa); 3655 if (log_sm != NULL) { 3656 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 3657 3658 space_map_write(log_sm, alloctree, SM_ALLOC, 3659 vd->vdev_id, tx); 3660 space_map_write(log_sm, msp->ms_freeing, SM_FREE, 3661 vd->vdev_id, tx); 3662 mutex_enter(&msp->ms_lock); 3663 3664 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3665 metaslab_unflushed_changes_memused(msp)); 3666 spa->spa_unflushed_stats.sus_memused -= 3667 metaslab_unflushed_changes_memused(msp); 3668 range_tree_remove_xor_add(alloctree, 3669 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); 3670 range_tree_remove_xor_add(msp->ms_freeing, 3671 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); 3672 spa->spa_unflushed_stats.sus_memused += 3673 metaslab_unflushed_changes_memused(msp); 3674 } else { 3675 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 3676 3677 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 3678 SM_NO_VDEVID, tx); 3679 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 3680 SM_NO_VDEVID, tx); 3681 mutex_enter(&msp->ms_lock); 3682 } 3683 3684 msp->ms_allocated_space += range_tree_space(alloctree); 3685 ASSERT3U(msp->ms_allocated_space, >=, 3686 range_tree_space(msp->ms_freeing)); 3687 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 3688 3689 if (!range_tree_is_empty(msp->ms_checkpointing)) { 3690 ASSERT(spa_has_checkpoint(spa)); 3691 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 3692 3693 /* 3694 * Since we are doing writes to disk and the ms_checkpointing 3695 * tree won't be changing during that time, we drop the 3696 * ms_lock while writing to the checkpoint space map, for the 3697 * same reason mentioned above. 3698 */ 3699 mutex_exit(&msp->ms_lock); 3700 space_map_write(vd->vdev_checkpoint_sm, 3701 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 3702 mutex_enter(&msp->ms_lock); 3703 3704 spa->spa_checkpoint_info.sci_dspace += 3705 range_tree_space(msp->ms_checkpointing); 3706 vd->vdev_stat.vs_checkpoint_space += 3707 range_tree_space(msp->ms_checkpointing); 3708 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 3709 -space_map_allocated(vd->vdev_checkpoint_sm)); 3710 3711 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 3712 } 3713 3714 if (msp->ms_loaded) { 3715 /* 3716 * When the space map is loaded, we have an accurate 3717 * histogram in the range tree. This gives us an opportunity 3718 * to bring the space map's histogram up-to-date so we clear 3719 * it first before updating it. 3720 */ 3721 space_map_histogram_clear(msp->ms_sm); 3722 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 3723 3724 /* 3725 * Since we've cleared the histogram we need to add back 3726 * any free space that has already been processed, plus 3727 * any deferred space. This allows the on-disk histogram 3728 * to accurately reflect all free space even if some space 3729 * is not yet available for allocation (i.e. deferred). 3730 */ 3731 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 3732 3733 /* 3734 * Add back any deferred free space that has not been 3735 * added back into the in-core free tree yet. This will 3736 * ensure that we don't end up with a space map histogram 3737 * that is completely empty unless the metaslab is fully 3738 * allocated. 
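 *
 * Put differently, once the loop below has run, the on-disk histogram
 * reflects ms_allocatable + ms_freed + ms_defer[0..TXG_DEFER_SIZE-1];
 * ms_freeing is added right after so that all free space is covered.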
3739 */ 3740 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3741 space_map_histogram_add(msp->ms_sm, 3742 msp->ms_defer[t], tx); 3743 } 3744 } 3745 3746 /* 3747 * Always add the free space from this sync pass to the space 3748 * map histogram. We want to make sure that the on-disk histogram 3749 * accounts for all free space. If the space map is not loaded, 3750 * then we will lose some accuracy but will correct it the next 3751 * time we load the space map. 3752 */ 3753 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 3754 metaslab_aux_histograms_update(msp); 3755 3756 metaslab_group_histogram_add(mg, msp); 3757 metaslab_group_histogram_verify(mg); 3758 metaslab_class_histogram_verify(mg->mg_class); 3759 3760 /* 3761 * For sync pass 1, we avoid traversing this txg's free range tree 3762 * and instead will just swap the pointers for freeing and freed. 3763 * We can safely do this since the freed_tree is guaranteed to be 3764 * empty on the initial pass. 3765 * 3766 * Keep in mind that even if we are currently using a log spacemap 3767 * we want current frees to end up in the ms_allocatable (but not 3768 * get appended to the ms_sm) so their ranges can be reused as usual. 3769 */ 3770 if (spa_sync_pass(spa) == 1) { 3771 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 3772 ASSERT0(msp->ms_allocated_this_txg); 3773 } else { 3774 range_tree_vacate(msp->ms_freeing, 3775 range_tree_add, msp->ms_freed); 3776 } 3777 msp->ms_allocated_this_txg += range_tree_space(alloctree); 3778 range_tree_vacate(alloctree, NULL, NULL); 3779 3780 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3781 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 3782 & TXG_MASK])); 3783 ASSERT0(range_tree_space(msp->ms_freeing)); 3784 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3785 3786 mutex_exit(&msp->ms_lock); 3787 3788 /* 3789 * Verify that the space map object ID has been recorded in the 3790 * vdev_ms_array. 3791 */ 3792 uint64_t object; 3793 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 3794 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); 3795 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); 3796 3797 mutex_exit(&msp->ms_sync_lock); 3798 dmu_tx_commit(tx); 3799 } 3800 3801 static void 3802 metaslab_evict(metaslab_t *msp, uint64_t txg) 3803 { 3804 if (!msp->ms_loaded || msp->ms_disabled != 0) 3805 return; 3806 3807 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 3808 VERIFY0(range_tree_space( 3809 msp->ms_allocating[(txg + t) & TXG_MASK])); 3810 } 3811 if (msp->ms_allocator != -1) 3812 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); 3813 3814 if (!metaslab_debug_unload) 3815 metaslab_unload(msp); 3816 } 3817 3818 /* 3819 * Called after a transaction group has completely synced to mark 3820 * all of the metaslab's free space as usable. 3821 */ 3822 void 3823 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 3824 { 3825 metaslab_group_t *mg = msp->ms_group; 3826 vdev_t *vd = mg->mg_vd; 3827 spa_t *spa = vd->vdev_spa; 3828 range_tree_t **defer_tree; 3829 int64_t alloc_delta, defer_delta; 3830 boolean_t defer_allowed = B_TRUE; 3831 3832 ASSERT(!vd->vdev_ishole); 3833 3834 mutex_enter(&msp->ms_lock); 3835 3836 /* 3837 * If this metaslab is just becoming available, initialize its 3838 * range trees and add its capacity to the vdev. 
3839 */ 3840 if (msp->ms_freed == NULL) { 3841 for (int t = 0; t < TXG_SIZE; t++) { 3842 ASSERT(msp->ms_allocating[t] == NULL); 3843 3844 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 3845 } 3846 3847 ASSERT3P(msp->ms_freeing, ==, NULL); 3848 msp->ms_freeing = range_tree_create(NULL, NULL); 3849 3850 ASSERT3P(msp->ms_freed, ==, NULL); 3851 msp->ms_freed = range_tree_create(NULL, NULL); 3852 3853 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3854 ASSERT3P(msp->ms_defer[t], ==, NULL); 3855 msp->ms_defer[t] = range_tree_create(NULL, NULL); 3856 } 3857 3858 ASSERT3P(msp->ms_checkpointing, ==, NULL); 3859 msp->ms_checkpointing = range_tree_create(NULL, NULL); 3860 3861 ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); 3862 msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); 3863 ASSERT3P(msp->ms_unflushed_frees, ==, NULL); 3864 msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, 3865 &msp->ms_unflushed_frees_by_size, 3866 metaslab_rangesize_compare, 0); 3867 3868 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 3869 } 3870 ASSERT0(range_tree_space(msp->ms_freeing)); 3871 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3872 3873 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 3874 3875 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 3876 metaslab_class_get_alloc(spa_normal_class(spa)); 3877 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 3878 defer_allowed = B_FALSE; 3879 } 3880 3881 defer_delta = 0; 3882 alloc_delta = msp->ms_allocated_this_txg - 3883 range_tree_space(msp->ms_freed); 3884 3885 if (defer_allowed) { 3886 defer_delta = range_tree_space(msp->ms_freed) - 3887 range_tree_space(*defer_tree); 3888 } else { 3889 defer_delta -= range_tree_space(*defer_tree); 3890 } 3891 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 3892 defer_delta, 0); 3893 3894 if (spa_syncing_log_sm(spa) == NULL) { 3895 /* 3896 * If there's a metaslab_load() in progress and we don't have 3897 * a log space map, it means that we probably wrote to the 3898 * metaslab's space map. If this is the case, we need to 3899 * make sure that we wait for the load to complete so that we 3900 * have a consistent view at the in-core side of the metaslab. 3901 */ 3902 metaslab_load_wait(msp); 3903 } else { 3904 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 3905 } 3906 3907 /* 3908 * When auto-trimming is enabled, free ranges which are added to 3909 * ms_allocatable are also be added to ms_trim. The ms_trim tree is 3910 * periodically consumed by the vdev_autotrim_thread() which issues 3911 * trims for all ranges and then vacates the tree. The ms_trim tree 3912 * can be discarded at any time with the sole consequence of recent 3913 * frees not being trimmed. 3914 */ 3915 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { 3916 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); 3917 if (!defer_allowed) { 3918 range_tree_walk(msp->ms_freed, range_tree_add, 3919 msp->ms_trim); 3920 } 3921 } else { 3922 range_tree_vacate(msp->ms_trim, NULL, NULL); 3923 } 3924 3925 /* 3926 * Move the frees from the defer_tree back to the free 3927 * range tree (if it's loaded). Swap the freed_tree and 3928 * the defer_tree -- this is safe to do because we've 3929 * just emptied out the defer_tree. 3930 */ 3931 range_tree_vacate(*defer_tree, 3932 msp->ms_loaded ? 
range_tree_add : NULL, msp->ms_allocatable); 3933 if (defer_allowed) { 3934 range_tree_swap(&msp->ms_freed, defer_tree); 3935 } else { 3936 range_tree_vacate(msp->ms_freed, 3937 msp->ms_loaded ? range_tree_add : NULL, 3938 msp->ms_allocatable); 3939 } 3940 3941 msp->ms_synced_length = space_map_length(msp->ms_sm); 3942 3943 msp->ms_deferspace += defer_delta; 3944 ASSERT3S(msp->ms_deferspace, >=, 0); 3945 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 3946 if (msp->ms_deferspace != 0) { 3947 /* 3948 * Keep syncing this metaslab until all deferred frees 3949 * are back in circulation. 3950 */ 3951 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 3952 } 3953 metaslab_aux_histograms_update_done(msp, defer_allowed); 3954 3955 if (msp->ms_new) { 3956 msp->ms_new = B_FALSE; 3957 mutex_enter(&mg->mg_lock); 3958 mg->mg_ms_ready++; 3959 mutex_exit(&mg->mg_lock); 3960 } 3961 3962 /* 3963 * Re-sort metaslab within its group now that we've adjusted 3964 * its allocatable space. 3965 */ 3966 metaslab_recalculate_weight_and_sort(msp); 3967 3968 /* 3969 * If the metaslab is loaded and we've not tried to load or allocate 3970 * from it in 'metaslab_unload_delay' txgs, then unload it. 3971 */ 3972 if (msp->ms_loaded && 3973 msp->ms_disabled == 0 && 3974 msp->ms_selected_txg + metaslab_unload_delay < txg) { 3975 3976 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 3977 VERIFY0(range_tree_space( 3978 msp->ms_allocating[(txg + t) & TXG_MASK])); 3979 } 3980 if (msp->ms_allocator != -1) { 3981 metaslab_passivate(msp, msp->ms_weight & 3982 ~METASLAB_ACTIVE_MASK); 3983 } 3984 3985 if (!metaslab_debug_unload) 3986 metaslab_unload(msp); 3987 } 3988 3989 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3990 ASSERT0(range_tree_space(msp->ms_freeing)); 3991 ASSERT0(range_tree_space(msp->ms_freed)); 3992 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3993 msp->ms_allocating_total -= msp->ms_allocated_this_txg; 3994 msp->ms_allocated_this_txg = 0; 3995 mutex_exit(&msp->ms_lock); 3996 } 3997 3998 void 3999 metaslab_sync_reassess(metaslab_group_t *mg) 4000 { 4001 spa_t *spa = mg->mg_class->mc_spa; 4002 4003 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4004 metaslab_group_alloc_update(mg); 4005 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 4006 4007 /* 4008 * Preload the next potential metaslabs but only on active 4009 * metaslab groups. We can get into a state where the metaslab 4010 * is no longer active since we dirty metaslabs as we remove a 4011 * a device, thus potentially making the metaslab group eligible 4012 * for preloading. 4013 */ 4014 if (mg->mg_activation_count > 0) { 4015 metaslab_group_preload(mg); 4016 } 4017 spa_config_exit(spa, SCL_ALLOC, FTAG); 4018 } 4019 4020 /* 4021 * When writing a ditto block (i.e. more than one DVA for a given BP) on 4022 * the same vdev as an existing DVA of this BP, then try to allocate it 4023 * on a different metaslab than existing DVAs (i.e. a unique metaslab). 
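 *
 * Conceptually, the check below is (simplified sketch):
 *
 *	unique iff DVA_GET_ASIZE(dva) == 0 ||
 *	    DVA_GET_VDEV(dva) != vdev_id ||
 *	    (DVA_GET_OFFSET(dva) >> vdev_ms_shift) != ms_id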
4024 */ 4025 static boolean_t 4026 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 4027 { 4028 uint64_t dva_ms_id; 4029 4030 if (DVA_GET_ASIZE(dva) == 0) 4031 return (B_TRUE); 4032 4033 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 4034 return (B_TRUE); 4035 4036 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 4037 4038 return (msp->ms_id != dva_ms_id); 4039 } 4040 4041 /* 4042 * ========================================================================== 4043 * Metaslab allocation tracing facility 4044 * ========================================================================== 4045 */ 4046 kstat_t *metaslab_trace_ksp; 4047 kstat_named_t metaslab_trace_over_limit; 4048 4049 void 4050 metaslab_alloc_trace_init(void) 4051 { 4052 ASSERT(metaslab_alloc_trace_cache == NULL); 4053 metaslab_alloc_trace_cache = kmem_cache_create( 4054 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 4055 0, NULL, NULL, NULL, NULL, NULL, 0); 4056 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 4057 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 4058 if (metaslab_trace_ksp != NULL) { 4059 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 4060 kstat_named_init(&metaslab_trace_over_limit, 4061 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 4062 kstat_install(metaslab_trace_ksp); 4063 } 4064 } 4065 4066 void 4067 metaslab_alloc_trace_fini(void) 4068 { 4069 if (metaslab_trace_ksp != NULL) { 4070 kstat_delete(metaslab_trace_ksp); 4071 metaslab_trace_ksp = NULL; 4072 } 4073 kmem_cache_destroy(metaslab_alloc_trace_cache); 4074 metaslab_alloc_trace_cache = NULL; 4075 } 4076 4077 /* 4078 * Add an allocation trace element to the allocation tracing list. 4079 */ 4080 static void 4081 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 4082 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 4083 int allocator) 4084 { 4085 if (!metaslab_trace_enabled) 4086 return; 4087 4088 /* 4089 * When the tracing list reaches its maximum we remove 4090 * the second element in the list before adding a new one. 4091 * By removing the second element we preserve the original 4092 * entry as a clue to what allocations steps have already been 4093 * performed. 4094 */ 4095 if (zal->zal_size == metaslab_trace_max_entries) { 4096 metaslab_alloc_trace_t *mat_next; 4097 #ifdef DEBUG 4098 panic("too many entries in allocation list"); 4099 #endif 4100 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 4101 zal->zal_size--; 4102 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 4103 list_remove(&zal->zal_list, mat_next); 4104 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 4105 } 4106 4107 metaslab_alloc_trace_t *mat = 4108 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 4109 list_link_init(&mat->mat_list_node); 4110 mat->mat_mg = mg; 4111 mat->mat_msp = msp; 4112 mat->mat_size = psize; 4113 mat->mat_dva_id = dva_id; 4114 mat->mat_offset = offset; 4115 mat->mat_weight = 0; 4116 mat->mat_allocator = allocator; 4117 4118 if (msp != NULL) 4119 mat->mat_weight = msp->ms_weight; 4120 4121 /* 4122 * The list is part of the zio so locking is not required. Only 4123 * a single thread will perform allocations for a given zio. 
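 *
 * For reference, the expected lifecycle from the caller's side is
 * roughly (illustration only):
 *
 *	metaslab_trace_init(&zal);
 *	... allocation attempts record entries via metaslab_trace_add() ...
 *	metaslab_trace_fini(&zal);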
4124 */ 4125 list_insert_tail(&zal->zal_list, mat); 4126 zal->zal_size++; 4127 4128 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 4129 } 4130 4131 void 4132 metaslab_trace_init(zio_alloc_list_t *zal) 4133 { 4134 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 4135 offsetof(metaslab_alloc_trace_t, mat_list_node)); 4136 zal->zal_size = 0; 4137 } 4138 4139 void 4140 metaslab_trace_fini(zio_alloc_list_t *zal) 4141 { 4142 metaslab_alloc_trace_t *mat; 4143 4144 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 4145 kmem_cache_free(metaslab_alloc_trace_cache, mat); 4146 list_destroy(&zal->zal_list); 4147 zal->zal_size = 0; 4148 } 4149 4150 /* 4151 * ========================================================================== 4152 * Metaslab block operations 4153 * ========================================================================== 4154 */ 4155 4156 static void 4157 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 4158 int allocator) 4159 { 4160 if (!(flags & METASLAB_ASYNC_ALLOC) || 4161 (flags & METASLAB_DONT_THROTTLE)) 4162 return; 4163 4164 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4165 if (!mg->mg_class->mc_alloc_throttle_enabled) 4166 return; 4167 4168 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 4169 } 4170 4171 static void 4172 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 4173 { 4174 uint64_t max = mg->mg_max_alloc_queue_depth; 4175 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 4176 while (cur < max) { 4177 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 4178 cur, cur + 1) == cur) { 4179 atomic_inc_64( 4180 &mg->mg_class->mc_alloc_max_slots[allocator]); 4181 return; 4182 } 4183 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 4184 } 4185 } 4186 4187 void 4188 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 4189 int allocator, boolean_t io_complete) 4190 { 4191 if (!(flags & METASLAB_ASYNC_ALLOC) || 4192 (flags & METASLAB_DONT_THROTTLE)) 4193 return; 4194 4195 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4196 if (!mg->mg_class->mc_alloc_throttle_enabled) 4197 return; 4198 4199 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 4200 if (io_complete) 4201 metaslab_group_increment_qdepth(mg, allocator); 4202 } 4203 4204 void 4205 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 4206 int allocator) 4207 { 4208 #ifdef ZFS_DEBUG 4209 const dva_t *dva = bp->blk_dva; 4210 int ndvas = BP_GET_NDVAS(bp); 4211 4212 for (int d = 0; d < ndvas; d++) { 4213 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 4214 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 4215 VERIFY(zfs_refcount_not_held( 4216 &mg->mg_alloc_queue_depth[allocator], tag)); 4217 } 4218 #endif 4219 } 4220 4221 static uint64_t 4222 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 4223 { 4224 uint64_t start; 4225 range_tree_t *rt = msp->ms_allocatable; 4226 metaslab_class_t *mc = msp->ms_group->mg_class; 4227 4228 ASSERT(MUTEX_HELD(&msp->ms_lock)); 4229 VERIFY(!msp->ms_condensing); 4230 VERIFY0(msp->ms_disabled); 4231 4232 start = mc->mc_ops->msop_alloc(msp, size); 4233 if (start != -1ULL) { 4234 metaslab_group_t *mg = msp->ms_group; 4235 vdev_t *vd = mg->mg_vd; 4236 4237 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 4238 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4239 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 4240 range_tree_remove(rt, start, size); 
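		/*
		 * The range we just removed from ms_allocatable must no
		 * longer be considered for TRIM, and is recorded in this
		 * txg's ms_allocating tree (dirtying the metaslab if this
		 * is its first allocation for the txg).
		 */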
4241 range_tree_clear(msp->ms_trim, start, size); 4242 4243 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4244 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 4245 4246 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 4247 msp->ms_allocating_total += size; 4248 4249 /* Track the last successful allocation */ 4250 msp->ms_alloc_txg = txg; 4251 metaslab_verify_space(msp, txg); 4252 } 4253 4254 /* 4255 * Now that we've attempted the allocation we need to update the 4256 * metaslab's maximum block size since it may have changed. 4257 */ 4258 msp->ms_max_size = metaslab_largest_allocatable(msp); 4259 return (start); 4260 } 4261 4262 /* 4263 * Find the metaslab with the highest weight that is less than what we've 4264 * already tried. In the common case, this means that we will examine each 4265 * metaslab at most once. Note that concurrent callers could reorder metaslabs 4266 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 4267 * activated by another thread, and we fail to allocate from the metaslab we 4268 * have selected, we may not try the newly-activated metaslab, and instead 4269 * activate another metaslab. This is not optimal, but generally does not cause 4270 * any problems (a possible exception being if every metaslab is completely full 4271 * except for the newly-activated metaslab which we fail to examine). 4272 */ 4273 static metaslab_t * 4274 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 4275 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 4276 boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, 4277 boolean_t *was_active) 4278 { 4279 avl_index_t idx; 4280 avl_tree_t *t = &mg->mg_metaslab_tree; 4281 metaslab_t *msp = avl_find(t, search, &idx); 4282 if (msp == NULL) 4283 msp = avl_nearest(t, idx, AVL_AFTER); 4284 4285 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 4286 int i; 4287 if (!metaslab_should_allocate(msp, asize, try_hard)) { 4288 metaslab_trace_add(zal, mg, msp, asize, d, 4289 TRACE_TOO_SMALL, allocator); 4290 continue; 4291 } 4292 4293 /* 4294 * If the selected metaslab is condensing or disabled, 4295 * skip it. 4296 */ 4297 if (msp->ms_condensing || msp->ms_disabled > 0) 4298 continue; 4299 4300 *was_active = msp->ms_allocator != -1; 4301 /* 4302 * If we're activating as primary, this is our first allocation 4303 * from this disk, so we don't need to check how close we are. 4304 * If the metaslab under consideration was already active, 4305 * we're getting desperate enough to steal another allocator's 4306 * metaslab, so we still don't care about distances.
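 *
 * Otherwise (we are activating an inactive metaslab as a secondary or
 * for claiming), the loop below rejects this metaslab if want_unique
 * is set and any already-chosen DVA of this BP maps to it.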
4307 */ 4308 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 4309 break; 4310 4311 for (i = 0; i < d; i++) { 4312 if (want_unique && 4313 !metaslab_is_unique(msp, &dva[i])) 4314 break; /* try another metaslab */ 4315 } 4316 if (i == d) 4317 break; 4318 } 4319 4320 if (msp != NULL) { 4321 search->ms_weight = msp->ms_weight; 4322 search->ms_start = msp->ms_start + 1; 4323 search->ms_allocator = msp->ms_allocator; 4324 search->ms_primary = msp->ms_primary; 4325 } 4326 return (msp); 4327 } 4328 4329 void 4330 metaslab_active_mask_verify(metaslab_t *msp) 4331 { 4332 ASSERT(MUTEX_HELD(&msp->ms_lock)); 4333 4334 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 4335 return; 4336 4337 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) 4338 return; 4339 4340 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { 4341 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 4342 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); 4343 VERIFY3S(msp->ms_allocator, !=, -1); 4344 VERIFY(msp->ms_primary); 4345 return; 4346 } 4347 4348 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { 4349 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 4350 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); 4351 VERIFY3S(msp->ms_allocator, !=, -1); 4352 VERIFY(!msp->ms_primary); 4353 return; 4354 } 4355 4356 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 4357 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 4358 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 4359 VERIFY3S(msp->ms_allocator, ==, -1); 4360 return; 4361 } 4362 } 4363 4364 /* ARGSUSED */ 4365 static uint64_t 4366 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 4367 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, 4368 int allocator, boolean_t try_hard) 4369 { 4370 metaslab_t *msp = NULL; 4371 uint64_t offset = -1ULL; 4372 4373 uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; 4374 for (int i = 0; i < d; i++) { 4375 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 4376 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 4377 activation_weight = METASLAB_WEIGHT_SECONDARY; 4378 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 4379 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 4380 activation_weight = METASLAB_WEIGHT_CLAIM; 4381 break; 4382 } 4383 } 4384 4385 /* 4386 * If we don't have enough metaslabs active to fill the entire array, we 4387 * just use the 0th slot. 4388 */ 4389 if (mg->mg_ms_ready < mg->mg_allocators * 3) 4390 allocator = 0; 4391 4392 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 4393 4394 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 4395 search->ms_weight = UINT64_MAX; 4396 search->ms_start = 0; 4397 /* 4398 * At the end of the metaslab tree are the already-active metaslabs, 4399 * first the primaries, then the secondaries. When we resume searching 4400 * through the tree, we need to consider ms_allocator and ms_primary so 4401 * we start in the location right after where we left off, and don't 4402 * accidentally loop forever considering the same metaslabs. 4403 */ 4404 search->ms_allocator = -1; 4405 search->ms_primary = B_TRUE; 4406 for (;;) { 4407 boolean_t was_active = B_FALSE; 4408 4409 mutex_enter(&mg->mg_lock); 4410 4411 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 4412 mg->mg_primaries[allocator] != NULL) { 4413 msp = mg->mg_primaries[allocator]; 4414 4415 /* 4416 * Even though we don't hold the ms_lock for the 4417 * primary metaslab, those fields should not 4418 * change while we hold the mg_lock. 
Thus it is 4419 * safe to make assertions on them. 4420 */ 4421 ASSERT(msp->ms_primary); 4422 ASSERT3S(msp->ms_allocator, ==, allocator); 4423 ASSERT(msp->ms_loaded); 4424 4425 was_active = B_TRUE; 4426 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 4427 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 4428 mg->mg_secondaries[allocator] != NULL) { 4429 msp = mg->mg_secondaries[allocator]; 4430 4431 /* 4432 * See comment above about the similar assertions 4433 * for the primary metaslab. 4434 */ 4435 ASSERT(!msp->ms_primary); 4436 ASSERT3S(msp->ms_allocator, ==, allocator); 4437 ASSERT(msp->ms_loaded); 4438 4439 was_active = B_TRUE; 4440 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 4441 } else { 4442 msp = find_valid_metaslab(mg, activation_weight, dva, d, 4443 want_unique, asize, allocator, try_hard, zal, 4444 search, &was_active); 4445 } 4446 4447 mutex_exit(&mg->mg_lock); 4448 if (msp == NULL) { 4449 kmem_free(search, sizeof (*search)); 4450 return (-1ULL); 4451 } 4452 mutex_enter(&msp->ms_lock); 4453 4454 metaslab_active_mask_verify(msp); 4455 4456 /* 4457 * This code is disabled because of issues with 4458 * tracepoints in non-GPL kernel modules. 4459 */ 4460 #if 0 4461 DTRACE_PROBE3(ms__activation__attempt, 4462 metaslab_t *, msp, uint64_t, activation_weight, 4463 boolean_t, was_active); 4464 #endif 4465 4466 /* 4467 * Ensure that the metaslab we have selected is still 4468 * capable of handling our request. It's possible that 4469 * another thread may have changed the weight while we 4470 * were blocked on the metaslab lock. We check the 4471 * active status first to see if we need to select 4472 * a new metaslab. 4473 */ 4474 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 4475 ASSERT3S(msp->ms_allocator, ==, -1); 4476 mutex_exit(&msp->ms_lock); 4477 continue; 4478 } 4479 4480 /* 4481 * If the metaslab was activated for another allocator 4482 * while we were waiting on the ms_lock above, or it's 4483 * a primary and we're seeking a secondary (or vice versa), 4484 * we go back and select a new metaslab. 4485 */ 4486 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 4487 (msp->ms_allocator != -1) && 4488 (msp->ms_allocator != allocator || ((activation_weight == 4489 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 4490 ASSERT(msp->ms_loaded); 4491 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || 4492 msp->ms_allocator != -1); 4493 mutex_exit(&msp->ms_lock); 4494 continue; 4495 } 4496 4497 /* 4498 * This metaslab was used for claiming regions allocated 4499 * by the ZIL during pool import. Once these regions are 4500 * claimed we don't need to keep the CLAIM bit set 4501 * anymore. Passivate this metaslab to zero its activation 4502 * mask.
4503 */ 4504 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 4505 activation_weight != METASLAB_WEIGHT_CLAIM) { 4506 ASSERT(msp->ms_loaded); 4507 ASSERT3S(msp->ms_allocator, ==, -1); 4508 metaslab_passivate(msp, msp->ms_weight & 4509 ~METASLAB_WEIGHT_CLAIM); 4510 mutex_exit(&msp->ms_lock); 4511 continue; 4512 } 4513 4514 metaslab_set_selected_txg(msp, txg); 4515 4516 int activation_error = 4517 metaslab_activate(msp, allocator, activation_weight); 4518 metaslab_active_mask_verify(msp); 4519 4520 /* 4521 * If the metaslab was activated by another thread for 4522 * another allocator or activation_weight (EBUSY), or it 4523 * failed because another metaslab was assigned as primary 4524 * for this allocator (EEXIST) we continue using this 4525 * metaslab for our allocation, rather than going on to a 4526 * worse metaslab (we waited for that metaslab to be loaded 4527 * after all). 4528 * 4529 * If the activation failed due to an I/O error or ENOSPC we 4530 * skip to the next metaslab. 4531 */ 4532 boolean_t activated; 4533 if (activation_error == 0) { 4534 activated = B_TRUE; 4535 } else if (activation_error == EBUSY || 4536 activation_error == EEXIST) { 4537 activated = B_FALSE; 4538 } else { 4539 mutex_exit(&msp->ms_lock); 4540 continue; 4541 } 4542 ASSERT(msp->ms_loaded); 4543 4544 /* 4545 * Now that we have the lock, recheck to see if we should 4546 * continue to use this metaslab for this allocation. The 4547 * the metaslab is now loaded so metaslab_should_allocate() 4548 * can accurately determine if the allocation attempt should 4549 * proceed. 4550 */ 4551 if (!metaslab_should_allocate(msp, asize, try_hard)) { 4552 /* Passivate this metaslab and select a new one. */ 4553 metaslab_trace_add(zal, mg, msp, asize, d, 4554 TRACE_TOO_SMALL, allocator); 4555 goto next; 4556 } 4557 4558 /* 4559 * If this metaslab is currently condensing then pick again 4560 * as we can't manipulate this metaslab until it's committed 4561 * to disk. If this metaslab is being initialized, we shouldn't 4562 * allocate from it since the allocated region might be 4563 * overwritten after allocation. 4564 */ 4565 if (msp->ms_condensing) { 4566 metaslab_trace_add(zal, mg, msp, asize, d, 4567 TRACE_CONDENSING, allocator); 4568 if (activated) { 4569 metaslab_passivate(msp, msp->ms_weight & 4570 ~METASLAB_ACTIVE_MASK); 4571 } 4572 mutex_exit(&msp->ms_lock); 4573 continue; 4574 } else if (msp->ms_disabled > 0) { 4575 metaslab_trace_add(zal, mg, msp, asize, d, 4576 TRACE_DISABLED, allocator); 4577 if (activated) { 4578 metaslab_passivate(msp, msp->ms_weight & 4579 ~METASLAB_ACTIVE_MASK); 4580 } 4581 mutex_exit(&msp->ms_lock); 4582 continue; 4583 } 4584 4585 offset = metaslab_block_alloc(msp, asize, txg); 4586 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 4587 4588 if (offset != -1ULL) { 4589 /* Proactively passivate the metaslab, if needed */ 4590 if (activated) 4591 metaslab_segment_may_passivate(msp); 4592 break; 4593 } 4594 next: 4595 ASSERT(msp->ms_loaded); 4596 4597 /* 4598 * This code is disabled out because of issues with 4599 * tracepoints in non-gpl kernel modules. 4600 */ 4601 #if 0 4602 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, 4603 uint64_t, asize); 4604 #endif 4605 4606 /* 4607 * We were unable to allocate from this metaslab so determine 4608 * a new weight for this metaslab. Now that we have loaded 4609 * the metaslab we can provide a better hint to the metaslab 4610 * selector. 4611 * 4612 * For space-based metaslabs, we use the maximum block size. 
4613 * This information is only available when the metaslab 4614 * is loaded and is more accurate than the generic free 4615 * space weight that was calculated by metaslab_weight(). 4616 * This information allows us to quickly compare the maximum 4617 * available allocation in the metaslab to the allocation 4618 * size being requested. 4619 * 4620 * For segment-based metaslabs, determine the new weight 4621 * based on the highest bucket in the range tree. We 4622 * explicitly use the loaded segment weight (i.e. the range 4623 * tree histogram) since it contains the space that is 4624 * currently available for allocation and is accurate 4625 * even within a sync pass. 4626 */ 4627 uint64_t weight; 4628 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 4629 weight = metaslab_largest_allocatable(msp); 4630 WEIGHT_SET_SPACEBASED(weight); 4631 } else { 4632 weight = metaslab_weight_from_range_tree(msp); 4633 } 4634 4635 if (activated) { 4636 metaslab_passivate(msp, weight); 4637 } else { 4638 /* 4639 * For the case where we use the metaslab that is 4640 * active for another allocator we want to make 4641 * sure that we retain the activation mask. 4642 * 4643 * Note that we could attempt to use something like 4644 * metaslab_recalculate_weight_and_sort() that 4645 * retains the activation mask here. That function 4646 * uses metaslab_weight() to set the weight though 4647 * which is not as accurate as the calculations 4648 * above. 4649 */ 4650 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; 4651 metaslab_group_sort(mg, msp, weight); 4652 } 4653 metaslab_active_mask_verify(msp); 4654 4655 /* 4656 * We have just failed an allocation attempt, check 4657 * that metaslab_should_allocate() agrees. Otherwise, 4658 * we may end up in an infinite loop retrying the same 4659 * metaslab. 4660 */ 4661 ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); 4662 4663 mutex_exit(&msp->ms_lock); 4664 } 4665 mutex_exit(&msp->ms_lock); 4666 kmem_free(search, sizeof (*search)); 4667 return (offset); 4668 } 4669 4670 static uint64_t 4671 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 4672 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, 4673 int allocator, boolean_t try_hard) 4674 { 4675 uint64_t offset; 4676 ASSERT(mg->mg_initialized); 4677 4678 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 4679 dva, d, allocator, try_hard); 4680 4681 mutex_enter(&mg->mg_lock); 4682 if (offset == -1ULL) { 4683 mg->mg_failed_allocations++; 4684 metaslab_trace_add(zal, mg, NULL, asize, d, 4685 TRACE_GROUP_FAILURE, allocator); 4686 if (asize == SPA_GANGBLOCKSIZE) { 4687 /* 4688 * This metaslab group was unable to allocate 4689 * the minimum gang block size so it must be out of 4690 * space. We must notify the allocation throttle 4691 * to start skipping allocation attempts to this 4692 * metaslab group until more space becomes available. 4693 * Note: this failure cannot be caused by the 4694 * allocation throttle since the allocation throttle 4695 * is only responsible for skipping devices and 4696 * not failing block allocations. 4697 */ 4698 mg->mg_no_free_space = B_TRUE; 4699 } 4700 } 4701 mg->mg_allocations++; 4702 mutex_exit(&mg->mg_lock); 4703 return (offset); 4704 } 4705 4706 /* 4707 * Allocate a block for the specified i/o. 
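 *
 * On success 0 is returned and the DVA at index 'd' is filled in,
 * i.e. (sketch of the code further below):
 *
 *	DVA_SET_VDEV(&dva[d], vd->vdev_id);
 *	DVA_SET_OFFSET(&dva[d], offset);
 *	DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
 *	DVA_SET_ASIZE(&dva[d], asize);
 *
 * If no space can be found, ENOSPC is returned.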
4708 */ 4709 int 4710 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 4711 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 4712 zio_alloc_list_t *zal, int allocator) 4713 { 4714 metaslab_group_t *mg, *rotor; 4715 vdev_t *vd; 4716 boolean_t try_hard = B_FALSE; 4717 4718 ASSERT(!DVA_IS_VALID(&dva[d])); 4719 4720 /* 4721 * For testing, make some blocks above a certain size be gang blocks. 4722 * This will also test spilling from special to normal. 4723 */ 4724 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 4725 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 4726 allocator); 4727 return (SET_ERROR(ENOSPC)); 4728 } 4729 4730 /* 4731 * Start at the rotor and loop through all mgs until we find something. 4732 * Note that there's no locking on mc_rotor or mc_aliquot because 4733 * nothing actually breaks if we miss a few updates -- we just won't 4734 * allocate quite as evenly. It all balances out over time. 4735 * 4736 * If we are doing ditto or log blocks, try to spread them across 4737 * consecutive vdevs. If we're forced to reuse a vdev before we've 4738 * allocated all of our ditto blocks, then try and spread them out on 4739 * that vdev as much as possible. If it turns out to not be possible, 4740 * gradually lower our standards until anything becomes acceptable. 4741 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 4742 * gives us hope of containing our fault domains to something we're 4743 * able to reason about. Otherwise, any two top-level vdev failures 4744 * will guarantee the loss of data. With consecutive allocation, 4745 * only two adjacent top-level vdev failures will result in data loss. 4746 * 4747 * If we are doing gang blocks (hintdva is non-NULL), try to keep 4748 * ourselves on the same vdev as our gang block header. That 4749 * way, we can hope for locality in vdev_cache, plus it makes our 4750 * fault domains something tractable. 4751 */ 4752 if (hintdva) { 4753 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 4754 4755 /* 4756 * It's possible the vdev we're using as the hint no 4757 * longer exists or its mg has been closed (e.g. by 4758 * device removal). Consult the rotor when 4759 * all else fails. 4760 */ 4761 if (vd != NULL && vd->vdev_mg != NULL) { 4762 mg = vd->vdev_mg; 4763 4764 if (flags & METASLAB_HINTBP_AVOID && 4765 mg->mg_next != NULL) 4766 mg = mg->mg_next; 4767 } else { 4768 mg = mc->mc_rotor; 4769 } 4770 } else if (d != 0) { 4771 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 4772 mg = vd->vdev_mg->mg_next; 4773 } else { 4774 ASSERT(mc->mc_rotor != NULL); 4775 mg = mc->mc_rotor; 4776 } 4777 4778 /* 4779 * If the hint put us into the wrong metaslab class, or into a 4780 * metaslab group that has been passivated, just follow the rotor. 4781 */ 4782 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 4783 mg = mc->mc_rotor; 4784 4785 rotor = mg; 4786 top: 4787 do { 4788 boolean_t allocatable; 4789 4790 ASSERT(mg->mg_activation_count == 1); 4791 vd = mg->mg_vd; 4792 4793 /* 4794 * Don't allocate from faulted devices. 4795 */ 4796 if (try_hard) { 4797 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 4798 allocatable = vdev_allocatable(vd); 4799 spa_config_exit(spa, SCL_ZIO, FTAG); 4800 } else { 4801 allocatable = vdev_allocatable(vd); 4802 } 4803 4804 /* 4805 * Determine if the selected metaslab group is eligible 4806 * for allocations. 
If we're ganging then don't allow 4807 * this metaslab group to skip allocations since that would 4808 * inadvertently return ENOSPC and suspend the pool 4809 * even though space is still available. 4810 */ 4811 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 4812 allocatable = metaslab_group_allocatable(mg, rotor, 4813 psize, allocator, d); 4814 } 4815 4816 if (!allocatable) { 4817 metaslab_trace_add(zal, mg, NULL, psize, d, 4818 TRACE_NOT_ALLOCATABLE, allocator); 4819 goto next; 4820 } 4821 4822 ASSERT(mg->mg_initialized); 4823 4824 /* 4825 * Avoid writing single-copy data to a failing, 4826 * non-redundant vdev, unless we've already tried all 4827 * other vdevs. 4828 */ 4829 if ((vd->vdev_stat.vs_write_errors > 0 || 4830 vd->vdev_state < VDEV_STATE_HEALTHY) && 4831 d == 0 && !try_hard && vd->vdev_children == 0) { 4832 metaslab_trace_add(zal, mg, NULL, psize, d, 4833 TRACE_VDEV_ERROR, allocator); 4834 goto next; 4835 } 4836 4837 ASSERT(mg->mg_class == mc); 4838 4839 uint64_t asize = vdev_psize_to_asize(vd, psize); 4840 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 4841 4842 /* 4843 * If we don't need to try hard, then require that the 4844 * block be on an different metaslab from any other DVAs 4845 * in this BP (unique=true). If we are trying hard, then 4846 * allow any metaslab to be used (unique=false). 4847 */ 4848 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 4849 !try_hard, dva, d, allocator, try_hard); 4850 4851 if (offset != -1ULL) { 4852 /* 4853 * If we've just selected this metaslab group, 4854 * figure out whether the corresponding vdev is 4855 * over- or under-used relative to the pool, 4856 * and set an allocation bias to even it out. 4857 */ 4858 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 4859 vdev_stat_t *vs = &vd->vdev_stat; 4860 int64_t vu, cu; 4861 4862 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 4863 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 4864 4865 /* 4866 * Calculate how much more or less we should 4867 * try to allocate from this device during 4868 * this iteration around the rotor. 4869 * For example, if a device is 80% full 4870 * and the pool is 20% full then we should 4871 * reduce allocations by 60% on this device. 4872 * 4873 * mg_bias = (20 - 80) * 512K / 100 = -307K 4874 * 4875 * This reduces allocations by 307K for this 4876 * iteration. 4877 */ 4878 mg->mg_bias = ((cu - vu) * 4879 (int64_t)mg->mg_aliquot) / 100; 4880 } else if (!metaslab_bias_enabled) { 4881 mg->mg_bias = 0; 4882 } 4883 4884 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 4885 mg->mg_aliquot + mg->mg_bias) { 4886 mc->mc_rotor = mg->mg_next; 4887 mc->mc_aliquot = 0; 4888 } 4889 4890 DVA_SET_VDEV(&dva[d], vd->vdev_id); 4891 DVA_SET_OFFSET(&dva[d], offset); 4892 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 4893 DVA_SET_ASIZE(&dva[d], asize); 4894 4895 return (0); 4896 } 4897 next: 4898 mc->mc_rotor = mg->mg_next; 4899 mc->mc_aliquot = 0; 4900 } while ((mg = mg->mg_next) != rotor); 4901 4902 /* 4903 * If we haven't tried hard, do so now. 
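 *
 * Trying hard relaxes the earlier checks: faulted vdevs are
 * re-examined under SCL_ZIO, metaslab groups are no longer skipped
 * by metaslab_group_allocatable(), the write-error avoidance for
 * single-copy data is bypassed, and DVAs need not land on unique
 * metaslabs.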
4904 */ 4905 if (!try_hard) { 4906 try_hard = B_TRUE; 4907 goto top; 4908 } 4909 4910 bzero(&dva[d], sizeof (dva_t)); 4911 4912 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 4913 return (SET_ERROR(ENOSPC)); 4914 } 4915 4916 void 4917 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 4918 boolean_t checkpoint) 4919 { 4920 metaslab_t *msp; 4921 spa_t *spa = vd->vdev_spa; 4922 4923 ASSERT(vdev_is_concrete(vd)); 4924 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4925 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 4926 4927 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4928 4929 VERIFY(!msp->ms_condensing); 4930 VERIFY3U(offset, >=, msp->ms_start); 4931 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 4932 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4933 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 4934 4935 metaslab_check_free_impl(vd, offset, asize); 4936 4937 mutex_enter(&msp->ms_lock); 4938 if (range_tree_is_empty(msp->ms_freeing) && 4939 range_tree_is_empty(msp->ms_checkpointing)) { 4940 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 4941 } 4942 4943 if (checkpoint) { 4944 ASSERT(spa_has_checkpoint(spa)); 4945 range_tree_add(msp->ms_checkpointing, offset, asize); 4946 } else { 4947 range_tree_add(msp->ms_freeing, offset, asize); 4948 } 4949 mutex_exit(&msp->ms_lock); 4950 } 4951 4952 /* ARGSUSED */ 4953 void 4954 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4955 uint64_t size, void *arg) 4956 { 4957 boolean_t *checkpoint = arg; 4958 4959 ASSERT3P(checkpoint, !=, NULL); 4960 4961 if (vd->vdev_ops->vdev_op_remap != NULL) 4962 vdev_indirect_mark_obsolete(vd, offset, size); 4963 else 4964 metaslab_free_impl(vd, offset, size, *checkpoint); 4965 } 4966 4967 static void 4968 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 4969 boolean_t checkpoint) 4970 { 4971 spa_t *spa = vd->vdev_spa; 4972 4973 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4974 4975 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 4976 return; 4977 4978 if (spa->spa_vdev_removal != NULL && 4979 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 4980 vdev_is_concrete(vd)) { 4981 /* 4982 * Note: we check if the vdev is concrete because when 4983 * we complete the removal, we first change the vdev to be 4984 * an indirect vdev (in open context), and then (in syncing 4985 * context) clear spa_vdev_removal. 4986 */ 4987 free_from_removing_vdev(vd, offset, size); 4988 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 4989 vdev_indirect_mark_obsolete(vd, offset, size); 4990 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4991 metaslab_free_impl_cb, &checkpoint); 4992 } else { 4993 metaslab_free_concrete(vd, offset, size, checkpoint); 4994 } 4995 } 4996 4997 typedef struct remap_blkptr_cb_arg { 4998 blkptr_t *rbca_bp; 4999 spa_remap_cb_t rbca_cb; 5000 vdev_t *rbca_remap_vd; 5001 uint64_t rbca_remap_offset; 5002 void *rbca_cb_arg; 5003 } remap_blkptr_cb_arg_t; 5004 5005 void 5006 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5007 uint64_t size, void *arg) 5008 { 5009 remap_blkptr_cb_arg_t *rbca = arg; 5010 blkptr_t *bp = rbca->rbca_bp; 5011 5012 /* We can not remap split blocks. 
void
remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	remap_blkptr_cb_arg_t *rbca = arg;
	blkptr_t *bp = rbca->rbca_bp;

	/* We can not remap split blocks. */
	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
		return;
	ASSERT0(inner_offset);

	if (rbca->rbca_cb != NULL) {
		/*
		 * At this point we know that we are not handling split
		 * blocks and we invoke the callback on the previous
		 * vdev which must be indirect.
		 */
		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);

		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);

		/* set up remap_blkptr_cb_arg for the next call */
		rbca->rbca_remap_vd = vd;
		rbca->rbca_remap_offset = offset;
	}

	/*
	 * The phys birth time is that of dva[0].  This ensures that we know
	 * when each dva was written, so that resilver can determine which
	 * blocks need to be scrubbed (i.e. those written during the time
	 * the vdev was offline).  It also ensures that the key used in
	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
	 * we didn't change the phys_birth, a lookup in the ARC for a
	 * remapped BP could find the data that was previously stored at
	 * this vdev + offset.
	 */
	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	    DVA_GET_VDEV(&bp->blk_dva[0]));
	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
}
/*
 * If the block pointer contains any indirect DVAs, modify them to refer to
 * concrete DVAs.  Note that this will sometimes not be possible, leaving
 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
 * segments in the mapping (i.e. it is a "split block").
 *
 * If the BP was remapped, calls the callback on the original dva (note the
 * callback can be called multiple times if the original indirect DVA refers
 * to another indirect DVA, etc).
 *
 * Returns TRUE if the BP was remapped.
 */
boolean_t
spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
{
	remap_blkptr_cb_arg_t rbca;

	if (!zfs_remap_blkptr_enable)
		return (B_FALSE);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
		return (B_FALSE);

	/*
	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
	 */
	if (BP_GET_DEDUP(bp))
		return (B_FALSE);

	/*
	 * Gang blocks can not be remapped, because
	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
	 * the BP used to read the gang block header (GBH) being the same
	 * as the DVA[0] that we allocated for the GBH.
	 */
	if (BP_IS_GANG(bp))
		return (B_FALSE);

	/*
	 * Embedded BP's have no DVA to remap.
	 */
	if (BP_GET_NDVAS(bp) < 1)
		return (B_FALSE);

	/*
	 * Note: we only remap dva[0].  If we remapped other dvas, we
	 * would no longer know what their phys birth txg is.
	 */
	dva_t *dva = &bp->blk_dva[0];

	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));

	if (vd->vdev_ops->vdev_op_remap == NULL)
		return (B_FALSE);

	rbca.rbca_bp = bp;
	rbca.rbca_cb = callback;
	rbca.rbca_remap_vd = vd;
	rbca.rbca_remap_offset = offset;
	rbca.rbca_cb_arg = arg;

	/*
	 * remap_blkptr_cb() will be called in order for each level of
	 * indirection, until a concrete vdev is reached or a split block is
	 * encountered.  rbca_remap_vd and rbca_remap_offset are updated
	 * within the callback as we go from one indirect vdev to the next
	 * (either concrete or indirect again) in that order.
	 */
	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);

	/* Check if the DVA wasn't remapped because it is a split block */
	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
		return (B_FALSE);

	return (B_TRUE);
}
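/*
 * Illustrative sketch: how a caller might drive spa_remap_blkptr().  The
 * callback below simply counts how many (indirect vdev, offset) pairs are
 * reported while dva[0] is walked down to a concrete vdev.  The function
 * and callback names are hypothetical, and the callback signature is
 * assumed to match the spa_remap_cb_t typedef used above.
 */
static void
example_count_remap_cb(uint64_t vdev, uint64_t offset, uint64_t size,
    void *arg)
{
	uint64_t *levels = arg;

	/* One invocation per level of indirection that was traversed. */
	(*levels)++;
}

static boolean_t
example_remap_and_count(spa_t *spa, blkptr_t *bp, uint64_t *levels)
{
	*levels = 0;

	/*
	 * Returns B_FALSE (and leaves the BP untouched) for dedup, gang,
	 * embedded, and split-block BPs, or when the feature is disabled.
	 */
	return (spa_remap_blkptr(spa, bp, example_count_remap_cb, levels));
}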
/*
 * Undo the allocation of a DVA which happened in the given transaction group.
 */
void
metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	vdev_t *vd;
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	ASSERT(!vd->vdev_removing);
	ASSERT(vdev_is_concrete(vd));
	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
	    offset, size);
	msp->ms_allocating_total -= size;

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
	    msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	range_tree_add(msp->ms_allocatable, offset, size);
	mutex_exit(&msp->ms_lock);
}

/*
 * Free the block represented by the given DVA.
 */
void
metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, vdev);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	if (DVA_GET_GANG(dva)) {
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
	}

	metaslab_free_impl(vd, offset, size, checkpoint);
}

/*
 * Reserve some allocation slots.  The reservation system must be called
 * before we call into the allocator.  If there aren't any available slots
 * then the I/O will be throttled until an I/O completes and its slots are
 * freed up.  The function returns true if it was successful in placing
 * the reservation.
 */
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
    zio_t *zio, int flags)
{
	uint64_t available_slots = 0;
	boolean_t slot_reserved = B_FALSE;
	uint64_t max = mc->mc_alloc_max_slots[allocator];

	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);

	uint64_t reserved_slots =
	    zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
	if (reserved_slots < max)
		available_slots = max - reserved_slots;

	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
	    flags & METASLAB_MUST_RESERVE) {
		/*
		 * We reserve the slots individually so that we can unreserve
		 * them individually when an I/O completes.
		 */
		for (int d = 0; d < slots; d++) {
			reserved_slots =
			    zfs_refcount_add(&mc->mc_alloc_slots[allocator],
			    zio);
		}
		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
		slot_reserved = B_TRUE;
	}

	mutex_exit(&mc->mc_lock);
	return (slot_reserved);
}

void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
    int allocator, zio_t *zio)
{
	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);
	for (int d = 0; d < slots; d++) {
		(void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
		    zio);
	}
	mutex_exit(&mc->mc_lock);
}
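/*
 * Illustrative sketch of the reservation protocol described above: a writer
 * reserves one slot per DVA before calling into the allocator and releases
 * those slots when its I/O completes.  The wrapper names are hypothetical;
 * flags would normally carry METASLAB_MUST_RESERVE or gang-related bits when
 * the caller cannot tolerate being throttled.
 */
static boolean_t
example_reserve_for_write(metaslab_class_t *mc, zio_t *zio, int ndvas,
    int allocator, int flags)
{
	/* Returns B_FALSE when the class is saturated and must throttle. */
	return (metaslab_class_throttle_reserve(mc, ndvas, allocator,
	    zio, flags));
}

static void
example_write_done(metaslab_class_t *mc, zio_t *zio, int ndvas, int allocator)
{
	/* Give the slots back so throttled writers can make progress. */
	metaslab_class_throttle_unreserve(mc, ndvas, allocator, zio);
}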
static int
metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
		return (ENXIO);

	ASSERT3P(vd->vdev_ms, !=, NULL);
	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
	/*
	 * No need to fail in that case; someone else has activated the
	 * metaslab, but that doesn't preclude us from using it.
	 */
	if (error == EBUSY)
		error = 0;

	if (error == 0 &&
	    !range_tree_contains(msp->ms_allocatable, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	VERIFY(!msp->ms_condensing);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
	    msp->ms_size);
	range_tree_remove(msp->ms_allocatable, offset, size);
	range_tree_clear(msp->ms_trim, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		metaslab_class_t *mc = msp->ms_group->mg_class;
		multilist_sublist_t *mls =
		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
		if (!multilist_link_active(&msp->ms_class_txg_node)) {
			msp->ms_selected_txg = txg;
			multilist_sublist_insert_head(mls, msp);
		}
		multilist_sublist_unlock(mls);

		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
		    offset, size);
		msp->ms_allocating_total += size;
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

typedef struct metaslab_claim_cb_arg_t {
	uint64_t mcca_txg;
	int mcca_error;
} metaslab_claim_cb_arg_t;

/* ARGSUSED */
static void
metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	metaslab_claim_cb_arg_t *mcca_arg = arg;

	if (mcca_arg->mcca_error == 0) {
		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
		    size, mcca_arg->mcca_txg);
	}
}

int
metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
{
	if (vd->vdev_ops->vdev_op_remap != NULL) {
		metaslab_claim_cb_arg_t arg;

		/*
		 * Only zdb(1M) can claim on indirect vdevs.  This is used
		 * to detect leaks of mapped space (that are not accounted
		 * for in the obsolete counts, spacemap, or bpobj).
		 */
		ASSERT(!spa_writeable(vd->vdev_spa));
		arg.mcca_error = 0;
		arg.mcca_txg = txg;

		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_claim_impl_cb, &arg);

		if (arg.mcca_error == 0) {
			arg.mcca_error = metaslab_claim_concrete(vd,
			    offset, size, txg);
		}
		return (arg.mcca_error);
	} else {
		return (metaslab_claim_concrete(vd, offset, size, txg));
	}
}
/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(DVA_IS_VALID(dva));

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	return (metaslab_claim_impl(vd, offset, size, txg));
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
    zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
	ASSERT3P(zal, !=, NULL);

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags, zal, allocator);
		if (error != 0) {
			for (d--; d >= 0; d--) {
				metaslab_unalloc_dva(spa, &dva[d], txg);
				metaslab_group_alloc_decrement(spa,
				    DVA_GET_VDEV(&dva[d]), zio, flags,
				    allocator, B_FALSE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		} else {
			/*
			 * Update the metaslab group's queue depth
			 * based on the newly allocated dva.
			 */
			metaslab_group_alloc_increment(spa,
			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
		}

	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}
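/*
 * Illustrative sketch of the allocate/undo pairing implemented above: a
 * caller fills in a zeroed block pointer with metaslab_alloc() and, if it
 * later decides not to issue the write in the same (still-open) txg, returns
 * the space with metaslab_free(..., now = B_TRUE), which routes each DVA
 * through metaslab_unalloc_dva().  The wrapper name is hypothetical, and the
 * zal, zio, and allocator arguments are assumed to have been set up by the
 * caller; a real caller would also adjust the allocation-throttle accounting
 * on the abort path.
 */
static int
example_alloc_then_abort(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    blkptr_t *bp, int ndvas, uint64_t txg, zio_alloc_list_t *zal,
    zio_t *zio, int allocator)
{
	int error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg,
	    NULL, 0, zal, zio, allocator);

	if (error != 0)
		return (error);	/* e.g. ENOSPC: nothing to undo */

	/*
	 * Changed our mind in the same txg: give the DVAs back to their
	 * metaslabs' ms_allocating trees.
	 */
	metaslab_free(spa, bp, txg, B_TRUE);
	return (0);
}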
void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	/*
	 * If we have a checkpoint for the pool we need to make sure that
	 * the blocks that we free that are part of the checkpoint won't be
	 * reused until the checkpoint is discarded or we revert to it.
	 *
	 * The checkpoint flag is passed down the metaslab_free code path
	 * and is set whenever we want to add a block to the checkpoint's
	 * accounting.  That is, we "checkpoint" blocks that existed at the
	 * time the checkpoint was created and are therefore referenced by
	 * the checkpointed uberblock.
	 *
	 * Note that we don't checkpoint any blocks if the current
	 * syncing txg <= spa_checkpoint_txg.  We want these frees to sync
	 * normally as they will be referenced by the checkpointed uberblock.
	 */
	boolean_t checkpoint = B_FALSE;
	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
		/*
		 * At this point, if the block is part of the checkpoint
		 * there is no way it was created in the current txg.
		 */
		ASSERT(!now);
		ASSERT3U(spa_syncing_txg(spa), ==, txg);
		checkpoint = B_TRUE;
	}

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++) {
		if (now) {
			metaslab_unalloc_dva(spa, &dva[d], txg);
		} else {
			ASSERT3U(txg, ==, spa_syncing_txg(spa));
			metaslab_free_dva(spa, &dva[d], checkpoint);
		}
	}

	spa_config_exit(spa, SCL_FREE, FTAG);
}
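/*
 * Illustrative sketch of the checkpoint decision made in metaslab_free():
 * a freed block is charged to the checkpoint only if it was born at or
 * before the checkpointed txg and the pool has already synced past that
 * txg.  The helper name is hypothetical.
 */
static boolean_t
example_free_goes_to_checkpoint(spa_t *spa, const blkptr_t *bp)
{
	/*
	 * Blocks born after the checkpoint are not referenced by the
	 * checkpointed uberblock, and frees syncing at or before
	 * spa_checkpoint_txg are already covered by it.
	 */
	return (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg);
}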
int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_claim_dva(spa, &dva[d], txg);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}

/* ARGSUSED */
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	metaslab_check_free_impl(vd, offset, size);
}

static void
metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	if (vd->vdev_ops->vdev_op_remap != NULL) {
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_check_free_impl_cb, NULL);
		return;
	}

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	if (msp->ms_loaded) {
		range_tree_verify_not_present(msp->ms_allocatable,
		    offset, size);
	}

	/*
	 * Check all segments that currently exist in the freeing pipeline.
	 *
	 * It would intuitively make sense to also check the current allocating
	 * tree since metaslab_unalloc_dva() exists for extents that are
	 * allocated and freed in the same sync pass within the same txg.
	 * Unfortunately there are places (e.g. the ZIL) where we allocate a
	 * segment but then we free part of it within the same txg
	 * [see zil_sync()].  Thus, we don't call range_tree_verify() in the
	 * current allocating tree.
	 */
	range_tree_verify_not_present(msp->ms_freeing, offset, size);
	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
	range_tree_verify_not_present(msp->ms_freed, offset, size);
	for (int j = 0; j < TXG_DEFER_SIZE; j++)
		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
	range_tree_verify_not_present(msp->ms_trim, offset, size);
	mutex_exit(&msp->ms_lock);
}

void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);

		if (DVA_GET_GANG(&bp->blk_dva[i]))
			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

		ASSERT3P(vd, !=, NULL);

		metaslab_check_free_impl(vd, offset, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
}

static void
metaslab_group_disable_wait(metaslab_group_t *mg)
{
	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
	while (mg->mg_disabled_updating) {
		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
	}
}

static void
metaslab_group_disabled_increment(metaslab_group_t *mg)
{
	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
	ASSERT(mg->mg_disabled_updating);

	while (mg->mg_ms_disabled >= max_disabled_ms) {
		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
	}
	mg->mg_ms_disabled++;
	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
}

/*
 * Mark the metaslab as disabled to prevent any allocations on this metaslab.
 * We must also track how many metaslabs are currently disabled within a
 * metaslab group and limit them to prevent allocation failures from
 * occurring because all metaslabs are disabled.
 */
void
metaslab_disable(metaslab_t *msp)
{
	ASSERT(!MUTEX_HELD(&msp->ms_lock));
	metaslab_group_t *mg = msp->ms_group;

	mutex_enter(&mg->mg_ms_disabled_lock);

	/*
	 * To keep an accurate count of how many threads have disabled
	 * a specific metaslab group, we only allow one thread to mark
	 * the metaslab group at a time.  This ensures that the value of
	 * ms_disabled will be accurate when we decide to mark a metaslab
	 * group as disabled.  To do this we force all other threads
	 * to wait until the metaslab's mg_disabled_updating flag is no
	 * longer set.
	 */
	metaslab_group_disable_wait(mg);
	mg->mg_disabled_updating = B_TRUE;
	if (msp->ms_disabled == 0) {
		metaslab_group_disabled_increment(mg);
	}
	mutex_enter(&msp->ms_lock);
	msp->ms_disabled++;
	mutex_exit(&msp->ms_lock);

	mg->mg_disabled_updating = B_FALSE;
	cv_broadcast(&mg->mg_ms_disabled_cv);
	mutex_exit(&mg->mg_ms_disabled_lock);
}
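/*
 * Illustrative sketch of the disable/enable protocol described above, as a
 * TRIM- or initialize-style consumer would use it: disable the metaslab so
 * the allocator stays away from the region being manipulated, do the work on
 * unallocated space, then re-enable it.  The wrapper and the do_work
 * callback are hypothetical.
 */
static void
example_with_metaslab_disabled(metaslab_t *msp, void (*do_work)(void *),
    void *arg)
{
	/* Blocks if too many metaslabs in this group are already disabled. */
	metaslab_disable(msp);

	do_work(arg);

	/*
	 * sync=B_TRUE waits for the outstanding I/O to be synced so newly
	 * allocated blocks are not overwritten; unload=B_FALSE keeps the
	 * metaslab's range trees in memory.
	 */
	metaslab_enable(msp, B_TRUE, B_FALSE);
}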
void
metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
{
	metaslab_group_t *mg = msp->ms_group;
	spa_t *spa = mg->mg_vd->vdev_spa;

	/*
	 * Wait for the outstanding IO to be synced to prevent newly
	 * allocated blocks from being overwritten.  This is used by
	 * initialize and TRIM which are modifying unallocated space.
	 */
	if (sync)
		txg_wait_synced(spa_get_dsl(spa), 0);

	mutex_enter(&mg->mg_ms_disabled_lock);
	mutex_enter(&msp->ms_lock);
	if (--msp->ms_disabled == 0) {
		mg->mg_ms_disabled--;
		cv_broadcast(&mg->mg_ms_disabled_cv);
		if (unload)
			metaslab_unload(msp);
	}
	mutex_exit(&msp->ms_lock);
	mutex_exit(&mg->mg_ms_disabled_lock);
}

static void
metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
{
	vdev_t *vd = ms->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);

	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));

	metaslab_unflushed_phys_t entry = {
		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
	};
	uint64_t entry_size = sizeof (entry);
	uint64_t entry_offset = ms->ms_id * entry_size;

	uint64_t object = 0;
	int err = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &object);
	if (err == ENOENT) {
		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
		VERIFY0(zap_add(mos, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
		    &object, tx));
	} else {
		VERIFY0(err);
	}

	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
	    &entry, tx);
}

void
metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = ms->ms_group->mg_vd->vdev_spa;

	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	ms->ms_unflushed_txg = txg;
	metaslab_update_ondisk_flush_data(ms, tx);
}

uint64_t
metaslab_unflushed_txg(metaslab_t *ms)
{
	return (ms->ms_unflushed_txg);
}
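/*
 * Illustrative sketch of the on-disk layout written above: the per-vdev
 * object referenced by the VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS ZAP entry is
 * a flat array of metaslab_unflushed_phys_t records indexed by metaslab id,
 * so reading a metaslab's record back is a fixed-offset dmu_read().  The
 * helper name is hypothetical, and DMU_READ_PREFETCH is assumed to be an
 * acceptable flag for this MOS read.
 */
static int
example_read_unflushed_phys(vdev_t *vd, uint64_t ms_id,
    metaslab_unflushed_phys_t *entry)
{
	objset_t *mos = spa_meta_objset(vd->vdev_spa);
	uint64_t object = 0;

	int err = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &object);
	if (err != 0)
		return (err);

	/* Record for metaslab N lives at byte offset N * sizeof (*entry). */
	return (dmu_read(mos, object, ms_id * sizeof (*entry),
	    sizeof (*entry), entry, DMU_READ_PREFETCH));
}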