1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright (c) 2017, Intel Corporation. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/space_map.h> 33 #include <sys/metaslab_impl.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/zio.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zfeature.h> 38 #include <sys/vdev_indirect_mapping.h> 39 #include <sys/zap.h> 40 41 #define GANG_ALLOCATION(flags) \ 42 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 43 44 uint64_t metaslab_aliquot = 512ULL << 10; 45 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 46 47 /* 48 * In pools where the log space map feature is not enabled we touch 49 * multiple metaslabs (and their respective space maps) with each 50 * transaction group. Thus, we benefit from having a small space map 51 * block size since it allows us to issue more I/O operations scattered 52 * around the disk. So a sane default for the space map block size 53 * is 8~16K. 54 */ 55 int zfs_metaslab_sm_blksz_no_log = (1 << 14); 56 57 /* 58 * When the log space map feature is enabled, we accumulate a lot of 59 * changes per metaslab that are flushed once in a while so we benefit 60 * from a bigger block size like 128K for the metaslab space maps. 61 */ 62 int zfs_metaslab_sm_blksz_with_log = (1 << 17); 63 64 /* 65 * The in-core space map representation is more compact than its on-disk form. 66 * The zfs_condense_pct determines how much more compact the in-core 67 * space map representation must be before we compact it on-disk. 68 * Values should be greater than or equal to 100. 69 */ 70 int zfs_condense_pct = 200; 71 72 /* 73 * Condensing a metaslab is not guaranteed to actually reduce the amount of 74 * space used on disk. In particular, a space map uses data in increments of 75 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 76 * same number of blocks after condensing. Since the goal of condensing is to 77 * reduce the number of IOPs required to read the space map, we only want to 78 * condense when we can be sure we will reduce the number of blocks used by the 79 * space map. Unfortunately, we cannot precisely compute whether or not this is 80 * the case in metaslab_should_condense since we are holding ms_lock. 
Instead,
 * we apply the following heuristic: do not condense a space map unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or
 * equal to zfs_mg_fragmentation_threshold. If a metaslab group
 * exceeds this threshold then it will be skipped unless all metaslab
 * groups within the metaslab class have also crossed this threshold.
 *
 * This tunable was introduced to avoid edge cases where we continue
 * allocating from very fragmented disks in our pool while other, less
 * fragmented disks exist. On the other hand, if all disks in the
 * pool are uniformly approaching the threshold, the threshold can
 * become a speed bump in performance, where we keep switching the disks
 * that we allocate from (e.g. we allocate some segments from disk A,
 * pushing it past the threshold, while frees on disk B bring its
 * fragmentation back below the threshold).
 *
 * Empirically, we've seen that our vdev selection for allocations is
 * good enough that fragmentation increases uniformly across all vdevs
 * the majority of the time. Thus we set the threshold percentage high
 * enough to avoid hitting the speed bump on pools that are being pushed
 * to the edge.
 */
int zfs_mg_fragmentation_threshold = 95;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set, load all metaslabs when the pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set, prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
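 * As a rough illustration (sizes hypothetical): with the default of 4%, a
 * 16 GB metaslab keeps allocating in first-fit mode until less than about
 * 650 MB of it remains free.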
155 * Once the space map's free space drops below this level we dynamically 156 * switch to using best-fit allocations. 157 */ 158 int metaslab_df_free_pct = 4; 159 160 /* 161 * Maximum distance to search forward from the last offset. Without this 162 * limit, fragmented pools can see >100,000 iterations and 163 * metaslab_block_picker() becomes the performance limiting factor on 164 * high-performance storage. 165 * 166 * With the default setting of 16MB, we typically see less than 500 167 * iterations, even with very fragmented, ashift=9 pools. The maximum number 168 * of iterations possible is: 169 * metaslab_df_max_search / (2 * (1<<ashift)) 170 * With the default setting of 16MB this is 16*1024 (with ashift=9) or 171 * 2048 (with ashift=12). 172 */ 173 int metaslab_df_max_search = 16 * 1024 * 1024; 174 175 /* 176 * If we are not searching forward (due to metaslab_df_max_search, 177 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable 178 * controls what segment is used. If it is set, we will use the largest free 179 * segment. If it is not set, we will use a segment of exactly the requested 180 * size (or larger). 181 */ 182 int metaslab_df_use_largest_segment = B_FALSE; 183 184 /* 185 * A metaslab is considered "free" if it contains a contiguous 186 * segment which is greater than metaslab_min_alloc_size. 187 */ 188 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 189 190 /* 191 * Percentage of all cpus that can be used by the metaslab taskq. 192 */ 193 int metaslab_load_pct = 50; 194 195 /* 196 * Determines how many txgs a metaslab may remain loaded without having any 197 * allocations from it. As long as a metaslab continues to be used we will 198 * keep it loaded. 199 */ 200 int metaslab_unload_delay = TXG_SIZE * 2; 201 202 /* 203 * Max number of metaslabs per group to preload. 204 */ 205 int metaslab_preload_limit = SPA_DVAS_PER_BP; 206 207 /* 208 * Enable/disable preloading of metaslab. 209 */ 210 boolean_t metaslab_preload_enabled = B_TRUE; 211 212 /* 213 * Enable/disable fragmentation weighting on metaslabs. 214 */ 215 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 216 217 /* 218 * Enable/disable lba weighting (i.e. outer tracks are given preference). 219 */ 220 boolean_t metaslab_lba_weighting_enabled = B_TRUE; 221 222 /* 223 * Enable/disable metaslab group biasing. 224 */ 225 boolean_t metaslab_bias_enabled = B_TRUE; 226 227 /* 228 * Enable/disable remapping of indirect DVAs to their concrete vdevs. 229 */ 230 boolean_t zfs_remap_blkptr_enable = B_TRUE; 231 232 /* 233 * Enable/disable segment-based metaslab selection. 234 */ 235 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; 236 237 /* 238 * When using segment-based metaslab selection, we will continue 239 * allocating from the active metaslab until we have exhausted 240 * zfs_metaslab_switch_threshold of its buckets. 241 */ 242 int zfs_metaslab_switch_threshold = 2; 243 244 /* 245 * Internal switch to enable/disable the metaslab allocation tracing 246 * facility. 247 */ 248 boolean_t metaslab_trace_enabled = B_TRUE; 249 250 /* 251 * Maximum entries that the metaslab allocation tracing facility will keep 252 * in a given list when running in non-debug mode. We limit the number 253 * of entries in non-debug mode to prevent us from using up too much memory. 254 * The limit should be sufficiently large that we don't expect any allocation 255 * to every exceed this value. In debug mode, the system will panic if this 256 * limit is ever reached allowing for further investigation. 
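 * As a purely illustrative sense of scale: even an allocation that has to
 * fall back across every metaslab group of a pool with a dozen top-level
 * vdevs leaves on the order of tens of trace entries, far below the
 * default limit of 5000.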
257 */ 258 uint64_t metaslab_trace_max_entries = 5000; 259 260 /* 261 * Maximum number of metaslabs per group that can be disabled 262 * simultaneously. 263 */ 264 int max_disabled_ms = 3; 265 266 static uint64_t metaslab_weight(metaslab_t *); 267 static void metaslab_set_fragmentation(metaslab_t *); 268 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 269 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 270 static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 271 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); 272 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); 273 274 kmem_cache_t *metaslab_alloc_trace_cache; 275 276 /* 277 * ========================================================================== 278 * Metaslab classes 279 * ========================================================================== 280 */ 281 metaslab_class_t * 282 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 283 { 284 metaslab_class_t *mc; 285 286 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 287 288 mc->mc_spa = spa; 289 mc->mc_rotor = NULL; 290 mc->mc_ops = ops; 291 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 292 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * 293 sizeof (zfs_refcount_t), KM_SLEEP); 294 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * 295 sizeof (uint64_t), KM_SLEEP); 296 for (int i = 0; i < spa->spa_alloc_count; i++) 297 zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); 298 299 return (mc); 300 } 301 302 void 303 metaslab_class_destroy(metaslab_class_t *mc) 304 { 305 ASSERT(mc->mc_rotor == NULL); 306 ASSERT(mc->mc_alloc == 0); 307 ASSERT(mc->mc_deferred == 0); 308 ASSERT(mc->mc_space == 0); 309 ASSERT(mc->mc_dspace == 0); 310 311 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) 312 zfs_refcount_destroy(&mc->mc_alloc_slots[i]); 313 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * 314 sizeof (zfs_refcount_t)); 315 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * 316 sizeof (uint64_t)); 317 mutex_destroy(&mc->mc_lock); 318 kmem_free(mc, sizeof (metaslab_class_t)); 319 } 320 321 int 322 metaslab_class_validate(metaslab_class_t *mc) 323 { 324 metaslab_group_t *mg; 325 vdev_t *vd; 326 327 /* 328 * Must hold one of the spa_config locks. 
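	 * For illustration only, a typical caller looks roughly like this
	 * (the specific lock level is up to the caller):
	 *
	 *	spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
	 *	(void) metaslab_class_validate(mc);
	 *	spa_config_exit(spa, SCL_ALL, FTAG);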
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	spa_t *spa = mc->mc_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
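		 * Otherwise each group's contribution is weighted by its
		 * space; as a purely illustrative example, a 1 TB group at
		 * 10% fragmentation and a 3 TB group at 50% average out to
		 * (1 * 10 + 3 * 50) / 4 = 40%.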
449 */ 450 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 451 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 452 return (ZFS_FRAG_INVALID); 453 } 454 455 /* 456 * Determine how much this metaslab_group is contributing 457 * to the overall pool fragmentation metric. 458 */ 459 fragmentation += mg->mg_fragmentation * 460 metaslab_group_get_space(mg); 461 } 462 fragmentation /= metaslab_class_get_space(mc); 463 464 ASSERT3U(fragmentation, <=, 100); 465 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 466 return (fragmentation); 467 } 468 469 /* 470 * Calculate the amount of expandable space that is available in 471 * this metaslab class. If a device is expanded then its expandable 472 * space will be the amount of allocatable space that is currently not 473 * part of this metaslab class. 474 */ 475 uint64_t 476 metaslab_class_expandable_space(metaslab_class_t *mc) 477 { 478 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 479 uint64_t space = 0; 480 481 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 482 for (int c = 0; c < rvd->vdev_children; c++) { 483 uint64_t tspace; 484 vdev_t *tvd = rvd->vdev_child[c]; 485 metaslab_group_t *mg = tvd->vdev_mg; 486 487 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 488 mg->mg_class != mc) { 489 continue; 490 } 491 492 /* 493 * Calculate if we have enough space to add additional 494 * metaslabs. We report the expandable space in terms 495 * of the metaslab size since that's the unit of expansion. 496 * Adjust by efi system partition size. 497 */ 498 tspace = tvd->vdev_max_asize - tvd->vdev_asize; 499 if (tspace > mc->mc_spa->spa_bootsize) { 500 tspace -= mc->mc_spa->spa_bootsize; 501 } 502 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); 503 } 504 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 505 return (space); 506 } 507 508 static int 509 metaslab_compare(const void *x1, const void *x2) 510 { 511 const metaslab_t *m1 = (const metaslab_t *)x1; 512 const metaslab_t *m2 = (const metaslab_t *)x2; 513 514 int sort1 = 0; 515 int sort2 = 0; 516 if (m1->ms_allocator != -1 && m1->ms_primary) 517 sort1 = 1; 518 else if (m1->ms_allocator != -1 && !m1->ms_primary) 519 sort1 = 2; 520 if (m2->ms_allocator != -1 && m2->ms_primary) 521 sort2 = 1; 522 else if (m2->ms_allocator != -1 && !m2->ms_primary) 523 sort2 = 2; 524 525 /* 526 * Sort inactive metaslabs first, then primaries, then secondaries. When 527 * selecting a metaslab to allocate from, an allocator first tries its 528 * primary, then secondary active metaslab. If it doesn't have active 529 * metaslabs, or can't allocate from them, it searches for an inactive 530 * metaslab to activate. If it can't find a suitable one, it will steal 531 * a primary or secondary metaslab from another allocator. 532 */ 533 if (sort1 < sort2) 534 return (-1); 535 if (sort1 > sort2) 536 return (1); 537 538 int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); 539 if (likely(cmp)) 540 return (cmp); 541 542 IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); 543 544 return (AVL_CMP(m1->ms_start, m2->ms_start)); 545 } 546 547 /* 548 * ========================================================================== 549 * Metaslab groups 550 * ========================================================================== 551 */ 552 /* 553 * Update the allocatable flag and the metaslab group's capacity. 554 * The allocatable flag is set to true if the capacity is below 555 * the zfs_mg_noalloc_threshold or has a fragmentation value that is 556 * greater than zfs_mg_fragmentation_threshold. 
If a metaslab group 557 * transitions from allocatable to non-allocatable or vice versa then the 558 * metaslab group's class is updated to reflect the transition. 559 */ 560 static void 561 metaslab_group_alloc_update(metaslab_group_t *mg) 562 { 563 vdev_t *vd = mg->mg_vd; 564 metaslab_class_t *mc = mg->mg_class; 565 vdev_stat_t *vs = &vd->vdev_stat; 566 boolean_t was_allocatable; 567 boolean_t was_initialized; 568 569 ASSERT(vd == vd->vdev_top); 570 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, 571 SCL_ALLOC); 572 573 mutex_enter(&mg->mg_lock); 574 was_allocatable = mg->mg_allocatable; 575 was_initialized = mg->mg_initialized; 576 577 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 578 (vs->vs_space + 1); 579 580 mutex_enter(&mc->mc_lock); 581 582 /* 583 * If the metaslab group was just added then it won't 584 * have any space until we finish syncing out this txg. 585 * At that point we will consider it initialized and available 586 * for allocations. We also don't consider non-activated 587 * metaslab groups (e.g. vdevs that are in the middle of being removed) 588 * to be initialized, because they can't be used for allocation. 589 */ 590 mg->mg_initialized = metaslab_group_initialized(mg); 591 if (!was_initialized && mg->mg_initialized) { 592 mc->mc_groups++; 593 } else if (was_initialized && !mg->mg_initialized) { 594 ASSERT3U(mc->mc_groups, >, 0); 595 mc->mc_groups--; 596 } 597 if (mg->mg_initialized) 598 mg->mg_no_free_space = B_FALSE; 599 600 /* 601 * A metaslab group is considered allocatable if it has plenty 602 * of free space or is not heavily fragmented. We only take 603 * fragmentation into account if the metaslab group has a valid 604 * fragmentation metric (i.e. a value between 0 and 100). 605 */ 606 mg->mg_allocatable = (mg->mg_activation_count > 0 && 607 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 608 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 609 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 610 611 /* 612 * The mc_alloc_groups maintains a count of the number of 613 * groups in this metaslab class that are still above the 614 * zfs_mg_noalloc_threshold. This is used by the allocating 615 * threads to determine if they should avoid allocations to 616 * a given group. The allocator will avoid allocations to a group 617 * if that group has reached or is below the zfs_mg_noalloc_threshold 618 * and there are still other groups that are above the threshold. 619 * When a group transitions from allocatable to non-allocatable or 620 * vice versa we update the metaslab class to reflect that change. 621 * When the mc_alloc_groups value drops to 0 that means that all 622 * groups have reached the zfs_mg_noalloc_threshold making all groups 623 * eligible for allocations. This effectively means that all devices 624 * are balanced again. 
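	 * As a concrete (purely illustrative) walk-through: in a four-vdev
	 * pool, once groups A, B and C drop to the threshold, mc_alloc_groups
	 * is 1 and new allocations are steered toward D; when D drops as
	 * well, mc_alloc_groups reaches 0 and all four groups become
	 * eligible again.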
625 */ 626 if (was_allocatable && !mg->mg_allocatable) 627 mc->mc_alloc_groups--; 628 else if (!was_allocatable && mg->mg_allocatable) 629 mc->mc_alloc_groups++; 630 mutex_exit(&mc->mc_lock); 631 632 mutex_exit(&mg->mg_lock); 633 } 634 635 int 636 metaslab_sort_by_flushed(const void *va, const void *vb) 637 { 638 const metaslab_t *a = va; 639 const metaslab_t *b = vb; 640 641 int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); 642 if (likely(cmp)) 643 return (cmp); 644 645 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; 646 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; 647 cmp = AVL_CMP(a_vdev_id, b_vdev_id); 648 if (cmp) 649 return (cmp); 650 651 return (AVL_CMP(a->ms_id, b->ms_id)); 652 } 653 654 metaslab_group_t * 655 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) 656 { 657 metaslab_group_t *mg; 658 659 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 660 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 661 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); 662 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); 663 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 664 KM_SLEEP); 665 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 666 KM_SLEEP); 667 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 668 sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); 669 mg->mg_vd = vd; 670 mg->mg_class = mc; 671 mg->mg_activation_count = 0; 672 mg->mg_initialized = B_FALSE; 673 mg->mg_no_free_space = B_TRUE; 674 mg->mg_allocators = allocators; 675 676 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * 677 sizeof (zfs_refcount_t), KM_SLEEP); 678 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * 679 sizeof (uint64_t), KM_SLEEP); 680 for (int i = 0; i < allocators; i++) { 681 zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); 682 mg->mg_cur_max_alloc_queue_depth[i] = 0; 683 } 684 685 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 686 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 687 688 return (mg); 689 } 690 691 void 692 metaslab_group_destroy(metaslab_group_t *mg) 693 { 694 ASSERT(mg->mg_prev == NULL); 695 ASSERT(mg->mg_next == NULL); 696 /* 697 * We may have gone below zero with the activation count 698 * either because we never activated in the first place or 699 * because we're done, and possibly removing the vdev. 
700 */ 701 ASSERT(mg->mg_activation_count <= 0); 702 703 taskq_destroy(mg->mg_taskq); 704 avl_destroy(&mg->mg_metaslab_tree); 705 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); 706 kmem_free(mg->mg_secondaries, mg->mg_allocators * 707 sizeof (metaslab_t *)); 708 mutex_destroy(&mg->mg_lock); 709 mutex_destroy(&mg->mg_ms_disabled_lock); 710 cv_destroy(&mg->mg_ms_disabled_cv); 711 712 for (int i = 0; i < mg->mg_allocators; i++) { 713 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); 714 mg->mg_cur_max_alloc_queue_depth[i] = 0; 715 } 716 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * 717 sizeof (zfs_refcount_t)); 718 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * 719 sizeof (uint64_t)); 720 721 kmem_free(mg, sizeof (metaslab_group_t)); 722 } 723 724 void 725 metaslab_group_activate(metaslab_group_t *mg) 726 { 727 metaslab_class_t *mc = mg->mg_class; 728 metaslab_group_t *mgprev, *mgnext; 729 730 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 731 732 ASSERT(mc->mc_rotor != mg); 733 ASSERT(mg->mg_prev == NULL); 734 ASSERT(mg->mg_next == NULL); 735 ASSERT(mg->mg_activation_count <= 0); 736 737 if (++mg->mg_activation_count <= 0) 738 return; 739 740 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 741 metaslab_group_alloc_update(mg); 742 743 if ((mgprev = mc->mc_rotor) == NULL) { 744 mg->mg_prev = mg; 745 mg->mg_next = mg; 746 } else { 747 mgnext = mgprev->mg_next; 748 mg->mg_prev = mgprev; 749 mg->mg_next = mgnext; 750 mgprev->mg_next = mg; 751 mgnext->mg_prev = mg; 752 } 753 mc->mc_rotor = mg; 754 } 755 756 /* 757 * Passivate a metaslab group and remove it from the allocation rotor. 758 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 759 * a metaslab group. This function will momentarily drop spa_config_locks 760 * that are lower than the SCL_ALLOC lock (see comment below). 761 */ 762 void 763 metaslab_group_passivate(metaslab_group_t *mg) 764 { 765 metaslab_class_t *mc = mg->mg_class; 766 spa_t *spa = mc->mc_spa; 767 metaslab_group_t *mgprev, *mgnext; 768 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 769 770 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 771 (SCL_ALLOC | SCL_ZIO)); 772 773 if (--mg->mg_activation_count != 0) { 774 ASSERT(mc->mc_rotor != mg); 775 ASSERT(mg->mg_prev == NULL); 776 ASSERT(mg->mg_next == NULL); 777 ASSERT(mg->mg_activation_count < 0); 778 return; 779 } 780 781 /* 782 * The spa_config_lock is an array of rwlocks, ordered as 783 * follows (from highest to lowest): 784 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 785 * SCL_ZIO > SCL_FREE > SCL_VDEV 786 * (For more information about the spa_config_lock see spa_misc.c) 787 * The higher the lock, the broader its coverage. When we passivate 788 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 789 * config locks. However, the metaslab group's taskq might be trying 790 * to preload metaslabs so we must drop the SCL_ZIO lock and any 791 * lower locks to allow the I/O to complete. At a minimum, 792 * we continue to hold the SCL_ALLOC lock, which prevents any future 793 * allocations from taking place and any changes to the vdev tree. 
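	 * Note on the mask arithmetic used below: each SCL_* lock is a
	 * single bit, ordered so that broader locks have lower bit values.
	 * SCL_ZIO - 1 is therefore the set of all locks broader than
	 * SCL_ZIO, and locks & ~(SCL_ZIO - 1) selects SCL_ZIO plus the
	 * narrower SCL_FREE and SCL_VDEV locks (the ones we drop and later
	 * re-acquire), while SCL_ALLOC and above remain held.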
794 */ 795 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 796 taskq_wait(mg->mg_taskq); 797 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 798 metaslab_group_alloc_update(mg); 799 for (int i = 0; i < mg->mg_allocators; i++) { 800 metaslab_t *msp = mg->mg_primaries[i]; 801 if (msp != NULL) { 802 mutex_enter(&msp->ms_lock); 803 metaslab_passivate(msp, 804 metaslab_weight_from_range_tree(msp)); 805 mutex_exit(&msp->ms_lock); 806 } 807 msp = mg->mg_secondaries[i]; 808 if (msp != NULL) { 809 mutex_enter(&msp->ms_lock); 810 metaslab_passivate(msp, 811 metaslab_weight_from_range_tree(msp)); 812 mutex_exit(&msp->ms_lock); 813 } 814 } 815 816 mgprev = mg->mg_prev; 817 mgnext = mg->mg_next; 818 819 if (mg == mgnext) { 820 mc->mc_rotor = NULL; 821 } else { 822 mc->mc_rotor = mgnext; 823 mgprev->mg_next = mgnext; 824 mgnext->mg_prev = mgprev; 825 } 826 827 mg->mg_prev = NULL; 828 mg->mg_next = NULL; 829 } 830 831 boolean_t 832 metaslab_group_initialized(metaslab_group_t *mg) 833 { 834 vdev_t *vd = mg->mg_vd; 835 vdev_stat_t *vs = &vd->vdev_stat; 836 837 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 838 } 839 840 uint64_t 841 metaslab_group_get_space(metaslab_group_t *mg) 842 { 843 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 844 } 845 846 void 847 metaslab_group_histogram_verify(metaslab_group_t *mg) 848 { 849 uint64_t *mg_hist; 850 vdev_t *vd = mg->mg_vd; 851 uint64_t ashift = vd->vdev_ashift; 852 int i; 853 854 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 855 return; 856 857 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 858 KM_SLEEP); 859 860 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 861 SPACE_MAP_HISTOGRAM_SIZE + ashift); 862 863 for (int m = 0; m < vd->vdev_ms_count; m++) { 864 metaslab_t *msp = vd->vdev_ms[m]; 865 866 /* skip if not active or not a member */ 867 if (msp->ms_sm == NULL || msp->ms_group != mg) 868 continue; 869 870 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 871 mg_hist[i + ashift] += 872 msp->ms_sm->sm_phys->smp_histogram[i]; 873 } 874 875 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 876 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 877 878 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 879 } 880 881 static void 882 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 883 { 884 metaslab_class_t *mc = mg->mg_class; 885 uint64_t ashift = mg->mg_vd->vdev_ashift; 886 887 ASSERT(MUTEX_HELD(&msp->ms_lock)); 888 if (msp->ms_sm == NULL) 889 return; 890 891 mutex_enter(&mg->mg_lock); 892 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 893 mg->mg_histogram[i + ashift] += 894 msp->ms_sm->sm_phys->smp_histogram[i]; 895 mc->mc_histogram[i + ashift] += 896 msp->ms_sm->sm_phys->smp_histogram[i]; 897 } 898 mutex_exit(&mg->mg_lock); 899 } 900 901 void 902 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 903 { 904 metaslab_class_t *mc = mg->mg_class; 905 uint64_t ashift = mg->mg_vd->vdev_ashift; 906 907 ASSERT(MUTEX_HELD(&msp->ms_lock)); 908 if (msp->ms_sm == NULL) 909 return; 910 911 mutex_enter(&mg->mg_lock); 912 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 913 ASSERT3U(mg->mg_histogram[i + ashift], >=, 914 msp->ms_sm->sm_phys->smp_histogram[i]); 915 ASSERT3U(mc->mc_histogram[i + ashift], >=, 916 msp->ms_sm->sm_phys->smp_histogram[i]); 917 918 mg->mg_histogram[i + ashift] -= 919 msp->ms_sm->sm_phys->smp_histogram[i]; 920 mc->mc_histogram[i + ashift] -= 921 msp->ms_sm->sm_phys->smp_histogram[i]; 922 } 923 mutex_exit(&mg->mg_lock); 924 
} 925 926 static void 927 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 928 { 929 ASSERT(msp->ms_group == NULL); 930 mutex_enter(&mg->mg_lock); 931 msp->ms_group = mg; 932 msp->ms_weight = 0; 933 avl_add(&mg->mg_metaslab_tree, msp); 934 mutex_exit(&mg->mg_lock); 935 936 mutex_enter(&msp->ms_lock); 937 metaslab_group_histogram_add(mg, msp); 938 mutex_exit(&msp->ms_lock); 939 } 940 941 static void 942 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 943 { 944 mutex_enter(&msp->ms_lock); 945 metaslab_group_histogram_remove(mg, msp); 946 mutex_exit(&msp->ms_lock); 947 948 mutex_enter(&mg->mg_lock); 949 ASSERT(msp->ms_group == mg); 950 avl_remove(&mg->mg_metaslab_tree, msp); 951 msp->ms_group = NULL; 952 mutex_exit(&mg->mg_lock); 953 } 954 955 static void 956 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 957 { 958 ASSERT(MUTEX_HELD(&mg->mg_lock)); 959 ASSERT(msp->ms_group == mg); 960 avl_remove(&mg->mg_metaslab_tree, msp); 961 msp->ms_weight = weight; 962 avl_add(&mg->mg_metaslab_tree, msp); 963 964 } 965 966 static void 967 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 968 { 969 /* 970 * Although in principle the weight can be any value, in 971 * practice we do not use values in the range [1, 511]. 972 */ 973 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 974 ASSERT(MUTEX_HELD(&msp->ms_lock)); 975 976 mutex_enter(&mg->mg_lock); 977 metaslab_group_sort_impl(mg, msp, weight); 978 mutex_exit(&mg->mg_lock); 979 } 980 981 /* 982 * Calculate the fragmentation for a given metaslab group. We can use 983 * a simple average here since all metaslabs within the group must have 984 * the same size. The return value will be a value between 0 and 100 985 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 986 * group have a fragmentation metric. 987 */ 988 uint64_t 989 metaslab_group_fragmentation(metaslab_group_t *mg) 990 { 991 vdev_t *vd = mg->mg_vd; 992 uint64_t fragmentation = 0; 993 uint64_t valid_ms = 0; 994 995 for (int m = 0; m < vd->vdev_ms_count; m++) { 996 metaslab_t *msp = vd->vdev_ms[m]; 997 998 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 999 continue; 1000 if (msp->ms_group != mg) 1001 continue; 1002 1003 valid_ms++; 1004 fragmentation += msp->ms_fragmentation; 1005 } 1006 1007 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) 1008 return (ZFS_FRAG_INVALID); 1009 1010 fragmentation /= valid_ms; 1011 ASSERT3U(fragmentation, <=, 100); 1012 return (fragmentation); 1013 } 1014 1015 /* 1016 * Determine if a given metaslab group should skip allocations. A metaslab 1017 * group should avoid allocations if its free capacity is less than the 1018 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 1019 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1020 * that can still handle allocations. If the allocation throttle is enabled 1021 * then we skip allocations to devices that have reached their maximum 1022 * allocation queue depth unless the selected metaslab group is the only 1023 * eligible group remaining. 1024 */ 1025 static boolean_t 1026 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 1027 uint64_t psize, int allocator, int d) 1028 { 1029 spa_t *spa = mg->mg_vd->vdev_spa; 1030 metaslab_class_t *mc = mg->mg_class; 1031 1032 /* 1033 * We can only consider skipping this metaslab group if it's 1034 * in the normal metaslab class and there are other metaslab 1035 * groups to select from. 
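	 * ("Normal" is interpreted broadly by the check below: the special
	 * and dedup classes are treated the same way, while groups in any
	 * other class, such as the log class, are never skipped.)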
Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if ((mc != spa_normal_class(spa) &&
	    mc != spa_special_class(spa) &&
	    mc != spa_dedup_class(spa)) ||
	    mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		/*
		 * Relax allocation throttling for ditto blocks. Due to
		 * random imbalances in allocation it tends to push copies
		 * to the one vdev that looks a bit better at the moment.
		 */
		qmax = qmax * (4 + d) / 4;

		qdepth = zfs_refcount_count(
		    &mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
			qmax = qmax * (4 + d) / 4;
			qdepth = zfs_refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
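 * For example (offsets and sizes purely illustrative): two 4K segments at
 * offsets 0 and 100K tie on size and fall back to comparing rs_start, so
 * the segment at offset 0 sorts first; a 64K segment sorts after both
 * simply because it is larger.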
1134 */ 1135 static int 1136 metaslab_rangesize_compare(const void *x1, const void *x2) 1137 { 1138 const range_seg_t *r1 = x1; 1139 const range_seg_t *r2 = x2; 1140 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1141 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1142 1143 int cmp = AVL_CMP(rs_size1, rs_size2); 1144 if (likely(cmp)) 1145 return (cmp); 1146 1147 return (AVL_CMP(r1->rs_start, r2->rs_start)); 1148 } 1149 1150 /* 1151 * ========================================================================== 1152 * Common allocator routines 1153 * ========================================================================== 1154 */ 1155 1156 /* 1157 * Return the maximum contiguous segment within the metaslab. 1158 */ 1159 uint64_t 1160 metaslab_block_maxsize(metaslab_t *msp) 1161 { 1162 avl_tree_t *t = &msp->ms_allocatable_by_size; 1163 range_seg_t *rs; 1164 1165 if (t == NULL || (rs = avl_last(t)) == NULL) 1166 return (0ULL); 1167 1168 return (rs->rs_end - rs->rs_start); 1169 } 1170 1171 static range_seg_t * 1172 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) 1173 { 1174 range_seg_t *rs, rsearch; 1175 avl_index_t where; 1176 1177 rsearch.rs_start = start; 1178 rsearch.rs_end = start + size; 1179 1180 rs = avl_find(t, &rsearch, &where); 1181 if (rs == NULL) { 1182 rs = avl_nearest(t, where, AVL_AFTER); 1183 } 1184 1185 return (rs); 1186 } 1187 1188 /* 1189 * This is a helper function that can be used by the allocator to find 1190 * a suitable block to allocate. This will search the specified AVL 1191 * tree looking for a block that matches the specified criteria. 1192 */ 1193 static uint64_t 1194 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1195 uint64_t max_search) 1196 { 1197 range_seg_t *rs = metaslab_block_find(t, *cursor, size); 1198 uint64_t first_found; 1199 1200 if (rs != NULL) 1201 first_found = rs->rs_start; 1202 1203 while (rs != NULL && rs->rs_start - first_found <= max_search) { 1204 uint64_t offset = rs->rs_start; 1205 if (offset + size <= rs->rs_end) { 1206 *cursor = offset + size; 1207 return (offset); 1208 } 1209 rs = AVL_NEXT(t, rs); 1210 } 1211 1212 *cursor = 0; 1213 return (-1ULL); 1214 } 1215 1216 /* 1217 * ========================================================================== 1218 * Dynamic Fit (df) block allocator 1219 * 1220 * Search for a free chunk of at least this size, starting from the last 1221 * offset (for this alignment of block) looking for up to 1222 * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not 1223 * found within 16MB, then return a free chunk of exactly the requested size (or 1224 * larger). 1225 * 1226 * If it seems like searching from the last offset will be unproductive, skip 1227 * that and just return a free chunk of exactly the requested size (or larger). 1228 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This 1229 * mechanism is probably not very useful and may be removed in the future. 1230 * 1231 * The behavior when not searching can be changed to return the largest free 1232 * chunk, instead of a free chunk of exactly the requested size, by setting 1233 * metaslab_df_use_largest_segment. 1234 * ========================================================================== 1235 */ 1236 static uint64_t 1237 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1238 { 1239 /* 1240 * Find the largest power of 2 block size that evenly divides the 1241 * requested size. 
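	 * (For instance, a 24K request: 24K is 0x6000, so size & -size
	 * yields 0x2000, i.e. 8K, and the 8K alignment cursor is used for
	 * this search.)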
This is used to try to allocate blocks with similar 1242 * alignment from the same area of the metaslab (i.e. same cursor 1243 * bucket) but it does not guarantee that other allocations sizes 1244 * may exist in the same region. 1245 */ 1246 uint64_t align = size & -size; 1247 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1248 range_tree_t *rt = msp->ms_allocatable; 1249 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1250 uint64_t offset; 1251 1252 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1253 ASSERT3U(avl_numnodes(&rt->rt_root), ==, 1254 avl_numnodes(&msp->ms_allocatable_by_size)); 1255 1256 /* 1257 * If we're running low on space, find a segment based on size, 1258 * rather than iterating based on offset. 1259 */ 1260 if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || 1261 free_pct < metaslab_df_free_pct) { 1262 offset = -1; 1263 } else { 1264 offset = metaslab_block_picker(&rt->rt_root, 1265 cursor, size, metaslab_df_max_search); 1266 } 1267 1268 if (offset == -1) { 1269 range_seg_t *rs; 1270 if (metaslab_df_use_largest_segment) { 1271 /* use largest free segment */ 1272 rs = avl_last(&msp->ms_allocatable_by_size); 1273 } else { 1274 /* use segment of this size, or next largest */ 1275 rs = metaslab_block_find(&msp->ms_allocatable_by_size, 1276 0, size); 1277 } 1278 if (rs != NULL && rs->rs_start + size <= rs->rs_end) { 1279 offset = rs->rs_start; 1280 *cursor = offset + size; 1281 } 1282 } 1283 1284 return (offset); 1285 } 1286 1287 static metaslab_ops_t metaslab_df_ops = { 1288 metaslab_df_alloc 1289 }; 1290 1291 /* 1292 * ========================================================================== 1293 * Cursor fit block allocator - 1294 * Select the largest region in the metaslab, set the cursor to the beginning 1295 * of the range and the cursor_end to the end of the range. As allocations 1296 * are made advance the cursor. Continue allocating from the cursor until 1297 * the range is exhausted and then find a new range. 1298 * ========================================================================== 1299 */ 1300 static uint64_t 1301 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1302 { 1303 range_tree_t *rt = msp->ms_allocatable; 1304 avl_tree_t *t = &msp->ms_allocatable_by_size; 1305 uint64_t *cursor = &msp->ms_lbas[0]; 1306 uint64_t *cursor_end = &msp->ms_lbas[1]; 1307 uint64_t offset = 0; 1308 1309 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1310 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1311 1312 ASSERT3U(*cursor_end, >=, *cursor); 1313 1314 if ((*cursor + size) > *cursor_end) { 1315 range_seg_t *rs; 1316 1317 rs = avl_last(&msp->ms_allocatable_by_size); 1318 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1319 return (-1ULL); 1320 1321 *cursor = rs->rs_start; 1322 *cursor_end = rs->rs_end; 1323 } 1324 1325 offset = *cursor; 1326 *cursor += size; 1327 1328 return (offset); 1329 } 1330 1331 static metaslab_ops_t metaslab_cf_ops = { 1332 metaslab_cf_alloc 1333 }; 1334 1335 /* 1336 * ========================================================================== 1337 * New dynamic fit allocator - 1338 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1339 * contiguous blocks. If no region is found then just use the largest segment 1340 * that remains. 1341 * ========================================================================== 1342 */ 1343 1344 /* 1345 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1346 * to request from the allocator. 
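 * With the default value of 4 that amounts to holding out for a region
 * with room for roughly sixteen back-to-back blocks of the requested size
 * before settling for the largest remaining segment.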
1347 */ 1348 uint64_t metaslab_ndf_clump_shift = 4; 1349 1350 static uint64_t 1351 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1352 { 1353 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1354 avl_index_t where; 1355 range_seg_t *rs, rsearch; 1356 uint64_t hbit = highbit64(size); 1357 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1358 uint64_t max_size = metaslab_block_maxsize(msp); 1359 1360 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1361 ASSERT3U(avl_numnodes(t), ==, 1362 avl_numnodes(&msp->ms_allocatable_by_size)); 1363 1364 if (max_size < size) 1365 return (-1ULL); 1366 1367 rsearch.rs_start = *cursor; 1368 rsearch.rs_end = *cursor + size; 1369 1370 rs = avl_find(t, &rsearch, &where); 1371 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1372 t = &msp->ms_allocatable_by_size; 1373 1374 rsearch.rs_start = 0; 1375 rsearch.rs_end = MIN(max_size, 1376 1ULL << (hbit + metaslab_ndf_clump_shift)); 1377 rs = avl_find(t, &rsearch, &where); 1378 if (rs == NULL) 1379 rs = avl_nearest(t, where, AVL_AFTER); 1380 ASSERT(rs != NULL); 1381 } 1382 1383 if ((rs->rs_end - rs->rs_start) >= size) { 1384 *cursor = rs->rs_start + size; 1385 return (rs->rs_start); 1386 } 1387 return (-1ULL); 1388 } 1389 1390 static metaslab_ops_t metaslab_ndf_ops = { 1391 metaslab_ndf_alloc 1392 }; 1393 1394 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1395 1396 /* 1397 * ========================================================================== 1398 * Metaslabs 1399 * ========================================================================== 1400 */ 1401 1402 /* 1403 * Wait for any in-progress metaslab loads to complete. 1404 */ 1405 void 1406 metaslab_load_wait(metaslab_t *msp) 1407 { 1408 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1409 1410 while (msp->ms_loading) { 1411 ASSERT(!msp->ms_loaded); 1412 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1413 } 1414 } 1415 1416 /* 1417 * Wait for any in-progress flushing to complete. 1418 */ 1419 void 1420 metaslab_flush_wait(metaslab_t *msp) 1421 { 1422 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1423 1424 while (msp->ms_flushing) 1425 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); 1426 } 1427 1428 uint64_t 1429 metaslab_allocated_space(metaslab_t *msp) 1430 { 1431 return (msp->ms_allocated_space); 1432 } 1433 1434 /* 1435 * Verify that the space accounting on disk matches the in-core range_trees. 1436 */ 1437 static void 1438 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 1439 { 1440 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1441 uint64_t allocating = 0; 1442 uint64_t sm_free_space, msp_free_space; 1443 1444 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1445 ASSERT(!msp->ms_condensing); 1446 1447 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1448 return; 1449 1450 /* 1451 * We can only verify the metaslab space when we're called 1452 * from syncing context with a loaded metaslab that has an 1453 * allocated space map. Calling this in non-syncing context 1454 * does not provide a consistent view of the metaslab since 1455 * we're performing allocations in the future. 1456 */ 1457 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 1458 !msp->ms_loaded) 1459 return; 1460 1461 /* 1462 * Even though the smp_alloc field can get negative, 1463 * when it comes to a metaslab's space map, that should 1464 * never be the case. 
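	 * (Presumably the field is signed for the benefit of space maps
	 * that are not tied to a single metaslab, e.g. a log space map that
	 * records both allocs and frees across many metaslabs and can go
	 * net negative; a metaslab's own space map never should.)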
1465 */ 1466 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); 1467 1468 ASSERT3U(space_map_allocated(msp->ms_sm), >=, 1469 range_tree_space(msp->ms_unflushed_frees)); 1470 1471 ASSERT3U(metaslab_allocated_space(msp), ==, 1472 space_map_allocated(msp->ms_sm) + 1473 range_tree_space(msp->ms_unflushed_allocs) - 1474 range_tree_space(msp->ms_unflushed_frees)); 1475 1476 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); 1477 1478 /* 1479 * Account for future allocations since we would have 1480 * already deducted that space from the ms_allocatable. 1481 */ 1482 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 1483 allocating += 1484 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 1485 } 1486 1487 ASSERT3U(msp->ms_deferspace, ==, 1488 range_tree_space(msp->ms_defer[0]) + 1489 range_tree_space(msp->ms_defer[1])); 1490 1491 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + 1492 msp->ms_deferspace + range_tree_space(msp->ms_freed); 1493 1494 VERIFY3U(sm_free_space, ==, msp_free_space); 1495 } 1496 1497 static void 1498 metaslab_aux_histograms_clear(metaslab_t *msp) 1499 { 1500 /* 1501 * Auxiliary histograms are only cleared when resetting them, 1502 * which can only happen while the metaslab is loaded. 1503 */ 1504 ASSERT(msp->ms_loaded); 1505 1506 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1507 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1508 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); 1509 } 1510 1511 static void 1512 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, 1513 range_tree_t *rt) 1514 { 1515 /* 1516 * This is modeled after space_map_histogram_add(), so refer to that 1517 * function for implementation details. We want this to work like 1518 * the space map histogram, and not the range tree histogram, as we 1519 * are essentially constructing a delta that will be later subtracted 1520 * from the space map histogram. 1521 */ 1522 int idx = 0; 1523 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 1524 ASSERT3U(i, >=, idx + shift); 1525 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); 1526 1527 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { 1528 ASSERT3U(idx + shift, ==, i); 1529 idx++; 1530 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); 1531 } 1532 } 1533 } 1534 1535 /* 1536 * Called at every sync pass that the metaslab gets synced. 1537 * 1538 * The reason is that we want our auxiliary histograms to be updated 1539 * wherever the metaslab's space map histogram is updated. This way 1540 * we stay consistent on which parts of the metaslab space map's 1541 * histogram are currently not available for allocations (e.g because 1542 * they are in the defer, freed, and freeing trees). 1543 */ 1544 static void 1545 metaslab_aux_histograms_update(metaslab_t *msp) 1546 { 1547 space_map_t *sm = msp->ms_sm; 1548 ASSERT(sm != NULL); 1549 1550 /* 1551 * This is similar to the metaslab's space map histogram updates 1552 * that take place in metaslab_sync(). The only difference is that 1553 * we only care about segments that haven't made it into the 1554 * ms_allocatable tree yet. 
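	 * Concretely, that means ms_freed and the ms_defer trees while the
	 * metaslab is loaded, plus ms_freeing in every case, which is
	 * exactly what the calls below add to the auxiliary histograms.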
1555 */ 1556 if (msp->ms_loaded) { 1557 metaslab_aux_histograms_clear(msp); 1558 1559 metaslab_aux_histogram_add(msp->ms_synchist, 1560 sm->sm_shift, msp->ms_freed); 1561 1562 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1563 metaslab_aux_histogram_add(msp->ms_deferhist[t], 1564 sm->sm_shift, msp->ms_defer[t]); 1565 } 1566 } 1567 1568 metaslab_aux_histogram_add(msp->ms_synchist, 1569 sm->sm_shift, msp->ms_freeing); 1570 } 1571 1572 /* 1573 * Called every time we are done syncing (writing to) the metaslab, 1574 * i.e. at the end of each sync pass. 1575 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] 1576 */ 1577 static void 1578 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) 1579 { 1580 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1581 space_map_t *sm = msp->ms_sm; 1582 1583 if (sm == NULL) { 1584 /* 1585 * We came here from metaslab_init() when creating/opening a 1586 * pool, looking at a metaslab that hasn't had any allocations 1587 * yet. 1588 */ 1589 return; 1590 } 1591 1592 /* 1593 * This is similar to the actions that we take for the ms_freed 1594 * and ms_defer trees in metaslab_sync_done(). 1595 */ 1596 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; 1597 if (defer_allowed) { 1598 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], 1599 sizeof (msp->ms_synchist)); 1600 } else { 1601 bzero(msp->ms_deferhist[hist_index], 1602 sizeof (msp->ms_deferhist[hist_index])); 1603 } 1604 bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); 1605 } 1606 1607 /* 1608 * Ensure that the metaslab's weight and fragmentation are consistent 1609 * with the contents of the histogram (either the range tree's histogram 1610 * or the space map's depending whether the metaslab is loaded). 1611 */ 1612 static void 1613 metaslab_verify_weight_and_frag(metaslab_t *msp) 1614 { 1615 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1616 1617 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 1618 return; 1619 1620 /* 1621 * We can end up here from vdev_remove_complete(), in which case we 1622 * cannot do these assertions because we hold spa config locks and 1623 * thus we are not allowed to read from the DMU. 1624 * 1625 * We check if the metaslab group has been removed and if that's 1626 * the case we return immediately as that would mean that we are 1627 * here from the aforementioned code path. 1628 */ 1629 if (msp->ms_group == NULL) 1630 return; 1631 1632 /* 1633 * Devices being removed always return a weight of 0 and leave 1634 * fragmentation and ms_max_size as is - there is nothing for 1635 * us to verify here. 1636 */ 1637 vdev_t *vd = msp->ms_group->mg_vd; 1638 if (vd->vdev_removing) 1639 return; 1640 1641 /* 1642 * If the metaslab is dirty it probably means that we've done 1643 * some allocations or frees that have changed our histograms 1644 * and thus the weight. 1645 */ 1646 for (int t = 0; t < TXG_SIZE; t++) { 1647 if (txg_list_member(&vd->vdev_ms_list, msp, t)) 1648 return; 1649 } 1650 1651 /* 1652 * This verification checks that our in-memory state is consistent 1653 * with what's on disk. If the pool is read-only then there aren't 1654 * any changes and we just have the initially-loaded state. 
1655 */ 1656 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) 1657 return; 1658 1659 /* some extra verification for in-core tree if you can */ 1660 if (msp->ms_loaded) { 1661 range_tree_stat_verify(msp->ms_allocatable); 1662 VERIFY(space_map_histogram_verify(msp->ms_sm, 1663 msp->ms_allocatable)); 1664 } 1665 1666 uint64_t weight = msp->ms_weight; 1667 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1668 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); 1669 uint64_t frag = msp->ms_fragmentation; 1670 uint64_t max_segsize = msp->ms_max_size; 1671 1672 msp->ms_weight = 0; 1673 msp->ms_fragmentation = 0; 1674 msp->ms_max_size = 0; 1675 1676 /* 1677 * This function is used for verification purposes. Regardless of 1678 * whether metaslab_weight() thinks this metaslab should be active or 1679 * not, we want to ensure that the actual weight (and therefore the 1680 * value of ms_weight) would be the same if it was to be recalculated 1681 * at this point. 1682 */ 1683 msp->ms_weight = metaslab_weight(msp) | was_active; 1684 1685 VERIFY3U(max_segsize, ==, msp->ms_max_size); 1686 1687 /* 1688 * If the weight type changed then there is no point in doing 1689 * verification. Revert fields to their original values. 1690 */ 1691 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || 1692 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { 1693 msp->ms_fragmentation = frag; 1694 msp->ms_weight = weight; 1695 return; 1696 } 1697 1698 VERIFY3U(msp->ms_fragmentation, ==, frag); 1699 VERIFY3U(msp->ms_weight, ==, weight); 1700 } 1701 1702 static int 1703 metaslab_load_impl(metaslab_t *msp) 1704 { 1705 int error = 0; 1706 1707 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1708 ASSERT(msp->ms_loading); 1709 ASSERT(!msp->ms_condensing); 1710 1711 /* 1712 * We temporarily drop the lock to unblock other operations while we 1713 * are reading the space map. Therefore, metaslab_sync() and 1714 * metaslab_sync_done() can run at the same time as we do. 1715 * 1716 * If we are using the log space maps, metaslab_sync() can't write to 1717 * the metaslab's space map while we are loading as we only write to 1718 * it when we are flushing the metaslab, and that can't happen while 1719 * we are loading it. 1720 * 1721 * If we are not using log space maps though, metaslab_sync() can 1722 * append to the space map while we are loading. Therefore we load 1723 * only entries that existed when we started the load. Additionally, 1724 * metaslab_sync_done() has to wait for the load to complete because 1725 * there are potential races like metaslab_load() loading parts of the 1726 * space map that are currently being appended by metaslab_sync(). If 1727 * we didn't, the ms_allocatable would have entries that 1728 * metaslab_sync_done() would try to re-add later. 1729 * 1730 * That's why before dropping the lock we remember the synced length 1731 * of the metaslab and read up to that point of the space map, 1732 * ignoring entries appended by metaslab_sync() that happen after we 1733 * drop the lock. 1734 */ 1735 uint64_t length = msp->ms_synced_length; 1736 mutex_exit(&msp->ms_lock); 1737 1738 hrtime_t load_start = gethrtime(); 1739 if (msp->ms_sm != NULL) { 1740 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, 1741 SM_FREE, length); 1742 } else { 1743 /* 1744 * The space map has not been allocated yet, so treat 1745 * all the space in the metaslab as free and add it to the 1746 * ms_allocatable tree. 
1747 */ 1748 range_tree_add(msp->ms_allocatable, 1749 msp->ms_start, msp->ms_size); 1750 1751 if (msp->ms_freed != NULL) { 1752 /* 1753 * If the ms_sm doesn't exist, this means that this 1754 * metaslab hasn't gone through metaslab_sync() and 1755 * thus has never been dirtied. So we shouldn't 1756 * expect any unflushed allocs or frees from previous 1757 * TXGs. 1758 * 1759 * Note: ms_freed and all the other trees except for 1760 * the ms_allocatable, can be NULL at this point only 1761 * if this is a new metaslab of a vdev that just got 1762 * expanded. 1763 */ 1764 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 1765 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 1766 } 1767 } 1768 1769 /* 1770 * We need to grab the ms_sync_lock to prevent metaslab_sync() from 1771 * changing the ms_sm (or log_sm) and the metaslab's range trees 1772 * while we are about to use them and populate the ms_allocatable. 1773 * The ms_lock is insufficient for this because metaslab_sync() doesn't 1774 * hold the ms_lock while writing the ms_checkpointing tree to disk. 1775 */ 1776 mutex_enter(&msp->ms_sync_lock); 1777 mutex_enter(&msp->ms_lock); 1778 1779 ASSERT(!msp->ms_condensing); 1780 ASSERT(!msp->ms_flushing); 1781 1782 if (error != 0) { 1783 mutex_exit(&msp->ms_sync_lock); 1784 return (error); 1785 } 1786 1787 ASSERT3P(msp->ms_group, !=, NULL); 1788 msp->ms_loaded = B_TRUE; 1789 1790 /* 1791 * Apply all the unflushed changes to ms_allocatable right 1792 * away so any manipulations we do below have a clear view 1793 * of what is allocated and what is free. 1794 */ 1795 range_tree_walk(msp->ms_unflushed_allocs, 1796 range_tree_remove, msp->ms_allocatable); 1797 range_tree_walk(msp->ms_unflushed_frees, 1798 range_tree_add, msp->ms_allocatable); 1799 1800 msp->ms_loaded = B_TRUE; 1801 1802 ASSERT3P(msp->ms_group, !=, NULL); 1803 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1804 if (spa_syncing_log_sm(spa) != NULL) { 1805 ASSERT(spa_feature_is_enabled(spa, 1806 SPA_FEATURE_LOG_SPACEMAP)); 1807 1808 /* 1809 * If we use a log space map we add all the segments 1810 * that are in ms_unflushed_frees so they are available 1811 * for allocation. 1812 * 1813 * ms_allocatable needs to contain all free segments 1814 * that are ready for allocations (thus not segments 1815 * from ms_freeing, ms_freed, and the ms_defer trees). 1816 * But if we grab the lock in this code path at a sync 1817 * pass later that 1, then it also contains the 1818 * segments of ms_freed (they were added to it earlier 1819 * in this path through ms_unflushed_frees). So we 1820 * need to remove all the segments that exist in 1821 * ms_freed from ms_allocatable as they will be added 1822 * later in metaslab_sync_done(). 1823 * 1824 * When there's no log space map, the ms_allocatable 1825 * correctly doesn't contain any segments that exist 1826 * in ms_freed [see ms_synced_length]. 1827 */ 1828 range_tree_walk(msp->ms_freed, 1829 range_tree_remove, msp->ms_allocatable); 1830 } 1831 1832 /* 1833 * If we are not using the log space map, ms_allocatable 1834 * contains the segments that exist in the ms_defer trees 1835 * [see ms_synced_length]. Thus we need to remove them 1836 * from ms_allocatable as they will be added again in 1837 * metaslab_sync_done(). 1838 * 1839 * If we are using the log space map, ms_allocatable still 1840 * contains the segments that exist in the ms_defer trees. 1841 * Not because it read them through the ms_sm though. 
But 1842 * because these segments are part of ms_unflushed_frees 1843 * whose segments we add to ms_allocatable earlier in this 1844 * code path. 1845 */ 1846 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1847 range_tree_walk(msp->ms_defer[t], 1848 range_tree_remove, msp->ms_allocatable); 1849 } 1850 1851 /* 1852 * Call metaslab_recalculate_weight_and_sort() now that the 1853 * metaslab is loaded so we get the metaslab's real weight. 1854 * 1855 * Unless this metaslab was created with older software and 1856 * has not yet been converted to use segment-based weight, we 1857 * expect the new weight to be better or equal to the weight 1858 * that the metaslab had while it was not loaded. This is 1859 * because the old weight does not take into account the 1860 * consolidation of adjacent segments between TXGs. [see 1861 * comment for ms_synchist and ms_deferhist[] for more info] 1862 */ 1863 uint64_t weight = msp->ms_weight; 1864 metaslab_recalculate_weight_and_sort(msp); 1865 if (!WEIGHT_IS_SPACEBASED(weight)) 1866 ASSERT3U(weight, <=, msp->ms_weight); 1867 msp->ms_max_size = metaslab_block_maxsize(msp); 1868 1869 hrtime_t load_end = gethrtime(); 1870 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 1871 zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " 1872 "ms_id %llu, smp_length %llu, " 1873 "unflushed_allocs %llu, unflushed_frees %llu, " 1874 "freed %llu, defer %llu + %llu, " 1875 "loading_time %lld ms", 1876 spa_syncing_txg(spa), spa_name(spa), 1877 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 1878 space_map_length(msp->ms_sm), 1879 range_tree_space(msp->ms_unflushed_allocs), 1880 range_tree_space(msp->ms_unflushed_frees), 1881 range_tree_space(msp->ms_freed), 1882 range_tree_space(msp->ms_defer[0]), 1883 range_tree_space(msp->ms_defer[1]), 1884 (longlong_t)((load_end - load_start) / 1000000)); 1885 } 1886 1887 metaslab_verify_space(msp, spa_syncing_txg(spa)); 1888 mutex_exit(&msp->ms_sync_lock); 1889 return (0); 1890 } 1891 1892 int 1893 metaslab_load(metaslab_t *msp) 1894 { 1895 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1896 1897 /* 1898 * There may be another thread loading the same metaslab, if that's 1899 * the case just wait until the other thread is done and return. 1900 */ 1901 metaslab_load_wait(msp); 1902 if (msp->ms_loaded) 1903 return (0); 1904 VERIFY(!msp->ms_loading); 1905 ASSERT(!msp->ms_condensing); 1906 1907 /* 1908 * We set the loading flag BEFORE potentially dropping the lock to 1909 * wait for an ongoing flush (see ms_flushing below). This way other 1910 * threads know that there is already a thread that is loading this 1911 * metaslab. 1912 */ 1913 msp->ms_loading = B_TRUE; 1914 1915 /* 1916 * Wait for any in-progress flushing to finish as we drop the ms_lock 1917 * both here (during space_map_load()) and in metaslab_flush() (when 1918 * we flush our changes to the ms_sm). 1919 */ 1920 if (msp->ms_flushing) 1921 metaslab_flush_wait(msp); 1922 1923 /* 1924 * In the possibility that we were waiting for the metaslab to be 1925 * flushed (where we temporarily dropped the ms_lock), ensure that 1926 * no one else loaded the metaslab somehow. 
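 *
 * Any concurrent caller of metaslab_load() would have blocked in
 * metaslab_load_wait() on ms_load_cv, since we set ms_loading
 * before dropping the lock, so finding ms_loaded set here would
 * indicate a broken loading protocol.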
1927 */ 1928 ASSERT(!msp->ms_loaded); 1929 1930 int error = metaslab_load_impl(msp); 1931 1932 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1933 msp->ms_loading = B_FALSE; 1934 cv_broadcast(&msp->ms_load_cv); 1935 1936 return (error); 1937 } 1938 1939 void 1940 metaslab_unload(metaslab_t *msp) 1941 { 1942 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1943 1944 metaslab_verify_weight_and_frag(msp); 1945 1946 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 1947 msp->ms_loaded = B_FALSE; 1948 1949 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1950 msp->ms_max_size = 0; 1951 1952 /* 1953 * We explicitly recalculate the metaslab's weight based on its space 1954 * map (as it is now not loaded). We want unload metaslabs to always 1955 * have their weights calculated from the space map histograms, while 1956 * loaded ones have it calculated from their in-core range tree 1957 * [see metaslab_load()]. This way, the weight reflects the information 1958 * available in-core, whether it is loaded or not. 1959 * 1960 * If ms_group == NULL means that we came here from metaslab_fini(), 1961 * at which point it doesn't make sense for us to do the recalculation 1962 * and the sorting. 1963 */ 1964 if (msp->ms_group != NULL) 1965 metaslab_recalculate_weight_and_sort(msp); 1966 } 1967 1968 void 1969 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, 1970 int64_t defer_delta, int64_t space_delta) 1971 { 1972 vdev_space_update(vd, alloc_delta, defer_delta, space_delta); 1973 1974 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); 1975 ASSERT(vd->vdev_ms_count != 0); 1976 1977 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, 1978 vdev_deflated_space(vd, space_delta)); 1979 } 1980 1981 int 1982 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, 1983 uint64_t txg, metaslab_t **msp) 1984 { 1985 vdev_t *vd = mg->mg_vd; 1986 spa_t *spa = vd->vdev_spa; 1987 objset_t *mos = spa->spa_meta_objset; 1988 metaslab_t *ms; 1989 int error; 1990 1991 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1992 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1993 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1994 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1995 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); 1996 1997 ms->ms_id = id; 1998 ms->ms_start = id << vd->vdev_ms_shift; 1999 ms->ms_size = 1ULL << vd->vdev_ms_shift; 2000 ms->ms_allocator = -1; 2001 ms->ms_new = B_TRUE; 2002 2003 /* 2004 * We only open space map objects that already exist. All others 2005 * will be opened when we finally allocate an object for it. 2006 * 2007 * Note: 2008 * When called from vdev_expand(), we can't call into the DMU as 2009 * we are holding the spa_config_lock as a writer and we would 2010 * deadlock [see relevant comment in vdev_metaslab_init()]. in 2011 * that case, the object parameter is zero though, so we won't 2012 * call into the DMU. 2013 */ 2014 if (object != 0) { 2015 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 2016 ms->ms_size, vd->vdev_ashift); 2017 2018 if (error != 0) { 2019 kmem_free(ms, sizeof (metaslab_t)); 2020 return (error); 2021 } 2022 2023 ASSERT(ms->ms_sm != NULL); 2024 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); 2025 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); 2026 } 2027 2028 /* 2029 * We create the ms_allocatable here, but we don't create the 2030 * other range trees until metaslab_sync_done(). 
This serves 2031 * two purposes: it allows metaslab_sync_done() to detect the 2032 * addition of new space; and for debugging, it ensures that 2033 * we'd data fault on any attempt to use this metaslab before 2034 * it's ready. 2035 */ 2036 ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, 2037 &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); 2038 2039 ms->ms_trim = range_tree_create(NULL, NULL); 2040 2041 metaslab_group_add(mg, ms); 2042 metaslab_set_fragmentation(ms); 2043 2044 /* 2045 * If we're opening an existing pool (txg == 0) or creating 2046 * a new one (txg == TXG_INITIAL), all space is available now. 2047 * If we're adding space to an existing pool, the new space 2048 * does not become available until after this txg has synced. 2049 * The metaslab's weight will also be initialized when we sync 2050 * out this txg. This ensures that we don't attempt to allocate 2051 * from it before we have initialized it completely. 2052 */ 2053 if (txg <= TXG_INITIAL) { 2054 metaslab_sync_done(ms, 0); 2055 metaslab_space_update(vd, mg->mg_class, 2056 metaslab_allocated_space(ms), 0, 0); 2057 } 2058 2059 if (txg != 0) { 2060 vdev_dirty(vd, 0, NULL, txg); 2061 vdev_dirty(vd, VDD_METASLAB, ms, txg); 2062 } 2063 2064 *msp = ms; 2065 2066 return (0); 2067 } 2068 2069 static void 2070 metaslab_fini_flush_data(metaslab_t *msp) 2071 { 2072 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2073 2074 if (metaslab_unflushed_txg(msp) == 0) { 2075 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), 2076 ==, NULL); 2077 return; 2078 } 2079 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 2080 2081 mutex_enter(&spa->spa_flushed_ms_lock); 2082 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 2083 mutex_exit(&spa->spa_flushed_ms_lock); 2084 2085 spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2086 spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); 2087 } 2088 2089 uint64_t 2090 metaslab_unflushed_changes_memused(metaslab_t *ms) 2091 { 2092 return ((range_tree_numsegs(ms->ms_unflushed_allocs) + 2093 range_tree_numsegs(ms->ms_unflushed_frees)) * 2094 sizeof (range_seg_t)); 2095 } 2096 2097 void 2098 metaslab_fini(metaslab_t *msp) 2099 { 2100 metaslab_group_t *mg = msp->ms_group; 2101 vdev_t *vd = mg->mg_vd; 2102 spa_t *spa = vd->vdev_spa; 2103 2104 metaslab_fini_flush_data(msp); 2105 2106 metaslab_group_remove(mg, msp); 2107 2108 mutex_enter(&msp->ms_lock); 2109 VERIFY(msp->ms_group == NULL); 2110 metaslab_space_update(vd, mg->mg_class, 2111 -metaslab_allocated_space(msp), 0, -msp->ms_size); 2112 2113 space_map_close(msp->ms_sm); 2114 msp->ms_sm = NULL; 2115 2116 metaslab_unload(msp); 2117 range_tree_destroy(msp->ms_allocatable); 2118 range_tree_destroy(msp->ms_freeing); 2119 range_tree_destroy(msp->ms_freed); 2120 2121 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 2122 metaslab_unflushed_changes_memused(msp)); 2123 spa->spa_unflushed_stats.sus_memused -= 2124 metaslab_unflushed_changes_memused(msp); 2125 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 2126 range_tree_destroy(msp->ms_unflushed_allocs); 2127 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 2128 range_tree_destroy(msp->ms_unflushed_frees); 2129 2130 for (int t = 0; t < TXG_SIZE; t++) { 2131 range_tree_destroy(msp->ms_allocating[t]); 2132 } 2133 2134 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2135 range_tree_destroy(msp->ms_defer[t]); 2136 } 2137 ASSERT0(msp->ms_deferspace); 2138 2139 range_tree_destroy(msp->ms_checkpointing); 2140 2141 for (int t = 0; t < 
TXG_SIZE; t++) 2142 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); 2143 2144 range_tree_vacate(msp->ms_trim, NULL, NULL); 2145 range_tree_destroy(msp->ms_trim); 2146 2147 mutex_exit(&msp->ms_lock); 2148 cv_destroy(&msp->ms_load_cv); 2149 cv_destroy(&msp->ms_flush_cv); 2150 mutex_destroy(&msp->ms_lock); 2151 mutex_destroy(&msp->ms_sync_lock); 2152 ASSERT3U(msp->ms_allocator, ==, -1); 2153 2154 kmem_free(msp, sizeof (metaslab_t)); 2155 } 2156 2157 #define FRAGMENTATION_TABLE_SIZE 17 2158 2159 /* 2160 * This table defines a segment size based fragmentation metric that will 2161 * allow each metaslab to derive its own fragmentation value. This is done 2162 * by calculating the space in each bucket of the spacemap histogram and 2163 * multiplying that by the fragmentation metric in this table. Doing 2164 * this for all buckets and dividing it by the total amount of free 2165 * space in this metaslab (i.e. the total free space in all buckets) gives 2166 * us the fragmentation metric. This means that a high fragmentation metric 2167 * equates to most of the free space being comprised of small segments. 2168 * Conversely, if the metric is low, then most of the free space is in 2169 * large segments. A 10% change in fragmentation equates to approximately 2170 * double the number of segments. 2171 * 2172 * This table defines 0% fragmented space using 16MB segments. Testing has 2173 * shown that segments that are greater than or equal to 16MB do not suffer 2174 * from drastic performance problems. Using this value, we derive the rest 2175 * of the table. Since the fragmentation value is never stored on disk, it 2176 * is possible to change these calculations in the future. 2177 */ 2178 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 2179 100, /* 512B */ 2180 100, /* 1K */ 2181 98, /* 2K */ 2182 95, /* 4K */ 2183 90, /* 8K */ 2184 80, /* 16K */ 2185 70, /* 32K */ 2186 60, /* 64K */ 2187 50, /* 128K */ 2188 40, /* 256K */ 2189 30, /* 512K */ 2190 20, /* 1M */ 2191 15, /* 2M */ 2192 10, /* 4M */ 2193 5, /* 8M */ 2194 0 /* 16M */ 2195 }; 2196 2197 /* 2198 * Calculate the metaslab's fragmentation metric and set ms_fragmentation. 2199 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not 2200 * been upgraded and does not support this metric. Otherwise, the return 2201 * value should be in the range [0, 100]. 2202 */ 2203 static void 2204 metaslab_set_fragmentation(metaslab_t *msp) 2205 { 2206 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2207 uint64_t fragmentation = 0; 2208 uint64_t total = 0; 2209 boolean_t feature_enabled = spa_feature_is_enabled(spa, 2210 SPA_FEATURE_SPACEMAP_HISTOGRAM); 2211 2212 if (!feature_enabled) { 2213 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2214 return; 2215 } 2216 2217 /* 2218 * A null space map means that the entire metaslab is free 2219 * and thus is not fragmented. 2220 */ 2221 if (msp->ms_sm == NULL) { 2222 msp->ms_fragmentation = 0; 2223 return; 2224 } 2225 2226 /* 2227 * If this metaslab's space map has not been upgraded, flag it 2228 * so that we upgrade next time we encounter it. 2229 */ 2230 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 2231 uint64_t txg = spa_syncing_txg(spa); 2232 vdev_t *vd = msp->ms_group->mg_vd; 2233 2234 /* 2235 * If we've reached the final dirty txg, then we must 2236 * be shutting down the pool. We don't want to dirty 2237 * any data past this point so skip setting the condense 2238 * flag. We can retry this action the next time the pool 2239 * is imported. 
2240 */ 2241 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 2242 msp->ms_condense_wanted = B_TRUE; 2243 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2244 zfs_dbgmsg("txg %llu, requesting force condense: " 2245 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 2246 vd->vdev_id); 2247 } 2248 msp->ms_fragmentation = ZFS_FRAG_INVALID; 2249 return; 2250 } 2251 2252 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2253 uint64_t space = 0; 2254 uint8_t shift = msp->ms_sm->sm_shift; 2255 2256 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 2257 FRAGMENTATION_TABLE_SIZE - 1); 2258 2259 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 2260 continue; 2261 2262 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 2263 total += space; 2264 2265 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 2266 fragmentation += space * zfs_frag_table[idx]; 2267 } 2268 2269 if (total > 0) 2270 fragmentation /= total; 2271 ASSERT3U(fragmentation, <=, 100); 2272 2273 msp->ms_fragmentation = fragmentation; 2274 } 2275 2276 /* 2277 * Compute a weight -- a selection preference value -- for the given metaslab. 2278 * This is based on the amount of free space, the level of fragmentation, 2279 * the LBA range, and whether the metaslab is loaded. 2280 */ 2281 static uint64_t 2282 metaslab_space_weight(metaslab_t *msp) 2283 { 2284 metaslab_group_t *mg = msp->ms_group; 2285 vdev_t *vd = mg->mg_vd; 2286 uint64_t weight, space; 2287 2288 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2289 ASSERT(!vd->vdev_removing); 2290 2291 /* 2292 * The baseline weight is the metaslab's free space. 2293 */ 2294 space = msp->ms_size - metaslab_allocated_space(msp); 2295 2296 if (metaslab_fragmentation_factor_enabled && 2297 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 2298 /* 2299 * Use the fragmentation information to inversely scale 2300 * down the baseline weight. We need to ensure that we 2301 * don't exclude this metaslab completely when it's 100% 2302 * fragmented. To avoid this we reduce the fragmented value 2303 * by 1. 2304 */ 2305 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 2306 2307 /* 2308 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 2309 * this metaslab again. The fragmentation metric may have 2310 * decreased the space to something smaller than 2311 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 2312 * so that we can consume any remaining space. 2313 */ 2314 if (space > 0 && space < SPA_MINBLOCKSIZE) 2315 space = SPA_MINBLOCKSIZE; 2316 } 2317 weight = space; 2318 2319 /* 2320 * Modern disks have uniform bit density and constant angular velocity. 2321 * Therefore, the outer recording zones are faster (higher bandwidth) 2322 * than the inner zones by the ratio of outer to inner track diameter, 2323 * which is typically around 2:1. We account for this by assigning 2324 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 2325 * In effect, this means that we'll select the metaslab with the most 2326 * free bandwidth rather than simply the one with the most free space. 2327 */ 2328 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 2329 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 2330 ASSERT(weight >= space && weight <= 2 * space); 2331 } 2332 2333 /* 2334 * If this metaslab is one we're actively using, adjust its 2335 * weight to make it preferable to any inactive metaslab so 2336 * we'll polish it off. If the fragmentation on this metaslab 2337 * has exceed our threshold, then don't mark it active. 
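 *
 * As an illustrative example of the scaling above (made-up
 * numbers): a metaslab with 10GB free, 30% fragmentation, and
 * ms_id 0 of 100 on a rotational vdev gets
 * space = 10GB * (100 - 29) / 100 ~= 7.1GB, and the LBA factor
 * then doubles that to a weight of ~14.2GB because the metaslab
 * sits in the outermost (fastest) region of the disk.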
2338 */ 2339 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 2340 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 2341 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 2342 } 2343 2344 WEIGHT_SET_SPACEBASED(weight); 2345 return (weight); 2346 } 2347 2348 /* 2349 * Return the weight of the specified metaslab, according to the segment-based 2350 * weighting algorithm. The metaslab must be loaded. This function can 2351 * be called within a sync pass since it relies only on the metaslab's 2352 * range tree which is always accurate when the metaslab is loaded. 2353 */ 2354 static uint64_t 2355 metaslab_weight_from_range_tree(metaslab_t *msp) 2356 { 2357 uint64_t weight = 0; 2358 uint32_t segments = 0; 2359 2360 ASSERT(msp->ms_loaded); 2361 2362 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 2363 i--) { 2364 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 2365 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2366 2367 segments <<= 1; 2368 segments += msp->ms_allocatable->rt_histogram[i]; 2369 2370 /* 2371 * The range tree provides more precision than the space map 2372 * and must be downgraded so that all values fit within the 2373 * space map's histogram. This allows us to compare loaded 2374 * vs. unloaded metaslabs to determine which metaslab is 2375 * considered "best". 2376 */ 2377 if (i > max_idx) 2378 continue; 2379 2380 if (segments != 0) { 2381 WEIGHT_SET_COUNT(weight, segments); 2382 WEIGHT_SET_INDEX(weight, i); 2383 WEIGHT_SET_ACTIVE(weight, 0); 2384 break; 2385 } 2386 } 2387 return (weight); 2388 } 2389 2390 /* 2391 * Calculate the weight based on the on-disk histogram. Should be applied 2392 * only to unloaded metaslabs (i.e no incoming allocations) in-order to 2393 * give results consistent with the on-disk state 2394 */ 2395 static uint64_t 2396 metaslab_weight_from_spacemap(metaslab_t *msp) 2397 { 2398 space_map_t *sm = msp->ms_sm; 2399 ASSERT(!msp->ms_loaded); 2400 ASSERT(sm != NULL); 2401 ASSERT3U(space_map_object(sm), !=, 0); 2402 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2403 2404 /* 2405 * Create a joint histogram from all the segments that have made 2406 * it to the metaslab's space map histogram, that are not yet 2407 * available for allocation because they are still in the freeing 2408 * pipeline (e.g. freeing, freed, and defer trees). Then subtract 2409 * these segments from the space map's histogram to get a more 2410 * accurate weight. 2411 */ 2412 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; 2413 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 2414 deferspace_histogram[i] += msp->ms_synchist[i]; 2415 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2416 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 2417 deferspace_histogram[i] += msp->ms_deferhist[t][i]; 2418 } 2419 } 2420 2421 uint64_t weight = 0; 2422 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 2423 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, 2424 deferspace_histogram[i]); 2425 uint64_t count = 2426 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; 2427 if (count != 0) { 2428 WEIGHT_SET_COUNT(weight, count); 2429 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); 2430 WEIGHT_SET_ACTIVE(weight, 0); 2431 break; 2432 } 2433 } 2434 return (weight); 2435 } 2436 2437 /* 2438 * Compute a segment-based weight for the specified metaslab. The weight 2439 * is determined by highest bucket in the histogram. The information 2440 * for the highest bucket is encoded into the weight value. 
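 *
 * For example, if the largest bucket holding free segments is
 * index 14 (segments in the 16K-32K range) and it contains 1000
 * of them, the weight encodes index = 14 and count = 1000, so
 * metaslabs whose largest free segments are bigger sort higher
 * than those with many small segments.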
2441 */ 2442 static uint64_t 2443 metaslab_segment_weight(metaslab_t *msp) 2444 { 2445 metaslab_group_t *mg = msp->ms_group; 2446 uint64_t weight = 0; 2447 uint8_t shift = mg->mg_vd->vdev_ashift; 2448 2449 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2450 2451 /* 2452 * The metaslab is completely free. 2453 */ 2454 if (metaslab_allocated_space(msp) == 0) { 2455 int idx = highbit64(msp->ms_size) - 1; 2456 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 2457 2458 if (idx < max_idx) { 2459 WEIGHT_SET_COUNT(weight, 1ULL); 2460 WEIGHT_SET_INDEX(weight, idx); 2461 } else { 2462 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 2463 WEIGHT_SET_INDEX(weight, max_idx); 2464 } 2465 WEIGHT_SET_ACTIVE(weight, 0); 2466 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 2467 return (weight); 2468 } 2469 2470 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 2471 2472 /* 2473 * If the metaslab is fully allocated then just make the weight 0. 2474 */ 2475 if (metaslab_allocated_space(msp) == msp->ms_size) 2476 return (0); 2477 /* 2478 * If the metaslab is already loaded, then use the range tree to 2479 * determine the weight. Otherwise, we rely on the space map information 2480 * to generate the weight. 2481 */ 2482 if (msp->ms_loaded) { 2483 weight = metaslab_weight_from_range_tree(msp); 2484 } else { 2485 weight = metaslab_weight_from_spacemap(msp); 2486 } 2487 2488 /* 2489 * If the metaslab was active the last time we calculated its weight 2490 * then keep it active. We want to consume the entire region that 2491 * is associated with this weight. 2492 */ 2493 if (msp->ms_activation_weight != 0 && weight != 0) 2494 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 2495 return (weight); 2496 } 2497 2498 /* 2499 * Determine if we should attempt to allocate from this metaslab. If the 2500 * metaslab is loaded, then we can determine if the desired allocation 2501 * can be satisfied by looking at the size of the maximum free segment 2502 * on that metaslab. Otherwise, we make our decision based on the metaslab's 2503 * weight. For segment-based weighting we can determine the maximum 2504 * allocation based on the index encoded in its value. For space-based 2505 * weights we rely on the entire weight (excluding the weight-type bit). 2506 */ 2507 boolean_t 2508 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 2509 { 2510 if (msp->ms_loaded) { 2511 return (msp->ms_max_size >= asize); 2512 } else { 2513 ASSERT0(msp->ms_max_size); 2514 } 2515 2516 boolean_t should_allocate; 2517 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2518 /* 2519 * The metaslab segment weight indicates segments in the 2520 * range [2^i, 2^(i+1)), where i is the index in the weight. 2521 * Since the asize might be in the middle of the range, we 2522 * should attempt the allocation if asize < 2^(i+1). 2523 */ 2524 should_allocate = (asize < 2525 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2526 } else { 2527 should_allocate = (asize <= 2528 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2529 } 2530 return (should_allocate); 2531 } 2532 2533 static uint64_t 2534 metaslab_weight(metaslab_t *msp) 2535 { 2536 vdev_t *vd = msp->ms_group->mg_vd; 2537 spa_t *spa = vd->vdev_spa; 2538 uint64_t weight; 2539 2540 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2541 2542 /* 2543 * If this vdev is in the process of being removed, there is nothing 2544 * for us to do here. 2545 */ 2546 if (vd->vdev_removing) 2547 return (0); 2548 2549 metaslab_set_fragmentation(msp); 2550 2551 /* 2552 * Update the maximum size if the metaslab is loaded. 
This will 2553 * ensure that we get an accurate maximum size if newly freed space 2554 * has been added back into the free tree. 2555 */ 2556 if (msp->ms_loaded) 2557 msp->ms_max_size = metaslab_block_maxsize(msp); 2558 else 2559 ASSERT0(msp->ms_max_size); 2560 2561 /* 2562 * Segment-based weighting requires space map histogram support. 2563 */ 2564 if (zfs_metaslab_segment_weight_enabled && 2565 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2566 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2567 sizeof (space_map_phys_t))) { 2568 weight = metaslab_segment_weight(msp); 2569 } else { 2570 weight = metaslab_space_weight(msp); 2571 } 2572 return (weight); 2573 } 2574 2575 void 2576 metaslab_recalculate_weight_and_sort(metaslab_t *msp) 2577 { 2578 /* note: we preserve the mask (e.g. indication of primary, etc..) */ 2579 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2580 metaslab_group_sort(msp->ms_group, msp, 2581 metaslab_weight(msp) | was_active); 2582 } 2583 2584 static int 2585 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2586 int allocator, uint64_t activation_weight) 2587 { 2588 /* 2589 * If we're activating for the claim code, we don't want to actually 2590 * set the metaslab up for a specific allocator. 2591 */ 2592 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2593 return (0); 2594 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2595 mg->mg_primaries : mg->mg_secondaries); 2596 2597 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2598 mutex_enter(&mg->mg_lock); 2599 if (arr[allocator] != NULL) { 2600 mutex_exit(&mg->mg_lock); 2601 return (EEXIST); 2602 } 2603 2604 arr[allocator] = msp; 2605 ASSERT3S(msp->ms_allocator, ==, -1); 2606 msp->ms_allocator = allocator; 2607 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2608 mutex_exit(&mg->mg_lock); 2609 2610 return (0); 2611 } 2612 2613 static int 2614 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2615 { 2616 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2617 2618 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2619 int error = metaslab_load(msp); 2620 if (error != 0) { 2621 metaslab_group_sort(msp->ms_group, msp, 0); 2622 return (error); 2623 } 2624 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2625 /* 2626 * The metaslab was activated for another allocator 2627 * while we were waiting, we should reselect. 
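 *
 * (metaslab_load() may drop ms_lock while reading the space map,
 * which is the window in which another allocator can activate
 * this metaslab; returning EBUSY makes the caller pick a
 * different metaslab.)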
2628 */ 2629 return (EBUSY); 2630 } 2631 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2632 allocator, activation_weight)) != 0) { 2633 return (error); 2634 } 2635 2636 msp->ms_activation_weight = msp->ms_weight; 2637 metaslab_group_sort(msp->ms_group, msp, 2638 msp->ms_weight | activation_weight); 2639 } 2640 ASSERT(msp->ms_loaded); 2641 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2642 2643 return (0); 2644 } 2645 2646 static void 2647 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2648 uint64_t weight) 2649 { 2650 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2651 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2652 metaslab_group_sort(mg, msp, weight); 2653 return; 2654 } 2655 2656 mutex_enter(&mg->mg_lock); 2657 ASSERT3P(msp->ms_group, ==, mg); 2658 if (msp->ms_primary) { 2659 ASSERT3U(0, <=, msp->ms_allocator); 2660 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2661 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2662 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2663 mg->mg_primaries[msp->ms_allocator] = NULL; 2664 } else { 2665 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2666 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2667 mg->mg_secondaries[msp->ms_allocator] = NULL; 2668 } 2669 msp->ms_allocator = -1; 2670 metaslab_group_sort_impl(mg, msp, weight); 2671 mutex_exit(&mg->mg_lock); 2672 } 2673 2674 static void 2675 metaslab_passivate(metaslab_t *msp, uint64_t weight) 2676 { 2677 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2678 2679 /* 2680 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2681 * this metaslab again. In that case, it had better be empty, 2682 * or we would be leaving space on the table. 2683 */ 2684 ASSERT(size >= SPA_MINBLOCKSIZE || 2685 range_tree_is_empty(msp->ms_allocatable)); 2686 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2687 2688 msp->ms_activation_weight = 0; 2689 metaslab_passivate_allocator(msp->ms_group, msp, weight); 2690 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2691 } 2692 2693 /* 2694 * Segment-based metaslabs are activated once and remain active until 2695 * we either fail an allocation attempt (similar to space-based metaslabs) 2696 * or have exhausted the free space in zfs_metaslab_switch_threshold 2697 * buckets since the metaslab was activated. This function checks to see 2698 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the 2699 * metaslab and passivates it proactively. This allows us to select a 2700 * metaslab with a larger contiguous region, if any remains within this 2701 * metaslab group. If we're in sync pass > 1, then we continue using this 2702 * metaslab so that we don't dirty more blocks and cause more sync passes. 2703 */ 2704 void 2705 metaslab_segment_may_passivate(metaslab_t *msp) 2706 { 2707 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2708 2709 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2710 return; 2711 2712 /* 2713 * Since we are in the middle of a sync pass, the most accurate 2714 * information that is accessible to us is the in-core range tree 2715 * histogram; calculate the new weight based on that information.
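 *
 * For example, with a switch threshold of 2: if the metaslab was
 * activated when its largest free segments fell in bucket index 20
 * (1MB-2MB) and the recomputed weight's index has dropped to 18 or
 * lower, we passivate it so a metaslab with larger contiguous
 * regions can be selected instead.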
2716 */ 2717 uint64_t weight = metaslab_weight_from_range_tree(msp); 2718 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2719 int current_idx = WEIGHT_GET_INDEX(weight); 2720 2721 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2722 metaslab_passivate(msp, weight); 2723 } 2724 2725 static void 2726 metaslab_preload(void *arg) 2727 { 2728 metaslab_t *msp = arg; 2729 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2730 2731 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2732 2733 mutex_enter(&msp->ms_lock); 2734 (void) metaslab_load(msp); 2735 msp->ms_selected_txg = spa_syncing_txg(spa); 2736 mutex_exit(&msp->ms_lock); 2737 } 2738 2739 static void 2740 metaslab_group_preload(metaslab_group_t *mg) 2741 { 2742 spa_t *spa = mg->mg_vd->vdev_spa; 2743 metaslab_t *msp; 2744 avl_tree_t *t = &mg->mg_metaslab_tree; 2745 int m = 0; 2746 2747 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2748 taskq_wait(mg->mg_taskq); 2749 return; 2750 } 2751 2752 mutex_enter(&mg->mg_lock); 2753 2754 /* 2755 * Load the next potential metaslabs 2756 */ 2757 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2758 ASSERT3P(msp->ms_group, ==, mg); 2759 2760 /* 2761 * We preload only the maximum number of metaslabs specified 2762 * by metaslab_preload_limit. If a metaslab is being forced 2763 * to condense then we preload it too. This will ensure 2764 * that force condensing happens in the next txg. 2765 */ 2766 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2767 continue; 2768 } 2769 2770 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2771 msp, TQ_SLEEP) != TASKQID_INVALID); 2772 } 2773 mutex_exit(&mg->mg_lock); 2774 } 2775 2776 /* 2777 * Determine if the space map's on-disk footprint is past our tolerance for 2778 * inefficiency. We would like to use the following criteria to make our 2779 * decision: 2780 * 2781 * 1. Do not condense if the size of the space map object would dramatically 2782 * increase as a result of writing out the free space range tree. 2783 * 2784 * 2. Condense if the on on-disk space map representation is at least 2785 * zfs_condense_pct/100 times the size of the optimal representation 2786 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). 2787 * 2788 * 3. Do not condense if the on-disk size of the space map does not actually 2789 * decrease. 2790 * 2791 * Unfortunately, we cannot compute the on-disk size of the space map in this 2792 * context because we cannot accurately compute the effects of compression, etc. 2793 * Instead, we apply the heuristic described in the block comment for 2794 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2795 * is greater than a threshold number of blocks. 2796 */ 2797 static boolean_t 2798 metaslab_should_condense(metaslab_t *msp) 2799 { 2800 space_map_t *sm = msp->ms_sm; 2801 vdev_t *vd = msp->ms_group->mg_vd; 2802 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2803 2804 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2805 ASSERT(msp->ms_loaded); 2806 ASSERT(sm != NULL); 2807 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); 2808 2809 /* 2810 * We always condense metaslabs that are empty and metaslabs for 2811 * which a condense request has been made. 
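 *
 * As an illustrative example of the check below (made-up numbers):
 * with zfs_condense_pct = 200, a 4K record size, an on-disk space
 * map length of 48K, and an estimated optimal size of 16K, we would
 * condense since 48K >= 2 * 16K and 48K exceeds the block threshold
 * of 4 * 4K.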
2812 */ 2813 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2814 msp->ms_condense_wanted) 2815 return (B_TRUE); 2816 2817 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); 2818 uint64_t object_size = space_map_length(sm); 2819 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2820 msp->ms_allocatable, SM_NO_VDEVID); 2821 2822 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2823 object_size > zfs_metaslab_condense_block_threshold * record_size); 2824 } 2825 2826 /* 2827 * Condense the on-disk space map representation to its minimized form. 2828 * The minimized form consists of a small number of allocations followed 2829 * by the entries of the free range tree (ms_allocatable). The condensed 2830 * spacemap contains all the entries of previous TXGs (including those in 2831 * the pool-wide log spacemaps; thus this is effectively a superset of 2832 * metaslab_flush()), but this TXG's entries still need to be written. 2833 */ 2834 static void 2835 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) 2836 { 2837 range_tree_t *condense_tree; 2838 space_map_t *sm = msp->ms_sm; 2839 uint64_t txg = dmu_tx_get_txg(tx); 2840 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2841 2842 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2843 ASSERT(msp->ms_loaded); 2844 ASSERT(msp->ms_sm != NULL); 2845 2846 /* 2847 * In order to condense the space map, we need to change it so it 2848 * only describes which segments are currently allocated and free. 2849 * 2850 * All the current free space resides in the ms_allocatable, all 2851 * the ms_defer trees, and all the ms_allocating trees. We ignore 2852 * ms_freed because it is empty because we're in sync pass 1. We 2853 * ignore ms_freeing because these changes are not yet reflected 2854 * in the spacemap (they will be written later this txg). 2855 * 2856 * So to truncate the space map to represent all the entries of 2857 * previous TXGs we do the following: 2858 * 2859 * 1] We create a range tree (condense tree) that is 100% allocated. 2860 * 2] We remove from it all segments found in the ms_defer trees 2861 * as those segments are marked as free in the original space 2862 * map. We do the same with the ms_allocating trees for the same 2863 * reason. Removing these segments should be a relatively 2864 * inexpensive operation since we expect these trees to have a 2865 * small number of nodes. 2866 * 3] We vacate any unflushed allocs as they should already exist 2867 * in the condense tree. Then we vacate any unflushed frees as 2868 * they should already be part of ms_allocatable. 2869 * 4] At this point, we would ideally like to remove all segments 2870 * in the ms_allocatable tree from the condense tree. This way 2871 * we would write all the entries of the condense tree as the 2872 * condensed space map, which would only contain allocated 2873 * segments with everything else assumed to be freed. 2874 * 2875 * Doing so can be prohibitively expensive as ms_allocatable can 2876 * be large, and therefore computationally expensive to subtract 2877 * from the condense_tree. Instead we first sync out the 2878 * condense_tree and then the ms_allocatable, in the condensed 2879 * space map. While this is not optimal, it is typically close to 2880 * optimal and more importantly much cheaper to compute. 2881 * 2882 * 5] Finally, as both of the unflushed trees were written to our 2883 * new and condensed metaslab space map, we basically flushed 2884 * all the unflushed changes to disk, thus we call 2885 * metaslab_flush_update(). 
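 *
 * The net effect is that the condensed space map on disk consists
 * of the ALLOC entries of the condense tree followed by the FREE
 * entries of ms_allocatable, which together describe the allocated
 * and free space as of the previous TXGs.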
2886 */ 2887 ASSERT3U(spa_sync_pass(spa), ==, 1); 2888 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ 2889 2890 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2891 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2892 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2893 spa->spa_name, space_map_length(msp->ms_sm), 2894 avl_numnodes(&msp->ms_allocatable->rt_root), 2895 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2896 2897 msp->ms_condense_wanted = B_FALSE; 2898 2899 condense_tree = range_tree_create(NULL, NULL); 2900 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2901 2902 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2903 range_tree_walk(msp->ms_defer[t], 2904 range_tree_remove, condense_tree); 2905 } 2906 2907 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 2908 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2909 range_tree_remove, condense_tree); 2910 } 2911 2912 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 2913 metaslab_unflushed_changes_memused(msp)); 2914 spa->spa_unflushed_stats.sus_memused -= 2915 metaslab_unflushed_changes_memused(msp); 2916 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 2917 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 2918 2919 /* 2920 * We're about to drop the metaslab's lock thus allowing other 2921 * consumers to change its content. Set the metaslab's ms_condensing 2922 * flag to ensure that allocations on this metaslab do not occur 2923 * while we're in the middle of committing it to disk. This is only 2924 * critical for ms_allocatable as all other range trees use per TXG 2925 * views of their content. 2926 */ 2927 msp->ms_condensing = B_TRUE; 2928 2929 mutex_exit(&msp->ms_lock); 2930 uint64_t object = space_map_object(msp->ms_sm); 2931 space_map_truncate(sm, 2932 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 2933 zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); 2934 2935 /* 2936 * space_map_truncate() may have reallocated the spacemap object. 2937 * If so, update the vdev_ms_array. 2938 */ 2939 if (space_map_object(msp->ms_sm) != object) { 2940 object = space_map_object(msp->ms_sm); 2941 dmu_write(spa->spa_meta_objset, 2942 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * 2943 msp->ms_id, sizeof (uint64_t), &object, tx); 2944 } 2945 2946 /* 2947 * Note: 2948 * When the log space map feature is enabled, each space map will 2949 * always have ALLOCS followed by FREES for each sync pass. This is 2950 * typically true even when the log space map feature is disabled, 2951 * except from the case where a metaslab goes through metaslab_sync() 2952 * and gets condensed. In that case the metaslab's space map will have 2953 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS 2954 * followed by FREES (due to space_map_write() in metaslab_sync()) for 2955 * sync pass 1. 2956 */ 2957 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2958 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2959 2960 range_tree_vacate(condense_tree, NULL, NULL); 2961 range_tree_destroy(condense_tree); 2962 mutex_enter(&msp->ms_lock); 2963 2964 msp->ms_condensing = B_FALSE; 2965 metaslab_flush_update(msp, tx); 2966 } 2967 2968 /* 2969 * Called when the metaslab has been flushed (its own spacemap now reflects 2970 * all the contents of the pool-wide spacemap log). Updates the metaslab's 2971 * metadata and any pool-wide related log space map data (e.g. summary, 2972 * obsolete logs, etc.) to reflect that. 
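 *
 * Concretely, the function below updates ms_synced_length, moves
 * the metaslab to its new position in spa_metaslabs_by_flushed,
 * adjusts the metaslab counts of the affected log space maps,
 * deletes any log space maps that have become obsolete, and
 * updates the log space map summary accordingly.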
2973 */ 2974 static void 2975 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) 2976 { 2977 metaslab_group_t *mg = msp->ms_group; 2978 spa_t *spa = mg->mg_vd->vdev_spa; 2979 2980 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2981 2982 ASSERT3U(spa_sync_pass(spa), ==, 1); 2983 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 2984 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 2985 2986 /* 2987 * Just because a metaslab got flushed, that doesn't mean that 2988 * it will pass through metaslab_sync_done(). Thus, make sure to 2989 * update ms_synced_length here in case it doesn't. 2990 */ 2991 msp->ms_synced_length = space_map_length(msp->ms_sm); 2992 2993 /* 2994 * We may end up here from metaslab_condense() without the 2995 * feature being active. In that case this is a no-op. 2996 */ 2997 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 2998 return; 2999 3000 ASSERT(spa_syncing_log_sm(spa) != NULL); 3001 ASSERT(msp->ms_sm != NULL); 3002 ASSERT(metaslab_unflushed_txg(msp) != 0); 3003 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); 3004 3005 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); 3006 3007 /* update metaslab's position in our flushing tree */ 3008 uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); 3009 mutex_enter(&spa->spa_flushed_ms_lock); 3010 avl_remove(&spa->spa_metaslabs_by_flushed, msp); 3011 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3012 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3013 mutex_exit(&spa->spa_flushed_ms_lock); 3014 3015 /* update metaslab counts of spa_log_sm_t nodes */ 3016 spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); 3017 spa_log_sm_increment_current_mscount(spa); 3018 3019 /* cleanup obsolete logs if any */ 3020 uint64_t log_blocks_before = spa_log_sm_nblocks(spa); 3021 spa_cleanup_old_sm_logs(spa, tx); 3022 uint64_t log_blocks_after = spa_log_sm_nblocks(spa); 3023 VERIFY3U(log_blocks_after, <=, log_blocks_before); 3024 3025 /* update log space map summary */ 3026 uint64_t blocks_gone = log_blocks_before - log_blocks_after; 3027 spa_log_summary_add_flushed_metaslab(spa); 3028 spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); 3029 spa_log_summary_decrement_blkcount(spa, blocks_gone); 3030 } 3031 3032 boolean_t 3033 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) 3034 { 3035 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 3036 3037 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3038 ASSERT3U(spa_sync_pass(spa), ==, 1); 3039 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 3040 3041 ASSERT(msp->ms_sm != NULL); 3042 ASSERT(metaslab_unflushed_txg(msp) != 0); 3043 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); 3044 3045 /* 3046 * There is nothing wrong with flushing the same metaslab twice, as 3047 * this codepath should work on that case. However, the current 3048 * flushing scheme makes sure to avoid this situation as we would be 3049 * making all these calls without having anything meaningful to write 3050 * to disk. We assert this behavior here. 3051 */ 3052 ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); 3053 3054 /* 3055 * We can not flush while loading, because then we would 3056 * not load the ms_unflushed_{allocs,frees}. 3057 */ 3058 if (msp->ms_loading) 3059 return (B_FALSE); 3060 3061 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3062 metaslab_verify_weight_and_frag(msp); 3063 3064 /* 3065 * Metaslab condensing is effectively flushing. 
Therefore if the 3066 * metaslab can be condensed we can just condense it instead of 3067 * flushing it. 3068 * 3069 * Note that metaslab_condense() does call metaslab_flush_update() 3070 * so we can just return immediately after condensing. We also 3071 * don't need to care about setting ms_flushing or broadcasting 3072 * ms_flush_cv, even if we temporarily drop the ms_lock in 3073 * metaslab_condense(), as the metaslab is already loaded. 3074 */ 3075 if (msp->ms_loaded && metaslab_should_condense(msp)) { 3076 metaslab_group_t *mg = msp->ms_group; 3077 3078 /* 3079 * For all histogram operations below refer to the 3080 * comments of metaslab_sync() where we follow a 3081 * similar procedure. 3082 */ 3083 metaslab_group_histogram_verify(mg); 3084 metaslab_class_histogram_verify(mg->mg_class); 3085 metaslab_group_histogram_remove(mg, msp); 3086 3087 metaslab_condense(msp, tx); 3088 3089 space_map_histogram_clear(msp->ms_sm); 3090 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 3091 ASSERT(range_tree_is_empty(msp->ms_freed)); 3092 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3093 space_map_histogram_add(msp->ms_sm, 3094 msp->ms_defer[t], tx); 3095 } 3096 metaslab_aux_histograms_update(msp); 3097 3098 metaslab_group_histogram_add(mg, msp); 3099 metaslab_group_histogram_verify(mg); 3100 metaslab_class_histogram_verify(mg->mg_class); 3101 3102 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3103 3104 /* 3105 * Since we recreated the histogram (and potentially 3106 * the ms_sm too while condensing) ensure that the 3107 * weight is updated too because we are not guaranteed 3108 * that this metaslab is dirty and will go through 3109 * metaslab_sync_done(). 3110 */ 3111 metaslab_recalculate_weight_and_sort(msp); 3112 return (B_TRUE); 3113 } 3114 3115 msp->ms_flushing = B_TRUE; 3116 uint64_t sm_len_before = space_map_length(msp->ms_sm); 3117 3118 mutex_exit(&msp->ms_lock); 3119 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, 3120 SM_NO_VDEVID, tx); 3121 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, 3122 SM_NO_VDEVID, tx); 3123 mutex_enter(&msp->ms_lock); 3124 3125 uint64_t sm_len_after = space_map_length(msp->ms_sm); 3126 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { 3127 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " 3128 "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " 3129 "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), 3130 msp->ms_group->mg_vd->vdev_id, msp->ms_id, 3131 range_tree_space(msp->ms_unflushed_allocs), 3132 range_tree_space(msp->ms_unflushed_frees), 3133 (sm_len_after - sm_len_before)); 3134 } 3135 3136 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3137 metaslab_unflushed_changes_memused(msp)); 3138 spa->spa_unflushed_stats.sus_memused -= 3139 metaslab_unflushed_changes_memused(msp); 3140 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); 3141 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); 3142 3143 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3144 metaslab_verify_weight_and_frag(msp); 3145 3146 metaslab_flush_update(msp, tx); 3147 3148 metaslab_verify_space(msp, dmu_tx_get_txg(tx)); 3149 metaslab_verify_weight_and_frag(msp); 3150 3151 msp->ms_flushing = B_FALSE; 3152 cv_broadcast(&msp->ms_flush_cv); 3153 return (B_TRUE); 3154 } 3155 3156 /* 3157 * Write a metaslab to disk in the context of the specified transaction group. 
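 *
 * In outline: this txg's allocations and frees are appended either
 * to the pool-wide log space map (when one is in use) or to the
 * metaslab's own space map, checkpointed frees are written to the
 * vdev's checkpoint space map, and the on-disk space map histogram
 * is refreshed from the in-core trees.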
3158 */ 3159 void 3160 metaslab_sync(metaslab_t *msp, uint64_t txg) 3161 { 3162 metaslab_group_t *mg = msp->ms_group; 3163 vdev_t *vd = mg->mg_vd; 3164 spa_t *spa = vd->vdev_spa; 3165 objset_t *mos = spa_meta_objset(spa); 3166 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 3167 dmu_tx_t *tx; 3168 3169 ASSERT(!vd->vdev_ishole); 3170 3171 /* 3172 * This metaslab has just been added so there's no work to do now. 3173 */ 3174 if (msp->ms_freeing == NULL) { 3175 ASSERT3P(alloctree, ==, NULL); 3176 return; 3177 } 3178 3179 ASSERT3P(alloctree, !=, NULL); 3180 ASSERT3P(msp->ms_freeing, !=, NULL); 3181 ASSERT3P(msp->ms_freed, !=, NULL); 3182 ASSERT3P(msp->ms_checkpointing, !=, NULL); 3183 ASSERT3P(msp->ms_trim, !=, NULL); 3184 3185 /* 3186 * Normally, we don't want to process a metaslab if there are no 3187 * allocations or frees to perform. However, if the metaslab is being 3188 * forced to condense and it's loaded, we need to let it through. 3189 */ 3190 if (range_tree_is_empty(alloctree) && 3191 range_tree_is_empty(msp->ms_freeing) && 3192 range_tree_is_empty(msp->ms_checkpointing) && 3193 !(msp->ms_loaded && msp->ms_condense_wanted)) 3194 return; 3195 3196 3197 VERIFY(txg <= spa_final_dirty_txg(spa)); 3198 3199 /* 3200 * The only state that can actually be changing concurrently 3201 * with metaslab_sync() is the metaslab's ms_allocatable. No 3202 * other thread can be modifying this txg's alloc, freeing, 3203 * freed, or space_map_phys_t. We drop ms_lock whenever we 3204 * could call into the DMU, because the DMU can call down to 3205 * us (e.g. via zio_free()) at any time. 3206 * 3207 * The spa_vdev_remove_thread() can be reading metaslab state 3208 * concurrently, and it is locked out by the ms_sync_lock. 3209 * Note that the ms_lock is insufficient for this, because it 3210 * is dropped by space_map_write(). 3211 */ 3212 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 3213 3214 /* 3215 * Generate a log space map if one doesn't exist already. 3216 */ 3217 spa_generate_syncing_log_sm(spa, tx); 3218 3219 if (msp->ms_sm == NULL) { 3220 uint64_t new_object = space_map_alloc(mos, 3221 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 
3222 zfs_metaslab_sm_blksz_with_log : 3223 zfs_metaslab_sm_blksz_no_log, tx); 3224 VERIFY3U(new_object, !=, 0); 3225 3226 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 3227 msp->ms_id, sizeof (uint64_t), &new_object, tx); 3228 3229 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 3230 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 3231 ASSERT(msp->ms_sm != NULL); 3232 3233 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3234 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3235 ASSERT0(metaslab_allocated_space(msp)); 3236 } 3237 3238 if (metaslab_unflushed_txg(msp) == 0 && 3239 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 3240 ASSERT(spa_syncing_log_sm(spa) != NULL); 3241 3242 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); 3243 spa_log_sm_increment_current_mscount(spa); 3244 spa_log_summary_add_flushed_metaslab(spa); 3245 3246 ASSERT(msp->ms_sm != NULL); 3247 mutex_enter(&spa->spa_flushed_ms_lock); 3248 avl_add(&spa->spa_metaslabs_by_flushed, msp); 3249 mutex_exit(&spa->spa_flushed_ms_lock); 3250 3251 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); 3252 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); 3253 } 3254 3255 if (!range_tree_is_empty(msp->ms_checkpointing) && 3256 vd->vdev_checkpoint_sm == NULL) { 3257 ASSERT(spa_has_checkpoint(spa)); 3258 3259 uint64_t new_object = space_map_alloc(mos, 3260 zfs_vdev_standard_sm_blksz, tx); 3261 VERIFY3U(new_object, !=, 0); 3262 3263 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 3264 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 3265 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 3266 3267 /* 3268 * We save the space map object as an entry in vdev_top_zap 3269 * so it can be retrieved when the pool is reopened after an 3270 * export or through zdb. 3271 */ 3272 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 3273 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 3274 sizeof (new_object), 1, &new_object, tx)); 3275 } 3276 3277 mutex_enter(&msp->ms_sync_lock); 3278 mutex_enter(&msp->ms_lock); 3279 3280 /* 3281 * Note: metaslab_condense() clears the space map's histogram. 3282 * Therefore we must verify and remove this histogram before 3283 * condensing. 3284 */ 3285 metaslab_group_histogram_verify(mg); 3286 metaslab_class_histogram_verify(mg->mg_class); 3287 metaslab_group_histogram_remove(mg, msp); 3288 3289 if (spa->spa_sync_pass == 1 && msp->ms_loaded && 3290 metaslab_should_condense(msp)) 3291 metaslab_condense(msp, tx); 3292 3293 /* 3294 * We'll be going to disk to sync our space accounting, thus we 3295 * drop the ms_lock during that time so allocations coming from 3296 * open-context (ZIL) for future TXGs do not block. 
3297 */ 3298 mutex_exit(&msp->ms_lock); 3299 space_map_t *log_sm = spa_syncing_log_sm(spa); 3300 if (log_sm != NULL) { 3301 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 3302 3303 space_map_write(log_sm, alloctree, SM_ALLOC, 3304 vd->vdev_id, tx); 3305 space_map_write(log_sm, msp->ms_freeing, SM_FREE, 3306 vd->vdev_id, tx); 3307 mutex_enter(&msp->ms_lock); 3308 3309 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, 3310 metaslab_unflushed_changes_memused(msp)); 3311 spa->spa_unflushed_stats.sus_memused -= 3312 metaslab_unflushed_changes_memused(msp); 3313 range_tree_remove_xor_add(alloctree, 3314 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); 3315 range_tree_remove_xor_add(msp->ms_freeing, 3316 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); 3317 spa->spa_unflushed_stats.sus_memused += 3318 metaslab_unflushed_changes_memused(msp); 3319 } else { 3320 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); 3321 3322 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 3323 SM_NO_VDEVID, tx); 3324 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 3325 SM_NO_VDEVID, tx); 3326 mutex_enter(&msp->ms_lock); 3327 } 3328 3329 msp->ms_allocated_space += range_tree_space(alloctree); 3330 ASSERT3U(msp->ms_allocated_space, >=, 3331 range_tree_space(msp->ms_freeing)); 3332 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); 3333 3334 if (!range_tree_is_empty(msp->ms_checkpointing)) { 3335 ASSERT(spa_has_checkpoint(spa)); 3336 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 3337 3338 /* 3339 * Since we are doing writes to disk and the ms_checkpointing 3340 * tree won't be changing during that time, we drop the 3341 * ms_lock while writing to the checkpoint space map, for the 3342 * same reason mentioned above. 3343 */ 3344 mutex_exit(&msp->ms_lock); 3345 space_map_write(vd->vdev_checkpoint_sm, 3346 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 3347 mutex_enter(&msp->ms_lock); 3348 3349 spa->spa_checkpoint_info.sci_dspace += 3350 range_tree_space(msp->ms_checkpointing); 3351 vd->vdev_stat.vs_checkpoint_space += 3352 range_tree_space(msp->ms_checkpointing); 3353 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 3354 -space_map_allocated(vd->vdev_checkpoint_sm)); 3355 3356 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 3357 } 3358 3359 if (msp->ms_loaded) { 3360 /* 3361 * When the space map is loaded, we have an accurate 3362 * histogram in the range tree. This gives us an opportunity 3363 * to bring the space map's histogram up-to-date so we clear 3364 * it first before updating it. 3365 */ 3366 space_map_histogram_clear(msp->ms_sm); 3367 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 3368 3369 /* 3370 * Since we've cleared the histogram we need to add back 3371 * any free space that has already been processed, plus 3372 * any deferred space. This allows the on-disk histogram 3373 * to accurately reflect all free space even if some space 3374 * is not yet available for allocation (i.e. deferred). 3375 */ 3376 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 3377 3378 /* 3379 * Add back any deferred free space that has not been 3380 * added back into the in-core free tree yet. This will 3381 * ensure that we don't end up with a space map histogram 3382 * that is completely empty unless the metaslab is fully 3383 * allocated. 
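 *
 * In other words, for a loaded metaslab the on-disk histogram is
 * rebuilt from ms_allocatable, ms_freed, and the ms_defer trees
 * (with this sync pass's ms_freeing added just below), i.e. from
 * everything that is free or will shortly become free.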
3384 */ 3385 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3386 space_map_histogram_add(msp->ms_sm, 3387 msp->ms_defer[t], tx); 3388 } 3389 } 3390 3391 /* 3392 * Always add the free space from this sync pass to the space 3393 * map histogram. We want to make sure that the on-disk histogram 3394 * accounts for all free space. If the space map is not loaded, 3395 * then we will lose some accuracy but will correct it the next 3396 * time we load the space map. 3397 */ 3398 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 3399 metaslab_aux_histograms_update(msp); 3400 3401 metaslab_group_histogram_add(mg, msp); 3402 metaslab_group_histogram_verify(mg); 3403 metaslab_class_histogram_verify(mg->mg_class); 3404 3405 /* 3406 * For sync pass 1, we avoid traversing this txg's free range tree 3407 * and instead will just swap the pointers for freeing and freed. 3408 * We can safely do this since the freed_tree is guaranteed to be 3409 * empty on the initial pass. 3410 * 3411 * Keep in mind that even if we are currently using a log spacemap 3412 * we want current frees to end up in the ms_allocatable (but not 3413 * get appended to the ms_sm) so their ranges can be reused as usual. 3414 */ 3415 if (spa_sync_pass(spa) == 1) { 3416 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 3417 ASSERT0(msp->ms_allocated_this_txg); 3418 } else { 3419 range_tree_vacate(msp->ms_freeing, 3420 range_tree_add, msp->ms_freed); 3421 } 3422 msp->ms_allocated_this_txg += range_tree_space(alloctree); 3423 range_tree_vacate(alloctree, NULL, NULL); 3424 3425 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3426 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 3427 & TXG_MASK])); 3428 ASSERT0(range_tree_space(msp->ms_freeing)); 3429 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3430 3431 mutex_exit(&msp->ms_lock); 3432 3433 /* 3434 * Verify that the space map object ID has been recorded in the 3435 * vdev_ms_array. 3436 */ 3437 uint64_t object; 3438 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 3439 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); 3440 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); 3441 3442 mutex_exit(&msp->ms_sync_lock); 3443 dmu_tx_commit(tx); 3444 } 3445 3446 /* 3447 * Called after a transaction group has completely synced to mark 3448 * all of the metaslab's free space as usable. 3449 */ 3450 void 3451 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 3452 { 3453 metaslab_group_t *mg = msp->ms_group; 3454 vdev_t *vd = mg->mg_vd; 3455 spa_t *spa = vd->vdev_spa; 3456 range_tree_t **defer_tree; 3457 int64_t alloc_delta, defer_delta; 3458 boolean_t defer_allowed = B_TRUE; 3459 3460 ASSERT(!vd->vdev_ishole); 3461 3462 mutex_enter(&msp->ms_lock); 3463 3464 /* 3465 * If this metaslab is just becoming available, initialize its 3466 * range trees and add its capacity to the vdev. 
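 *
 * (ms_freed doubles as the marker here: it is only NULL when the
 * metaslab's trees have never been set up, so the allocating,
 * freeing/freed/defer, checkpointing and unflushed trees are all
 * created together in this one place.)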
3467 */ 3468 if (msp->ms_freed == NULL) { 3469 for (int t = 0; t < TXG_SIZE; t++) { 3470 ASSERT(msp->ms_allocating[t] == NULL); 3471 3472 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 3473 } 3474 3475 ASSERT3P(msp->ms_freeing, ==, NULL); 3476 msp->ms_freeing = range_tree_create(NULL, NULL); 3477 3478 ASSERT3P(msp->ms_freed, ==, NULL); 3479 msp->ms_freed = range_tree_create(NULL, NULL); 3480 3481 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 3482 ASSERT3P(msp->ms_defer[t], ==, NULL); 3483 msp->ms_defer[t] = range_tree_create(NULL, NULL); 3484 } 3485 3486 ASSERT3P(msp->ms_checkpointing, ==, NULL); 3487 msp->ms_checkpointing = range_tree_create(NULL, NULL); 3488 3489 ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); 3490 msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); 3491 ASSERT3P(msp->ms_unflushed_frees, ==, NULL); 3492 msp->ms_unflushed_frees = range_tree_create(NULL, NULL); 3493 3494 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); 3495 } 3496 ASSERT0(range_tree_space(msp->ms_freeing)); 3497 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3498 3499 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 3500 3501 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 3502 metaslab_class_get_alloc(spa_normal_class(spa)); 3503 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 3504 defer_allowed = B_FALSE; 3505 } 3506 3507 defer_delta = 0; 3508 alloc_delta = msp->ms_allocated_this_txg - 3509 range_tree_space(msp->ms_freed); 3510 3511 if (defer_allowed) { 3512 defer_delta = range_tree_space(msp->ms_freed) - 3513 range_tree_space(*defer_tree); 3514 } else { 3515 defer_delta -= range_tree_space(*defer_tree); 3516 } 3517 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, 3518 defer_delta, 0); 3519 3520 if (spa_syncing_log_sm(spa) == NULL) { 3521 /* 3522 * If there's a metaslab_load() in progress and we don't have 3523 * a log space map, it means that we probably wrote to the 3524 * metaslab's space map. If this is the case, we need to 3525 * make sure that we wait for the load to complete so that we 3526 * have a consistent view of the in-core state of the metaslab. 3527 */ 3528 metaslab_load_wait(msp); 3529 } else { 3530 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 3531 } 3532 3533 /* 3534 * When auto-trimming is enabled, free ranges which are added to 3535 * ms_allocatable are also added to ms_trim. The ms_trim tree is 3536 * periodically consumed by the vdev_autotrim_thread() which issues 3537 * trims for all ranges and then vacates the tree. The ms_trim tree 3538 * can be discarded at any time with the sole consequence of recent 3539 * frees not being trimmed. 3540 */ 3541 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { 3542 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); 3543 if (!defer_allowed) { 3544 range_tree_walk(msp->ms_freed, range_tree_add, 3545 msp->ms_trim); 3546 } 3547 } else { 3548 range_tree_vacate(msp->ms_trim, NULL, NULL); 3549 } 3550 3551 /* 3552 * Move the frees from the defer_tree back to the free 3553 * range tree (if it's loaded). Swap the freed_tree and 3554 * the defer_tree -- this is safe to do because we've 3555 * just emptied out the defer_tree. 3556 */ 3557 range_tree_vacate(*defer_tree, 3558 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 3559 if (defer_allowed) { 3560 range_tree_swap(&msp->ms_freed, defer_tree); 3561 } else { 3562 range_tree_vacate(msp->ms_freed, 3563 msp->ms_loaded ?
range_tree_add : NULL, 3564 msp->ms_allocatable); 3565 } 3566 3567 msp->ms_synced_length = space_map_length(msp->ms_sm); 3568 3569 msp->ms_deferspace += defer_delta; 3570 ASSERT3S(msp->ms_deferspace, >=, 0); 3571 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 3572 if (msp->ms_deferspace != 0) { 3573 /* 3574 * Keep syncing this metaslab until all deferred frees 3575 * are back in circulation. 3576 */ 3577 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 3578 } 3579 metaslab_aux_histograms_update_done(msp, defer_allowed); 3580 3581 if (msp->ms_new) { 3582 msp->ms_new = B_FALSE; 3583 mutex_enter(&mg->mg_lock); 3584 mg->mg_ms_ready++; 3585 mutex_exit(&mg->mg_lock); 3586 } 3587 3588 /* 3589 * Re-sort metaslab within its group now that we've adjusted 3590 * its allocatable space. 3591 */ 3592 metaslab_recalculate_weight_and_sort(msp); 3593 3594 /* 3595 * If the metaslab is loaded and we've not tried to load or allocate 3596 * from it in 'metaslab_unload_delay' txgs, then unload it. 3597 */ 3598 if (msp->ms_loaded && 3599 msp->ms_disabled == 0 && 3600 msp->ms_selected_txg + metaslab_unload_delay < txg) { 3601 3602 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 3603 VERIFY0(range_tree_space( 3604 msp->ms_allocating[(txg + t) & TXG_MASK])); 3605 } 3606 if (msp->ms_allocator != -1) { 3607 metaslab_passivate(msp, msp->ms_weight & 3608 ~METASLAB_ACTIVE_MASK); 3609 } 3610 3611 if (!metaslab_debug_unload) 3612 metaslab_unload(msp); 3613 } 3614 3615 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 3616 ASSERT0(range_tree_space(msp->ms_freeing)); 3617 ASSERT0(range_tree_space(msp->ms_freed)); 3618 ASSERT0(range_tree_space(msp->ms_checkpointing)); 3619 3620 msp->ms_allocated_this_txg = 0; 3621 mutex_exit(&msp->ms_lock); 3622 } 3623 3624 void 3625 metaslab_sync_reassess(metaslab_group_t *mg) 3626 { 3627 spa_t *spa = mg->mg_class->mc_spa; 3628 3629 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3630 metaslab_group_alloc_update(mg); 3631 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 3632 3633 /* 3634 * Preload the next potential metaslabs but only on active 3635 * metaslab groups. We can get into a state where the metaslab 3636 * is no longer active since we dirty metaslabs as we remove 3637 * a device, thus potentially making the metaslab group eligible 3638 * for preloading. 3639 */ 3640 if (mg->mg_activation_count > 0) { 3641 metaslab_group_preload(mg); 3642 } 3643 spa_config_exit(spa, SCL_ALLOC, FTAG); 3644 } 3645 3646 /* 3647 * When writing a ditto block (i.e. more than one DVA for a given BP) on 3648 * the same vdev as an existing DVA of this BP, then try to allocate it 3649 * on a different metaslab than existing DVAs (i.e. a unique metaslab). 
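 *
 * Returns B_TRUE when the candidate metaslab does not overlap the
 * given DVA: the DVA is empty (zero asize), it is on a different
 * top-level vdev, or its offset maps to a different metaslab id
 * (offset >> vdev_ms_shift).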
3650 */ 3651 static boolean_t 3652 metaslab_is_unique(metaslab_t *msp, dva_t *dva) 3653 { 3654 uint64_t dva_ms_id; 3655 3656 if (DVA_GET_ASIZE(dva) == 0) 3657 return (B_TRUE); 3658 3659 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 3660 return (B_TRUE); 3661 3662 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; 3663 3664 return (msp->ms_id != dva_ms_id); 3665 } 3666 3667 /* 3668 * ========================================================================== 3669 * Metaslab allocation tracing facility 3670 * ========================================================================== 3671 */ 3672 kstat_t *metaslab_trace_ksp; 3673 kstat_named_t metaslab_trace_over_limit; 3674 3675 void 3676 metaslab_alloc_trace_init(void) 3677 { 3678 ASSERT(metaslab_alloc_trace_cache == NULL); 3679 metaslab_alloc_trace_cache = kmem_cache_create( 3680 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 3681 0, NULL, NULL, NULL, NULL, NULL, 0); 3682 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 3683 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 3684 if (metaslab_trace_ksp != NULL) { 3685 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 3686 kstat_named_init(&metaslab_trace_over_limit, 3687 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 3688 kstat_install(metaslab_trace_ksp); 3689 } 3690 } 3691 3692 void 3693 metaslab_alloc_trace_fini(void) 3694 { 3695 if (metaslab_trace_ksp != NULL) { 3696 kstat_delete(metaslab_trace_ksp); 3697 metaslab_trace_ksp = NULL; 3698 } 3699 kmem_cache_destroy(metaslab_alloc_trace_cache); 3700 metaslab_alloc_trace_cache = NULL; 3701 } 3702 3703 /* 3704 * Add an allocation trace element to the allocation tracing list. 3705 */ 3706 static void 3707 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 3708 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 3709 int allocator) 3710 { 3711 if (!metaslab_trace_enabled) 3712 return; 3713 3714 /* 3715 * When the tracing list reaches its maximum we remove 3716 * the second element in the list before adding a new one. 3717 * By removing the second element we preserve the original 3718 * entry as a clue to what allocation steps have already been 3719 * performed. 3720 */ 3721 if (zal->zal_size == metaslab_trace_max_entries) { 3722 metaslab_alloc_trace_t *mat_next; 3723 #ifdef DEBUG 3724 panic("too many entries in allocation list"); 3725 #endif 3726 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 3727 zal->zal_size--; 3728 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 3729 list_remove(&zal->zal_list, mat_next); 3730 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 3731 } 3732 3733 metaslab_alloc_trace_t *mat = 3734 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 3735 list_link_init(&mat->mat_list_node); 3736 mat->mat_mg = mg; 3737 mat->mat_msp = msp; 3738 mat->mat_size = psize; 3739 mat->mat_dva_id = dva_id; 3740 mat->mat_offset = offset; 3741 mat->mat_weight = 0; 3742 mat->mat_allocator = allocator; 3743 3744 if (msp != NULL) 3745 mat->mat_weight = msp->ms_weight; 3746 3747 /* 3748 * The list is part of the zio so locking is not required. Only 3749 * a single thread will perform allocations for a given zio. 
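 * (If the list had already hit metaslab_trace_max_entries above, the
 * first entry was kept and the second dropped, so the head of the
 * trace still shows how this allocation attempt began.)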
3750 */ 3751 list_insert_tail(&zal->zal_list, mat); 3752 zal->zal_size++; 3753 3754 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 3755 } 3756 3757 void 3758 metaslab_trace_init(zio_alloc_list_t *zal) 3759 { 3760 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 3761 offsetof(metaslab_alloc_trace_t, mat_list_node)); 3762 zal->zal_size = 0; 3763 } 3764 3765 void 3766 metaslab_trace_fini(zio_alloc_list_t *zal) 3767 { 3768 metaslab_alloc_trace_t *mat; 3769 3770 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 3771 kmem_cache_free(metaslab_alloc_trace_cache, mat); 3772 list_destroy(&zal->zal_list); 3773 zal->zal_size = 0; 3774 } 3775 3776 /* 3777 * ========================================================================== 3778 * Metaslab block operations 3779 * ========================================================================== 3780 */ 3781 3782 static void 3783 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 3784 int allocator) 3785 { 3786 if (!(flags & METASLAB_ASYNC_ALLOC) || 3787 (flags & METASLAB_DONT_THROTTLE)) 3788 return; 3789 3790 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3791 if (!mg->mg_class->mc_alloc_throttle_enabled) 3792 return; 3793 3794 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 3795 } 3796 3797 static void 3798 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 3799 { 3800 uint64_t max = mg->mg_max_alloc_queue_depth; 3801 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3802 while (cur < max) { 3803 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 3804 cur, cur + 1) == cur) { 3805 atomic_inc_64( 3806 &mg->mg_class->mc_alloc_max_slots[allocator]); 3807 return; 3808 } 3809 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 3810 } 3811 } 3812 3813 void 3814 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 3815 int allocator, boolean_t io_complete) 3816 { 3817 if (!(flags & METASLAB_ASYNC_ALLOC) || 3818 (flags & METASLAB_DONT_THROTTLE)) 3819 return; 3820 3821 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3822 if (!mg->mg_class->mc_alloc_throttle_enabled) 3823 return; 3824 3825 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 3826 if (io_complete) 3827 metaslab_group_increment_qdepth(mg, allocator); 3828 } 3829 3830 void 3831 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 3832 int allocator) 3833 { 3834 #ifdef ZFS_DEBUG 3835 const dva_t *dva = bp->blk_dva; 3836 int ndvas = BP_GET_NDVAS(bp); 3837 3838 for (int d = 0; d < ndvas; d++) { 3839 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 3840 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 3841 VERIFY(zfs_refcount_not_held( 3842 &mg->mg_alloc_queue_depth[allocator], tag)); 3843 } 3844 #endif 3845 } 3846 3847 static uint64_t 3848 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 3849 { 3850 uint64_t start; 3851 range_tree_t *rt = msp->ms_allocatable; 3852 metaslab_class_t *mc = msp->ms_group->mg_class; 3853 3854 ASSERT(MUTEX_HELD(&msp->ms_lock)); 3855 VERIFY(!msp->ms_condensing); 3856 VERIFY0(msp->ms_disabled); 3857 3858 start = mc->mc_ops->msop_alloc(msp, size); 3859 if (start != -1ULL) { 3860 metaslab_group_t *mg = msp->ms_group; 3861 vdev_t *vd = mg->mg_vd; 3862 3863 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 3864 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3865 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 3866 range_tree_remove(rt, start, size); 
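/*
 * The segment just removed from ms_allocatable is cleared from ms_trim
 * below so that autotrim never trims space that is now allocated, and
 * it is recorded in ms_allocating[txg & TXG_MASK] so that
 * metaslab_sync() can write it out as an SM_ALLOC record for this txg.
 */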
3867 range_tree_clear(msp->ms_trim, start, size); 3868 3869 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3870 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 3871 3872 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 3873 3874 /* Track the last successful allocation */ 3875 msp->ms_alloc_txg = txg; 3876 metaslab_verify_space(msp, txg); 3877 } 3878 3879 /* 3880 * Now that we've attempted the allocation we need to update the 3881 * metaslab's maximum block size since it may have changed. 3882 */ 3883 msp->ms_max_size = metaslab_block_maxsize(msp); 3884 return (start); 3885 } 3886 3887 /* 3888 * Find the metaslab with the highest weight that is less than what we've 3889 * already tried. In the common case, this means that we will examine each 3890 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3891 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3892 * activated by another thread, and we fail to allocate from the metaslab we 3893 * have selected, we may not try the newly-activated metaslab, and instead 3894 * activate another metaslab. This is not optimal, but generally does not cause 3895 * any problems (a possible exception being if every metaslab is completely full 3896 * except for the newly-activated metaslab which we fail to examine). 3897 */ 3898 static metaslab_t * 3899 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3900 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, 3901 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3902 { 3903 avl_index_t idx; 3904 avl_tree_t *t = &mg->mg_metaslab_tree; 3905 metaslab_t *msp = avl_find(t, search, &idx); 3906 if (msp == NULL) 3907 msp = avl_nearest(t, idx, AVL_AFTER); 3908 3909 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3910 int i; 3911 if (!metaslab_should_allocate(msp, asize)) { 3912 metaslab_trace_add(zal, mg, msp, asize, d, 3913 TRACE_TOO_SMALL, allocator); 3914 continue; 3915 } 3916 3917 /* 3918 * If the selected metaslab is condensing or disabled, 3919 * skip it. 3920 */ 3921 if (msp->ms_condensing || msp->ms_disabled > 0) 3922 continue; 3923 3924 *was_active = msp->ms_allocator != -1; 3925 /* 3926 * If we're activating as primary, this is our first allocation 3927 * from this disk, so we don't need to check how close we are. 3928 * If the metaslab under consideration was already active, 3929 * we're getting desperate enough to steal another allocator's 3930 * metaslab, so we still don't care about distances. 
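 *
 * (The distance check below only matters for the later DVAs of a
 * ditto block: metaslab_is_unique() is consulted against every DVA
 * already placed so that want_unique allocations land on a metaslab
 * that none of them use.)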
3931 */ 3932 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3933 break; 3934 3935 for (i = 0; i < d; i++) { 3936 if (want_unique && 3937 !metaslab_is_unique(msp, &dva[i])) 3938 break; /* try another metaslab */ 3939 } 3940 if (i == d) 3941 break; 3942 } 3943 3944 if (msp != NULL) { 3945 search->ms_weight = msp->ms_weight; 3946 search->ms_start = msp->ms_start + 1; 3947 search->ms_allocator = msp->ms_allocator; 3948 search->ms_primary = msp->ms_primary; 3949 } 3950 return (msp); 3951 } 3952 3953 /* ARGSUSED */ 3954 static uint64_t 3955 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3956 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 3957 int d, int allocator) 3958 { 3959 metaslab_t *msp = NULL; 3960 uint64_t offset = -1ULL; 3961 uint64_t activation_weight; 3962 3963 activation_weight = METASLAB_WEIGHT_PRIMARY; 3964 for (int i = 0; i < d; i++) { 3965 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3966 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3967 activation_weight = METASLAB_WEIGHT_SECONDARY; 3968 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3969 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3970 activation_weight = METASLAB_WEIGHT_CLAIM; 3971 break; 3972 } 3973 } 3974 3975 /* 3976 * If we don't have enough metaslabs active to fill the entire array, we 3977 * just use the 0th slot. 3978 */ 3979 if (mg->mg_ms_ready < mg->mg_allocators * 3) 3980 allocator = 0; 3981 3982 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3983 3984 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3985 search->ms_weight = UINT64_MAX; 3986 search->ms_start = 0; 3987 /* 3988 * At the end of the metaslab tree are the already-active metaslabs, 3989 * first the primaries, then the secondaries. When we resume searching 3990 * through the tree, we need to consider ms_allocator and ms_primary so 3991 * we start in the location right after where we left off, and don't 3992 * accidentally loop forever considering the same metaslabs. 3993 */ 3994 search->ms_allocator = -1; 3995 search->ms_primary = B_TRUE; 3996 for (;;) { 3997 boolean_t was_active = B_FALSE; 3998 3999 mutex_enter(&mg->mg_lock); 4000 4001 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 4002 mg->mg_primaries[allocator] != NULL) { 4003 msp = mg->mg_primaries[allocator]; 4004 was_active = B_TRUE; 4005 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 4006 mg->mg_secondaries[allocator] != NULL) { 4007 msp = mg->mg_secondaries[allocator]; 4008 was_active = B_TRUE; 4009 } else { 4010 msp = find_valid_metaslab(mg, activation_weight, dva, d, 4011 want_unique, asize, allocator, zal, search, 4012 &was_active); 4013 } 4014 4015 mutex_exit(&mg->mg_lock); 4016 if (msp == NULL) { 4017 kmem_free(search, sizeof (*search)); 4018 return (-1ULL); 4019 } 4020 4021 mutex_enter(&msp->ms_lock); 4022 /* 4023 * Ensure that the metaslab we have selected is still 4024 * capable of handling our request. It's possible that 4025 * another thread may have changed the weight while we 4026 * were blocked on the metaslab lock. We check the 4027 * active status first to see if we need to reselect 4028 * a new metaslab. 4029 */ 4030 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 4031 mutex_exit(&msp->ms_lock); 4032 continue; 4033 } 4034 4035 /* 4036 * If the metaslab is freshly activated for an allocator that 4037 * isn't the one we're allocating from, or if it's a primary and 4038 * we're seeking a secondary (or vice versa), we go back and 4039 * select a new metaslab. 
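 *
 * (That is the !was_active case tested below: ms_allocator is already
 * set, but either it is not our allocator or its primary/secondary
 * role does not match the activation weight we are after.)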
4040 */ 4041 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 4042 (msp->ms_allocator != -1) && 4043 (msp->ms_allocator != allocator || ((activation_weight == 4044 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { 4045 mutex_exit(&msp->ms_lock); 4046 continue; 4047 } 4048 4049 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && 4050 activation_weight != METASLAB_WEIGHT_CLAIM) { 4051 metaslab_passivate(msp, msp->ms_weight & 4052 ~METASLAB_WEIGHT_CLAIM); 4053 mutex_exit(&msp->ms_lock); 4054 continue; 4055 } 4056 4057 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 4058 mutex_exit(&msp->ms_lock); 4059 continue; 4060 } 4061 4062 msp->ms_selected_txg = txg; 4063 4064 /* 4065 * Now that we have the lock, recheck to see if we should 4066 * continue to use this metaslab for this allocation. The 4067 * metaslab is now loaded so metaslab_should_allocate() can 4068 * accurately determine if the allocation attempt should 4069 * proceed. 4070 */ 4071 if (!metaslab_should_allocate(msp, asize)) { 4072 /* Passivate this metaslab and select a new one. */ 4073 metaslab_trace_add(zal, mg, msp, asize, d, 4074 TRACE_TOO_SMALL, allocator); 4075 goto next; 4076 } 4077 4078 /* 4079 * If this metaslab is currently condensing then pick again as 4080 * we can't manipulate this metaslab until it's committed 4081 * to disk. If this metaslab is being initialized, we shouldn't 4082 * allocate from it since the allocated region might be 4083 * overwritten after allocation. 4084 */ 4085 if (msp->ms_condensing) { 4086 metaslab_trace_add(zal, mg, msp, asize, d, 4087 TRACE_CONDENSING, allocator); 4088 metaslab_passivate(msp, msp->ms_weight & 4089 ~METASLAB_ACTIVE_MASK); 4090 mutex_exit(&msp->ms_lock); 4091 continue; 4092 } else if (msp->ms_disabled > 0) { 4093 metaslab_trace_add(zal, mg, msp, asize, d, 4094 TRACE_DISABLED, allocator); 4095 metaslab_passivate(msp, msp->ms_weight & 4096 ~METASLAB_ACTIVE_MASK); 4097 mutex_exit(&msp->ms_lock); 4098 continue; 4099 } 4100 4101 offset = metaslab_block_alloc(msp, asize, txg); 4102 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 4103 4104 if (offset != -1ULL) { 4105 /* Proactively passivate the metaslab, if needed */ 4106 metaslab_segment_may_passivate(msp); 4107 break; 4108 } 4109 next: 4110 ASSERT(msp->ms_loaded); 4111 4112 /* 4113 * We were unable to allocate from this metaslab so determine 4114 * a new weight for this metaslab. Now that we have loaded 4115 * the metaslab we can provide a better hint to the metaslab 4116 * selector. 4117 * 4118 * For space-based metaslabs, we use the maximum block size. 4119 * This information is only available when the metaslab 4120 * is loaded and is more accurate than the generic free 4121 * space weight that was calculated by metaslab_weight(). 4122 * This information allows us to quickly compare the maximum 4123 * available allocation in the metaslab to the allocation 4124 * size being requested. 4125 * 4126 * For segment-based metaslabs, determine the new weight 4127 * based on the highest bucket in the range tree. We 4128 * explicitly use the loaded segment weight (i.e. the range 4129 * tree histogram) since it contains the space that is 4130 * currently available for allocation and is accurate 4131 * even within a sync pass. 
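 *
 * (For example, if the largest free segment left in a space-based
 * metaslab is 128K, the passivated weight encodes 128K, so a later
 * 1M request can tell from the weight alone that this metaslab
 * cannot satisfy it.)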
4132 */ 4133 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 4134 uint64_t weight = metaslab_block_maxsize(msp); 4135 WEIGHT_SET_SPACEBASED(weight); 4136 metaslab_passivate(msp, weight); 4137 } else { 4138 metaslab_passivate(msp, 4139 metaslab_weight_from_range_tree(msp)); 4140 } 4141 4142 /* 4143 * We have just failed an allocation attempt, check 4144 * that metaslab_should_allocate() agrees. Otherwise, 4145 * we may end up in an infinite loop retrying the same 4146 * metaslab. 4147 */ 4148 ASSERT(!metaslab_should_allocate(msp, asize)); 4149 4150 mutex_exit(&msp->ms_lock); 4151 } 4152 mutex_exit(&msp->ms_lock); 4153 kmem_free(search, sizeof (*search)); 4154 return (offset); 4155 } 4156 4157 static uint64_t 4158 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 4159 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, 4160 int d, int allocator) 4161 { 4162 uint64_t offset; 4163 ASSERT(mg->mg_initialized); 4164 4165 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, 4166 dva, d, allocator); 4167 4168 mutex_enter(&mg->mg_lock); 4169 if (offset == -1ULL) { 4170 mg->mg_failed_allocations++; 4171 metaslab_trace_add(zal, mg, NULL, asize, d, 4172 TRACE_GROUP_FAILURE, allocator); 4173 if (asize == SPA_GANGBLOCKSIZE) { 4174 /* 4175 * This metaslab group was unable to allocate 4176 * the minimum gang block size so it must be out of 4177 * space. We must notify the allocation throttle 4178 * to start skipping allocation attempts to this 4179 * metaslab group until more space becomes available. 4180 * Note: this failure cannot be caused by the 4181 * allocation throttle since the allocation throttle 4182 * is only responsible for skipping devices and 4183 * not failing block allocations. 4184 */ 4185 mg->mg_no_free_space = B_TRUE; 4186 } 4187 } 4188 mg->mg_allocations++; 4189 mutex_exit(&mg->mg_lock); 4190 return (offset); 4191 } 4192 4193 /* 4194 * Allocate a block for the specified i/o. 4195 */ 4196 int 4197 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 4198 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 4199 zio_alloc_list_t *zal, int allocator) 4200 { 4201 metaslab_group_t *mg, *rotor; 4202 vdev_t *vd; 4203 boolean_t try_hard = B_FALSE; 4204 4205 ASSERT(!DVA_IS_VALID(&dva[d])); 4206 4207 /* 4208 * For testing, make some blocks above a certain size be gang blocks. 4209 * This will also test spilling from special to normal. 4210 */ 4211 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 4212 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 4213 allocator); 4214 return (SET_ERROR(ENOSPC)); 4215 } 4216 4217 /* 4218 * Start at the rotor and loop through all mgs until we find something. 4219 * Note that there's no locking on mc_rotor or mc_aliquot because 4220 * nothing actually breaks if we miss a few updates -- we just won't 4221 * allocate quite as evenly. It all balances out over time. 4222 * 4223 * If we are doing ditto or log blocks, try to spread them across 4224 * consecutive vdevs. If we're forced to reuse a vdev before we've 4225 * allocated all of our ditto blocks, then try and spread them out on 4226 * that vdev as much as possible. If it turns out to not be possible, 4227 * gradually lower our standards until anything becomes acceptable. 4228 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 4229 * gives us hope of containing our fault domains to something we're 4230 * able to reason about. 
Otherwise, any two top-level vdev failures 4231 * will guarantee the loss of data. With consecutive allocation, 4232 * only two adjacent top-level vdev failures will result in data loss. 4233 * 4234 * If we are doing gang blocks (hintdva is non-NULL), try to keep 4235 * ourselves on the same vdev as our gang block header. That 4236 * way, we can hope for locality in vdev_cache, plus it makes our 4237 * fault domains something tractable. 4238 */ 4239 if (hintdva) { 4240 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 4241 4242 /* 4243 * It's possible the vdev we're using as the hint no 4244 * longer exists or its mg has been closed (e.g. by 4245 * device removal). Consult the rotor when 4246 * all else fails. 4247 */ 4248 if (vd != NULL && vd->vdev_mg != NULL) { 4249 mg = vd->vdev_mg; 4250 4251 if (flags & METASLAB_HINTBP_AVOID && 4252 mg->mg_next != NULL) 4253 mg = mg->mg_next; 4254 } else { 4255 mg = mc->mc_rotor; 4256 } 4257 } else if (d != 0) { 4258 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 4259 mg = vd->vdev_mg->mg_next; 4260 } else { 4261 ASSERT(mc->mc_rotor != NULL); 4262 mg = mc->mc_rotor; 4263 } 4264 4265 /* 4266 * If the hint put us into the wrong metaslab class, or into a 4267 * metaslab group that has been passivated, just follow the rotor. 4268 */ 4269 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 4270 mg = mc->mc_rotor; 4271 4272 rotor = mg; 4273 top: 4274 do { 4275 boolean_t allocatable; 4276 4277 ASSERT(mg->mg_activation_count == 1); 4278 vd = mg->mg_vd; 4279 4280 /* 4281 * Don't allocate from faulted devices. 4282 */ 4283 if (try_hard) { 4284 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 4285 allocatable = vdev_allocatable(vd); 4286 spa_config_exit(spa, SCL_ZIO, FTAG); 4287 } else { 4288 allocatable = vdev_allocatable(vd); 4289 } 4290 4291 /* 4292 * Determine if the selected metaslab group is eligible 4293 * for allocations. If we're ganging then don't allow 4294 * this metaslab group to skip allocations since that would 4295 * inadvertently return ENOSPC and suspend the pool 4296 * even though space is still available. 4297 */ 4298 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 4299 allocatable = metaslab_group_allocatable(mg, rotor, 4300 psize, allocator, d); 4301 } 4302 4303 if (!allocatable) { 4304 metaslab_trace_add(zal, mg, NULL, psize, d, 4305 TRACE_NOT_ALLOCATABLE, allocator); 4306 goto next; 4307 } 4308 4309 ASSERT(mg->mg_initialized); 4310 4311 /* 4312 * Avoid writing single-copy data to a failing, 4313 * non-redundant vdev, unless we've already tried all 4314 * other vdevs. 4315 */ 4316 if ((vd->vdev_stat.vs_write_errors > 0 || 4317 vd->vdev_state < VDEV_STATE_HEALTHY) && 4318 d == 0 && !try_hard && vd->vdev_children == 0) { 4319 metaslab_trace_add(zal, mg, NULL, psize, d, 4320 TRACE_VDEV_ERROR, allocator); 4321 goto next; 4322 } 4323 4324 ASSERT(mg->mg_class == mc); 4325 4326 uint64_t asize = vdev_psize_to_asize(vd, psize); 4327 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 4328 4329 /* 4330 * If we don't need to try hard, then require that the 4331 * block be on a different metaslab from any other DVAs 4332 * in this BP (unique=true). If we are trying hard, then 4333 * allow any metaslab to be used (unique=false). 
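 *
 * (Relaxing uniqueness on the try_hard pass trades ditto-block
 * separation for a better chance of finding space at all before we
 * give up and return ENOSPC.)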
4334 */ 4335 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 4336 !try_hard, dva, d, allocator); 4337 4338 if (offset != -1ULL) { 4339 /* 4340 * If we've just selected this metaslab group, 4341 * figure out whether the corresponding vdev is 4342 * over- or under-used relative to the pool, 4343 * and set an allocation bias to even it out. 4344 */ 4345 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 4346 vdev_stat_t *vs = &vd->vdev_stat; 4347 int64_t vu, cu; 4348 4349 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 4350 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 4351 4352 /* 4353 * Calculate how much more or less we should 4354 * try to allocate from this device during 4355 * this iteration around the rotor. 4356 * For example, if a device is 80% full 4357 * and the pool is 20% full then we should 4358 * reduce allocations by 60% on this device. 4359 * 4360 * mg_bias = (20 - 80) * 512K / 100 = -307K 4361 * 4362 * This reduces allocations by 307K for this 4363 * iteration. 4364 */ 4365 mg->mg_bias = ((cu - vu) * 4366 (int64_t)mg->mg_aliquot) / 100; 4367 } else if (!metaslab_bias_enabled) { 4368 mg->mg_bias = 0; 4369 } 4370 4371 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 4372 mg->mg_aliquot + mg->mg_bias) { 4373 mc->mc_rotor = mg->mg_next; 4374 mc->mc_aliquot = 0; 4375 } 4376 4377 DVA_SET_VDEV(&dva[d], vd->vdev_id); 4378 DVA_SET_OFFSET(&dva[d], offset); 4379 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 4380 DVA_SET_ASIZE(&dva[d], asize); 4381 4382 return (0); 4383 } 4384 next: 4385 mc->mc_rotor = mg->mg_next; 4386 mc->mc_aliquot = 0; 4387 } while ((mg = mg->mg_next) != rotor); 4388 4389 /* 4390 * If we haven't tried hard, do so now. 4391 */ 4392 if (!try_hard) { 4393 try_hard = B_TRUE; 4394 goto top; 4395 } 4396 4397 bzero(&dva[d], sizeof (dva_t)); 4398 4399 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 4400 return (SET_ERROR(ENOSPC)); 4401 } 4402 4403 void 4404 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 4405 boolean_t checkpoint) 4406 { 4407 metaslab_t *msp; 4408 spa_t *spa = vd->vdev_spa; 4409 4410 ASSERT(vdev_is_concrete(vd)); 4411 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4412 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 4413 4414 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4415 4416 VERIFY(!msp->ms_condensing); 4417 VERIFY3U(offset, >=, msp->ms_start); 4418 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 4419 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4420 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 4421 4422 metaslab_check_free_impl(vd, offset, asize); 4423 4424 mutex_enter(&msp->ms_lock); 4425 if (range_tree_is_empty(msp->ms_freeing) && 4426 range_tree_is_empty(msp->ms_checkpointing)) { 4427 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 4428 } 4429 4430 if (checkpoint) { 4431 ASSERT(spa_has_checkpoint(spa)); 4432 range_tree_add(msp->ms_checkpointing, offset, asize); 4433 } else { 4434 range_tree_add(msp->ms_freeing, offset, asize); 4435 } 4436 mutex_exit(&msp->ms_lock); 4437 } 4438 4439 /* ARGSUSED */ 4440 void 4441 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4442 uint64_t size, void *arg) 4443 { 4444 boolean_t *checkpoint = arg; 4445 4446 ASSERT3P(checkpoint, !=, NULL); 4447 4448 if (vd->vdev_ops->vdev_op_remap != NULL) 4449 vdev_indirect_mark_obsolete(vd, offset, size); 4450 else 4451 metaslab_free_impl(vd, offset, size, *checkpoint); 4452 } 4453 4454 static void 4455 metaslab_free_impl(vdev_t 
*vd, uint64_t offset, uint64_t size, 4456 boolean_t checkpoint) 4457 { 4458 spa_t *spa = vd->vdev_spa; 4459 4460 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4461 4462 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 4463 return; 4464 4465 if (spa->spa_vdev_removal != NULL && 4466 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 4467 vdev_is_concrete(vd)) { 4468 /* 4469 * Note: we check if the vdev is concrete because when 4470 * we complete the removal, we first change the vdev to be 4471 * an indirect vdev (in open context), and then (in syncing 4472 * context) clear spa_vdev_removal. 4473 */ 4474 free_from_removing_vdev(vd, offset, size); 4475 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 4476 vdev_indirect_mark_obsolete(vd, offset, size); 4477 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4478 metaslab_free_impl_cb, &checkpoint); 4479 } else { 4480 metaslab_free_concrete(vd, offset, size, checkpoint); 4481 } 4482 } 4483 4484 typedef struct remap_blkptr_cb_arg { 4485 blkptr_t *rbca_bp; 4486 spa_remap_cb_t rbca_cb; 4487 vdev_t *rbca_remap_vd; 4488 uint64_t rbca_remap_offset; 4489 void *rbca_cb_arg; 4490 } remap_blkptr_cb_arg_t; 4491 4492 void 4493 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4494 uint64_t size, void *arg) 4495 { 4496 remap_blkptr_cb_arg_t *rbca = arg; 4497 blkptr_t *bp = rbca->rbca_bp; 4498 4499 /* We can not remap split blocks. */ 4500 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 4501 return; 4502 ASSERT0(inner_offset); 4503 4504 if (rbca->rbca_cb != NULL) { 4505 /* 4506 * At this point we know that we are not handling split 4507 * blocks and we invoke the callback on the previous 4508 * vdev which must be indirect. 4509 */ 4510 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 4511 4512 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 4513 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 4514 4515 /* set up remap_blkptr_cb_arg for the next call */ 4516 rbca->rbca_remap_vd = vd; 4517 rbca->rbca_remap_offset = offset; 4518 } 4519 4520 /* 4521 * The phys birth time is that of dva[0]. This ensures that we know 4522 * when each dva was written, so that resilver can determine which 4523 * blocks need to be scrubbed (i.e. those written during the time 4524 * the vdev was offline). It also ensures that the key used in 4525 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 4526 * we didn't change the phys_birth, a lookup in the ARC for a 4527 * remapped BP could find the data that was previously stored at 4528 * this vdev + offset. 4529 */ 4530 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 4531 DVA_GET_VDEV(&bp->blk_dva[0])); 4532 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 4533 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 4534 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 4535 4536 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 4537 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 4538 } 4539 4540 /* 4541 * If the block pointer contains any indirect DVAs, modify them to refer to 4542 * concrete DVAs. Note that this will sometimes not be possible, leaving 4543 * the indirect DVA in place. This happens if the indirect DVA spans multiple 4544 * segments in the mapping (i.e. it is a "split block"). 4545 * 4546 * If the BP was remapped, calls the callback on the original dva (note the 4547 * callback can be called multiple times if the original indirect DVA refers 4548 * to another indirect DVA, etc). 4549 * 4550 * Returns TRUE if the BP was remapped. 
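 *
 * (Whether a remap actually happened is detected after the fact:
 * remap_blkptr_cb() rewrites dva[0] in place, so if dva[0] still names
 * the original indirect vdev once vdev_op_remap() returns, the block
 * was a split block and has been left untouched.)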
4551 */ 4552 boolean_t 4553 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 4554 { 4555 remap_blkptr_cb_arg_t rbca; 4556 4557 if (!zfs_remap_blkptr_enable) 4558 return (B_FALSE); 4559 4560 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 4561 return (B_FALSE); 4562 4563 /* 4564 * Dedup BP's can not be remapped, because ddt_phys_select() depends 4565 * on DVA[0] being the same in the BP as in the DDT (dedup table). 4566 */ 4567 if (BP_GET_DEDUP(bp)) 4568 return (B_FALSE); 4569 4570 /* 4571 * Gang blocks can not be remapped, because 4572 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 4573 * the BP used to read the gang block header (GBH) being the same 4574 * as the DVA[0] that we allocated for the GBH. 4575 */ 4576 if (BP_IS_GANG(bp)) 4577 return (B_FALSE); 4578 4579 /* 4580 * Embedded BP's have no DVA to remap. 4581 */ 4582 if (BP_GET_NDVAS(bp) < 1) 4583 return (B_FALSE); 4584 4585 /* 4586 * Note: we only remap dva[0]. If we remapped other dvas, we 4587 * would no longer know what their phys birth txg is. 4588 */ 4589 dva_t *dva = &bp->blk_dva[0]; 4590 4591 uint64_t offset = DVA_GET_OFFSET(dva); 4592 uint64_t size = DVA_GET_ASIZE(dva); 4593 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 4594 4595 if (vd->vdev_ops->vdev_op_remap == NULL) 4596 return (B_FALSE); 4597 4598 rbca.rbca_bp = bp; 4599 rbca.rbca_cb = callback; 4600 rbca.rbca_remap_vd = vd; 4601 rbca.rbca_remap_offset = offset; 4602 rbca.rbca_cb_arg = arg; 4603 4604 /* 4605 * remap_blkptr_cb() will be called in order for each level of 4606 * indirection, until a concrete vdev is reached or a split block is 4607 * encountered. old_vd and old_offset are updated within the callback 4608 * as we go from the one indirect vdev to the next one (either concrete 4609 * or indirect again) in that order. 4610 */ 4611 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 4612 4613 /* Check if the DVA wasn't remapped because it is a split block */ 4614 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 4615 return (B_FALSE); 4616 4617 return (B_TRUE); 4618 } 4619 4620 /* 4621 * Undo the allocation of a DVA which happened in the given transaction group. 
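 *
 * (Unlike metaslab_free_dva(), the segment is pulled back out of
 * ms_allocating[txg & TXG_MASK] and returned directly to
 * ms_allocatable, bypassing ms_freeing and the defer trees; this path
 * is for blocks that are allocated and freed again within the same
 * syncing txg, e.g. when unwinding a partially failed
 * metaslab_alloc().)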
4622 */ 4623 void 4624 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4625 { 4626 metaslab_t *msp; 4627 vdev_t *vd; 4628 uint64_t vdev = DVA_GET_VDEV(dva); 4629 uint64_t offset = DVA_GET_OFFSET(dva); 4630 uint64_t size = DVA_GET_ASIZE(dva); 4631 4632 ASSERT(DVA_IS_VALID(dva)); 4633 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4634 4635 if (txg > spa_freeze_txg(spa)) 4636 return; 4637 4638 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 4639 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 4640 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 4641 (u_longlong_t)vdev, (u_longlong_t)offset); 4642 ASSERT(0); 4643 return; 4644 } 4645 4646 ASSERT(!vd->vdev_removing); 4647 ASSERT(vdev_is_concrete(vd)); 4648 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 4649 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 4650 4651 if (DVA_GET_GANG(dva)) 4652 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4653 4654 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4655 4656 mutex_enter(&msp->ms_lock); 4657 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 4658 offset, size); 4659 4660 VERIFY(!msp->ms_condensing); 4661 VERIFY3U(offset, >=, msp->ms_start); 4662 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 4663 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 4664 msp->ms_size); 4665 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4666 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4667 range_tree_add(msp->ms_allocatable, offset, size); 4668 mutex_exit(&msp->ms_lock); 4669 } 4670 4671 /* 4672 * Free the block represented by the given DVA. 4673 */ 4674 void 4675 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 4676 { 4677 uint64_t vdev = DVA_GET_VDEV(dva); 4678 uint64_t offset = DVA_GET_OFFSET(dva); 4679 uint64_t size = DVA_GET_ASIZE(dva); 4680 vdev_t *vd = vdev_lookup_top(spa, vdev); 4681 4682 ASSERT(DVA_IS_VALID(dva)); 4683 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 4684 4685 if (DVA_GET_GANG(dva)) { 4686 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4687 } 4688 4689 metaslab_free_impl(vd, offset, size, checkpoint); 4690 } 4691 4692 /* 4693 * Reserve some allocation slots. The reservation system must be called 4694 * before we call into the allocator. If there aren't any available slots 4695 * then the I/O will be throttled until an I/O completes and its slots are 4696 * freed up. The function returns true if it was successful in placing 4697 * the reservation. 4698 */ 4699 boolean_t 4700 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 4701 zio_t *zio, int flags) 4702 { 4703 uint64_t available_slots = 0; 4704 boolean_t slot_reserved = B_FALSE; 4705 uint64_t max = mc->mc_alloc_max_slots[allocator]; 4706 4707 ASSERT(mc->mc_alloc_throttle_enabled); 4708 mutex_enter(&mc->mc_lock); 4709 4710 uint64_t reserved_slots = 4711 zfs_refcount_count(&mc->mc_alloc_slots[allocator]); 4712 if (reserved_slots < max) 4713 available_slots = max - reserved_slots; 4714 4715 if (slots <= available_slots || GANG_ALLOCATION(flags) || 4716 flags & METASLAB_MUST_RESERVE) { 4717 /* 4718 * We reserve the slots individually so that we can unreserve 4719 * them individually when an I/O completes. 
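 * (Each slot is a zfs_refcount hold tagged with the zio itself, which
 * is what lets metaslab_class_throttle_unreserve() release exactly the
 * slots that zio took; GANG_ALLOCATION() and METASLAB_MUST_RESERVE
 * requests may reserve past the per-allocator maximum rather than
 * fail.)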
4720 */ 4721 for (int d = 0; d < slots; d++) { 4722 reserved_slots = 4723 zfs_refcount_add(&mc->mc_alloc_slots[allocator], 4724 zio); 4725 } 4726 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 4727 slot_reserved = B_TRUE; 4728 } 4729 4730 mutex_exit(&mc->mc_lock); 4731 return (slot_reserved); 4732 } 4733 4734 void 4735 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 4736 int allocator, zio_t *zio) 4737 { 4738 ASSERT(mc->mc_alloc_throttle_enabled); 4739 mutex_enter(&mc->mc_lock); 4740 for (int d = 0; d < slots; d++) { 4741 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], 4742 zio); 4743 } 4744 mutex_exit(&mc->mc_lock); 4745 } 4746 4747 static int 4748 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 4749 uint64_t txg) 4750 { 4751 metaslab_t *msp; 4752 spa_t *spa = vd->vdev_spa; 4753 int error = 0; 4754 4755 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 4756 return (ENXIO); 4757 4758 ASSERT3P(vd->vdev_ms, !=, NULL); 4759 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4760 4761 mutex_enter(&msp->ms_lock); 4762 4763 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 4764 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 4765 /* 4766 * No need to fail in that case; someone else has activated the 4767 * metaslab, but that doesn't preclude us from using it. 4768 */ 4769 if (error == EBUSY) 4770 error = 0; 4771 4772 if (error == 0 && 4773 !range_tree_contains(msp->ms_allocatable, offset, size)) 4774 error = SET_ERROR(ENOENT); 4775 4776 if (error || txg == 0) { /* txg == 0 indicates dry run */ 4777 mutex_exit(&msp->ms_lock); 4778 return (error); 4779 } 4780 4781 VERIFY(!msp->ms_condensing); 4782 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 4783 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 4784 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 4785 msp->ms_size); 4786 range_tree_remove(msp->ms_allocatable, offset, size); 4787 range_tree_clear(msp->ms_trim, offset, size); 4788 4789 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 4790 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 4791 vdev_dirty(vd, VDD_METASLAB, msp, txg); 4792 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 4793 offset, size); 4794 } 4795 4796 mutex_exit(&msp->ms_lock); 4797 4798 return (0); 4799 } 4800 4801 typedef struct metaslab_claim_cb_arg_t { 4802 uint64_t mcca_txg; 4803 int mcca_error; 4804 } metaslab_claim_cb_arg_t; 4805 4806 /* ARGSUSED */ 4807 static void 4808 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 4809 uint64_t size, void *arg) 4810 { 4811 metaslab_claim_cb_arg_t *mcca_arg = arg; 4812 4813 if (mcca_arg->mcca_error == 0) { 4814 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 4815 size, mcca_arg->mcca_txg); 4816 } 4817 } 4818 4819 int 4820 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 4821 { 4822 if (vd->vdev_ops->vdev_op_remap != NULL) { 4823 metaslab_claim_cb_arg_t arg; 4824 4825 /* 4826 * Only zdb(1M) can claim on indirect vdevs. This is used 4827 * to detect leaks of mapped space (that are not accounted 4828 * for in the obsolete counts, spacemap, or bpobj). 
4829 */ 4830 ASSERT(!spa_writeable(vd->vdev_spa)); 4831 arg.mcca_error = 0; 4832 arg.mcca_txg = txg; 4833 4834 vd->vdev_ops->vdev_op_remap(vd, offset, size, 4835 metaslab_claim_impl_cb, &arg); 4836 4837 if (arg.mcca_error == 0) { 4838 arg.mcca_error = metaslab_claim_concrete(vd, 4839 offset, size, txg); 4840 } 4841 return (arg.mcca_error); 4842 } else { 4843 return (metaslab_claim_concrete(vd, offset, size, txg)); 4844 } 4845 } 4846 4847 /* 4848 * Intent log support: upon opening the pool after a crash, notify the SPA 4849 * of blocks that the intent log has allocated for immediate write, but 4850 * which are still considered free by the SPA because the last transaction 4851 * group didn't commit yet. 4852 */ 4853 static int 4854 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 4855 { 4856 uint64_t vdev = DVA_GET_VDEV(dva); 4857 uint64_t offset = DVA_GET_OFFSET(dva); 4858 uint64_t size = DVA_GET_ASIZE(dva); 4859 vdev_t *vd; 4860 4861 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 4862 return (SET_ERROR(ENXIO)); 4863 } 4864 4865 ASSERT(DVA_IS_VALID(dva)); 4866 4867 if (DVA_GET_GANG(dva)) 4868 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4869 4870 return (metaslab_claim_impl(vd, offset, size, txg)); 4871 } 4872 4873 int 4874 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 4875 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 4876 zio_alloc_list_t *zal, zio_t *zio, int allocator) 4877 { 4878 dva_t *dva = bp->blk_dva; 4879 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; 4880 int error = 0; 4881 4882 ASSERT(bp->blk_birth == 0); 4883 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 4884 4885 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4886 4887 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 4888 spa_config_exit(spa, SCL_ALLOC, FTAG); 4889 return (SET_ERROR(ENOSPC)); 4890 } 4891 4892 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 4893 ASSERT(BP_GET_NDVAS(bp) == 0); 4894 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 4895 ASSERT3P(zal, !=, NULL); 4896 4897 for (int d = 0; d < ndvas; d++) { 4898 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 4899 txg, flags, zal, allocator); 4900 if (error != 0) { 4901 for (d--; d >= 0; d--) { 4902 metaslab_unalloc_dva(spa, &dva[d], txg); 4903 metaslab_group_alloc_decrement(spa, 4904 DVA_GET_VDEV(&dva[d]), zio, flags, 4905 allocator, B_FALSE); 4906 bzero(&dva[d], sizeof (dva_t)); 4907 } 4908 spa_config_exit(spa, SCL_ALLOC, FTAG); 4909 return (error); 4910 } else { 4911 /* 4912 * Update the metaslab group's queue depth 4913 * based on the newly allocated dva. 4914 */ 4915 metaslab_group_alloc_increment(spa, 4916 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); 4917 } 4918 4919 } 4920 ASSERT(error == 0); 4921 ASSERT(BP_GET_NDVAS(bp) == ndvas); 4922 4923 spa_config_exit(spa, SCL_ALLOC, FTAG); 4924 4925 BP_SET_BIRTH(bp, txg, txg); 4926 4927 return (0); 4928 } 4929 4930 void 4931 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 4932 { 4933 const dva_t *dva = bp->blk_dva; 4934 int ndvas = BP_GET_NDVAS(bp); 4935 4936 ASSERT(!BP_IS_HOLE(bp)); 4937 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 4938 4939 /* 4940 * If we have a checkpoint for the pool we need to make sure that 4941 * the blocks that we free that are part of the checkpoint won't be 4942 * reused until the checkpoint is discarded or we revert to it. 
4943 * 4944 * The checkpoint flag is passed down the metaslab_free code path 4945 * and is set whenever we want to add a block to the checkpoint's 4946 * accounting. That is, we "checkpoint" blocks that existed at the 4947 * time the checkpoint was created and are therefore referenced by 4948 * the checkpointed uberblock. 4949 * 4950 * Note that, we don't checkpoint any blocks if the current 4951 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 4952 * normally as they will be referenced by the checkpointed uberblock. 4953 */ 4954 boolean_t checkpoint = B_FALSE; 4955 if (bp->blk_birth <= spa->spa_checkpoint_txg && 4956 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 4957 /* 4958 * At this point, if the block is part of the checkpoint 4959 * there is no way it was created in the current txg. 4960 */ 4961 ASSERT(!now); 4962 ASSERT3U(spa_syncing_txg(spa), ==, txg); 4963 checkpoint = B_TRUE; 4964 } 4965 4966 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 4967 4968 for (int d = 0; d < ndvas; d++) { 4969 if (now) { 4970 metaslab_unalloc_dva(spa, &dva[d], txg); 4971 } else { 4972 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 4973 metaslab_free_dva(spa, &dva[d], checkpoint); 4974 } 4975 } 4976 4977 spa_config_exit(spa, SCL_FREE, FTAG); 4978 } 4979 4980 int 4981 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 4982 { 4983 const dva_t *dva = bp->blk_dva; 4984 int ndvas = BP_GET_NDVAS(bp); 4985 int error = 0; 4986 4987 ASSERT(!BP_IS_HOLE(bp)); 4988 4989 if (txg != 0) { 4990 /* 4991 * First do a dry run to make sure all DVAs are claimable, 4992 * so we don't have to unwind from partial failures below. 4993 */ 4994 if ((error = metaslab_claim(spa, bp, 0)) != 0) 4995 return (error); 4996 } 4997 4998 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 4999 5000 for (int d = 0; d < ndvas; d++) { 5001 error = metaslab_claim_dva(spa, &dva[d], txg); 5002 if (error != 0) 5003 break; 5004 } 5005 5006 spa_config_exit(spa, SCL_ALLOC, FTAG); 5007 5008 ASSERT(error == 0 || txg == 0); 5009 5010 return (error); 5011 } 5012 5013 /* ARGSUSED */ 5014 static void 5015 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 5016 uint64_t size, void *arg) 5017 { 5018 if (vd->vdev_ops == &vdev_indirect_ops) 5019 return; 5020 5021 metaslab_check_free_impl(vd, offset, size); 5022 } 5023 5024 static void 5025 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 5026 { 5027 metaslab_t *msp; 5028 spa_t *spa = vd->vdev_spa; 5029 5030 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 5031 return; 5032 5033 if (vd->vdev_ops->vdev_op_remap != NULL) { 5034 vd->vdev_ops->vdev_op_remap(vd, offset, size, 5035 metaslab_check_free_impl_cb, NULL); 5036 return; 5037 } 5038 5039 ASSERT(vdev_is_concrete(vd)); 5040 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 5041 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 5042 5043 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5044 5045 mutex_enter(&msp->ms_lock); 5046 if (msp->ms_loaded) { 5047 range_tree_verify_not_present(msp->ms_allocatable, 5048 offset, size); 5049 } 5050 5051 /* 5052 * Check all segments that currently exist in the freeing pipeline. 5053 * 5054 * It would intuitively make sense to also check the current allocating 5055 * tree since metaslab_unalloc_dva() exists for extents that are 5056 * allocated and freed in the same sync pass withing the same txg. 5057 * Unfortunately there are places (e.g. 
the ZIL) where we allocate a 5058 * segment but then we free part of it within the same txg 5059 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the 5060 * current allocating tree. 5061 */ 5062 range_tree_verify_not_present(msp->ms_freeing, offset, size); 5063 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); 5064 range_tree_verify_not_present(msp->ms_freed, offset, size); 5065 for (int j = 0; j < TXG_DEFER_SIZE; j++) 5066 range_tree_verify_not_present(msp->ms_defer[j], offset, size); 5067 range_tree_verify_not_present(msp->ms_trim, offset, size); 5068 mutex_exit(&msp->ms_lock); 5069 } 5070 5071 void 5072 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 5073 { 5074 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 5075 return; 5076 5077 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 5078 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 5079 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 5080 vdev_t *vd = vdev_lookup_top(spa, vdev); 5081 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 5082 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 5083 5084 if (DVA_GET_GANG(&bp->blk_dva[i])) 5085 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 5086 5087 ASSERT3P(vd, !=, NULL); 5088 5089 metaslab_check_free_impl(vd, offset, size); 5090 } 5091 spa_config_exit(spa, SCL_VDEV, FTAG); 5092 } 5093 5094 static void 5095 metaslab_group_disable_wait(metaslab_group_t *mg) 5096 { 5097 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 5098 while (mg->mg_disabled_updating) { 5099 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 5100 } 5101 } 5102 5103 static void 5104 metaslab_group_disabled_increment(metaslab_group_t *mg) 5105 { 5106 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); 5107 ASSERT(mg->mg_disabled_updating); 5108 5109 while (mg->mg_ms_disabled >= max_disabled_ms) { 5110 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); 5111 } 5112 mg->mg_ms_disabled++; 5113 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); 5114 } 5115 5116 /* 5117 * Mark the metaslab as disabled to prevent any allocations on this metaslab. 5118 * We must also track how many metaslabs are currently disabled within a 5119 * metaslab group and limit them to prevent allocation failures from 5120 * occurring because all metaslabs are disabled. 5121 */ 5122 void 5123 metaslab_disable(metaslab_t *msp) 5124 { 5125 ASSERT(!MUTEX_HELD(&msp->ms_lock)); 5126 metaslab_group_t *mg = msp->ms_group; 5127 5128 mutex_enter(&mg->mg_ms_disabled_lock); 5129 5130 /* 5131 * To keep an accurate count of how many threads have disabled 5132 * a specific metaslab group, we only allow one thread to mark 5133 * the metaslab group at a time. This ensures that the value of 5134 * ms_disabled will be accurate when we decide to mark a metaslab 5135 * group as disabled. To do this we force all other threads 5136 * to wait till the metaslab's mg_disabled_updating flag is no 5137 * longer set. 
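 *
 * (metaslab_group_disabled_increment() additionally blocks while
 * mg_ms_disabled is at max_disabled_ms, so only a bounded number of
 * metaslabs per group can be disabled at once; metaslab_enable()
 * decrements the count and broadcasts mg_ms_disabled_cv to wake any
 * waiters.)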
5138 */ 5139 metaslab_group_disable_wait(mg); 5140 mg->mg_disabled_updating = B_TRUE; 5141 if (msp->ms_disabled == 0) { 5142 metaslab_group_disabled_increment(mg); 5143 } 5144 mutex_enter(&msp->ms_lock); 5145 msp->ms_disabled++; 5146 mutex_exit(&msp->ms_lock); 5147 5148 mg->mg_disabled_updating = B_FALSE; 5149 cv_broadcast(&mg->mg_ms_disabled_cv); 5150 mutex_exit(&mg->mg_ms_disabled_lock); 5151 } 5152 5153 void 5154 metaslab_enable(metaslab_t *msp, boolean_t sync) 5155 { 5156 metaslab_group_t *mg = msp->ms_group; 5157 spa_t *spa = mg->mg_vd->vdev_spa; 5158 5159 /* 5160 * Wait for the outstanding IO to be synced to prevent newly 5161 * allocated blocks from being overwritten. This is used by 5162 * initialize and TRIM which are modifying unallocated space. 5163 */ 5164 if (sync) 5165 txg_wait_synced(spa_get_dsl(spa), 0); 5166 5167 mutex_enter(&mg->mg_ms_disabled_lock); 5168 mutex_enter(&msp->ms_lock); 5169 if (--msp->ms_disabled == 0) { 5170 mg->mg_ms_disabled--; 5171 cv_broadcast(&mg->mg_ms_disabled_cv); 5172 } 5173 mutex_exit(&msp->ms_lock); 5174 mutex_exit(&mg->mg_ms_disabled_lock); 5175 } 5176 5177 static void 5178 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) 5179 { 5180 vdev_t *vd = ms->ms_group->mg_vd; 5181 spa_t *spa = vd->vdev_spa; 5182 objset_t *mos = spa_meta_objset(spa); 5183 5184 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); 5185 5186 metaslab_unflushed_phys_t entry = { 5187 .msp_unflushed_txg = metaslab_unflushed_txg(ms), 5188 }; 5189 uint64_t entry_size = sizeof (entry); 5190 uint64_t entry_offset = ms->ms_id * entry_size; 5191 5192 uint64_t object = 0; 5193 int err = zap_lookup(mos, vd->vdev_top_zap, 5194 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 5195 &object); 5196 if (err == ENOENT) { 5197 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, 5198 SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); 5199 VERIFY0(zap_add(mos, vd->vdev_top_zap, 5200 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, 5201 &object, tx)); 5202 } else { 5203 VERIFY0(err); 5204 } 5205 5206 dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, 5207 &entry, tx); 5208 } 5209 5210 void 5211 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) 5212 { 5213 spa_t *spa = ms->ms_group->mg_vd->vdev_spa; 5214 5215 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 5216 return; 5217 5218 ms->ms_unflushed_txg = txg; 5219 metaslab_update_ondisk_flush_data(ms, tx); 5220 } 5221 5222 uint64_t 5223 metaslab_unflushed_txg(metaslab_t *ms) 5224 { 5225 return (ms->ms_unflushed_txg); 5226 } 5227