1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/space_map.h> 32 #include <sys/metaslab_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio.h> 35 #include <sys/spa_impl.h> 36 #include <sys/zfeature.h> 37 38 #define GANG_ALLOCATION(flags) \ 39 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 40 41 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 42 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 43 #define METASLAB_ACTIVE_MASK \ 44 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 45 46 uint64_t metaslab_aliquot = 512ULL << 10; 47 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 48 49 /* 50 * The in-core space map representation is more compact than its on-disk form. 51 * The zfs_condense_pct determines how much more compact the in-core 52 * space_map representation must be before we compact it on-disk. 53 * Values should be greater than or equal to 100. 54 */ 55 int zfs_condense_pct = 200; 56 57 /* 58 * Condensing a metaslab is not guaranteed to actually reduce the amount of 59 * space used on disk. In particular, a space map uses data in increments of 60 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 61 * same number of blocks after condensing. Since the goal of condensing is to 62 * reduce the number of IOPs required to read the space map, we only want to 63 * condense when we can be sure we will reduce the number of blocks used by the 64 * space map. Unfortunately, we cannot precisely compute whether or not this is 65 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 66 * we apply the following heuristic: do not condense a spacemap unless the 67 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 68 * blocks. 69 */ 70 int zfs_metaslab_condense_block_threshold = 4; 71 72 /* 73 * The zfs_mg_noalloc_threshold defines which metaslab groups should 74 * be eligible for allocation. The value is defined as a percentage of 75 * free space. Metaslab groups that have more free space than 76 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 77 * a metaslab group's free space is less than or equal to the 78 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 79 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 
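 * For example (illustrative figures only), with zfs_mg_noalloc_threshold set
 * to 5, a group that has dropped to 4% free space is skipped as long as some
 * other group in the pool still has more than 5% of its space free.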
80 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 81 * groups are allowed to accept allocations. Gang blocks are always 82 * eligible to allocate on any metaslab group. The default value of 0 means 83 * no metaslab group will be excluded based on this criterion. 84 */ 85 int zfs_mg_noalloc_threshold = 0; 86 87 /* 88 * Metaslab groups are considered eligible for allocations if their 89 * fragmentation metric (measured as a percentage) is less than or equal to 90 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 91 * then it will be skipped unless all metaslab groups within the metaslab 92 * class have also crossed this threshold. 93 */ 94 int zfs_mg_fragmentation_threshold = 85; 95 96 /* 97 * Allow metaslabs to keep their active state as long as their fragmentation 98 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 99 * active metaslab that exceeds this threshold will no longer keep its active 100 * status allowing better metaslabs to be selected. 101 */ 102 int zfs_metaslab_fragmentation_threshold = 70; 103 104 /* 105 * When set will load all metaslabs when the pool is first opened. 106 */ 107 int metaslab_debug_load = 0; 108 109 /* 110 * When set will prevent metaslabs from being unloaded. 111 */ 112 int metaslab_debug_unload = 0; 113 114 /* 115 * Minimum size which forces the dynamic allocator to change 116 * its allocation strategy. Once the space map cannot satisfy 117 * an allocation of this size then it switches to using a more 118 * aggressive strategy (i.e. search by size rather than offset). 119 */ 120 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 121 122 /* 123 * The minimum free space, in percent, which must be available 124 * in a space map to continue allocations in a first-fit fashion. 125 * Once the space_map's free space drops below this level we dynamically 126 * switch to using best-fit allocations. 127 */ 128 int metaslab_df_free_pct = 4; 129 130 /* 131 * A metaslab is considered "free" if it contains a contiguous 132 * segment which is greater than metaslab_min_alloc_size. 133 */ 134 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 135 136 /* 137 * Percentage of all cpus that can be used by the metaslab taskq. 138 */ 139 int metaslab_load_pct = 50; 140 141 /* 142 * Determines how many txgs a metaslab may remain loaded without having any 143 * allocations from it. As long as a metaslab continues to be used we will 144 * keep it loaded. 145 */ 146 int metaslab_unload_delay = TXG_SIZE * 2; 147 148 /* 149 * Max number of metaslabs per group to preload. 150 */ 151 int metaslab_preload_limit = SPA_DVAS_PER_BP; 152 153 /* 154 * Enable/disable preloading of metaslabs. 155 */ 156 boolean_t metaslab_preload_enabled = B_TRUE; 157 158 /* 159 * Enable/disable fragmentation weighting on metaslabs. 160 */ 161 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 162 163 /* 164 * Enable/disable lba weighting (i.e. outer tracks are given preference). 165 */ 166 boolean_t metaslab_lba_weighting_enabled = B_TRUE; 167 168 /* 169 * Enable/disable metaslab group biasing.
170 */ 171 boolean_t metaslab_bias_enabled = B_TRUE; 172 173 static uint64_t metaslab_fragmentation(metaslab_t *); 174 175 /* 176 * ========================================================================== 177 * Metaslab classes 178 * ========================================================================== 179 */ 180 metaslab_class_t * 181 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 182 { 183 metaslab_class_t *mc; 184 185 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 186 187 mc->mc_spa = spa; 188 mc->mc_rotor = NULL; 189 mc->mc_ops = ops; 190 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 191 refcount_create_tracked(&mc->mc_alloc_slots); 192 193 return (mc); 194 } 195 196 void 197 metaslab_class_destroy(metaslab_class_t *mc) 198 { 199 ASSERT(mc->mc_rotor == NULL); 200 ASSERT(mc->mc_alloc == 0); 201 ASSERT(mc->mc_deferred == 0); 202 ASSERT(mc->mc_space == 0); 203 ASSERT(mc->mc_dspace == 0); 204 205 refcount_destroy(&mc->mc_alloc_slots); 206 mutex_destroy(&mc->mc_lock); 207 kmem_free(mc, sizeof (metaslab_class_t)); 208 } 209 210 int 211 metaslab_class_validate(metaslab_class_t *mc) 212 { 213 metaslab_group_t *mg; 214 vdev_t *vd; 215 216 /* 217 * Must hold one of the spa_config locks. 218 */ 219 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 220 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 221 222 if ((mg = mc->mc_rotor) == NULL) 223 return (0); 224 225 do { 226 vd = mg->mg_vd; 227 ASSERT(vd->vdev_mg != NULL); 228 ASSERT3P(vd->vdev_top, ==, vd); 229 ASSERT3P(mg->mg_class, ==, mc); 230 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 231 } while ((mg = mg->mg_next) != mc->mc_rotor); 232 233 return (0); 234 } 235 236 void 237 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 238 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 239 { 240 atomic_add_64(&mc->mc_alloc, alloc_delta); 241 atomic_add_64(&mc->mc_deferred, defer_delta); 242 atomic_add_64(&mc->mc_space, space_delta); 243 atomic_add_64(&mc->mc_dspace, dspace_delta); 244 } 245 246 uint64_t 247 metaslab_class_get_alloc(metaslab_class_t *mc) 248 { 249 return (mc->mc_alloc); 250 } 251 252 uint64_t 253 metaslab_class_get_deferred(metaslab_class_t *mc) 254 { 255 return (mc->mc_deferred); 256 } 257 258 uint64_t 259 metaslab_class_get_space(metaslab_class_t *mc) 260 { 261 return (mc->mc_space); 262 } 263 264 uint64_t 265 metaslab_class_get_dspace(metaslab_class_t *mc) 266 { 267 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 268 } 269 270 void 271 metaslab_class_histogram_verify(metaslab_class_t *mc) 272 { 273 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 274 uint64_t *mc_hist; 275 int i; 276 277 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 278 return; 279 280 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 281 KM_SLEEP); 282 283 for (int c = 0; c < rvd->vdev_children; c++) { 284 vdev_t *tvd = rvd->vdev_child[c]; 285 metaslab_group_t *mg = tvd->vdev_mg; 286 287 /* 288 * Skip any holes, uninitialized top-levels, or 289 * vdevs that are not in this metalab class. 290 */ 291 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 292 mg->mg_class != mc) { 293 continue; 294 } 295 296 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 297 mc_hist[i] += mg->mg_histogram[i]; 298 } 299 300 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 301 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 302 303 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 304 } 305 306 /* 307 * Calculate the metaslab class's fragmentation metric. 
The metric 308 * is weighted based on the space contribution of each metaslab group. 309 * The return value will be a number between 0 and 100 (inclusive), or 310 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 311 * zfs_frag_table for more information about the metric. 312 */ 313 uint64_t 314 metaslab_class_fragmentation(metaslab_class_t *mc) 315 { 316 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 317 uint64_t fragmentation = 0; 318 319 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 320 321 for (int c = 0; c < rvd->vdev_children; c++) { 322 vdev_t *tvd = rvd->vdev_child[c]; 323 metaslab_group_t *mg = tvd->vdev_mg; 324 325 /* 326 * Skip any holes, uninitialized top-levels, or 327 * vdevs that are not in this metalab class. 328 */ 329 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 330 mg->mg_class != mc) { 331 continue; 332 } 333 334 /* 335 * If a metaslab group does not contain a fragmentation 336 * metric then just bail out. 337 */ 338 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 339 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 340 return (ZFS_FRAG_INVALID); 341 } 342 343 /* 344 * Determine how much this metaslab_group is contributing 345 * to the overall pool fragmentation metric. 346 */ 347 fragmentation += mg->mg_fragmentation * 348 metaslab_group_get_space(mg); 349 } 350 fragmentation /= metaslab_class_get_space(mc); 351 352 ASSERT3U(fragmentation, <=, 100); 353 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 354 return (fragmentation); 355 } 356 357 /* 358 * Calculate the amount of expandable space that is available in 359 * this metaslab class. If a device is expanded then its expandable 360 * space will be the amount of allocatable space that is currently not 361 * part of this metaslab class. 362 */ 363 uint64_t 364 metaslab_class_expandable_space(metaslab_class_t *mc) 365 { 366 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 367 uint64_t space = 0; 368 369 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 370 for (int c = 0; c < rvd->vdev_children; c++) { 371 vdev_t *tvd = rvd->vdev_child[c]; 372 metaslab_group_t *mg = tvd->vdev_mg; 373 374 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 375 mg->mg_class != mc) { 376 continue; 377 } 378 379 /* 380 * Calculate if we have enough space to add additional 381 * metaslabs. We report the expandable space in terms 382 * of the metaslab size since that's the unit of expansion. 383 */ 384 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, 385 1ULL << tvd->vdev_ms_shift); 386 } 387 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 388 return (space); 389 } 390 391 /* 392 * ========================================================================== 393 * Metaslab groups 394 * ========================================================================== 395 */ 396 static int 397 metaslab_compare(const void *x1, const void *x2) 398 { 399 const metaslab_t *m1 = x1; 400 const metaslab_t *m2 = x2; 401 402 if (m1->ms_weight < m2->ms_weight) 403 return (1); 404 if (m1->ms_weight > m2->ms_weight) 405 return (-1); 406 407 /* 408 * If the weights are identical, use the offset to force uniqueness. 409 */ 410 if (m1->ms_start < m2->ms_start) 411 return (-1); 412 if (m1->ms_start > m2->ms_start) 413 return (1); 414 415 ASSERT3P(m1, ==, m2); 416 417 return (0); 418 } 419 420 /* 421 * Update the allocatable flag and the metaslab group's capacity. 
422 * The allocatable flag is set to true if the capacity is greater than 423 * the zfs_mg_noalloc_threshold and the fragmentation value is less than 424 * or equal to zfs_mg_fragmentation_threshold. If a metaslab group 425 * transitions from allocatable to non-allocatable or vice versa then the 426 * metaslab group's class is updated to reflect the transition. 427 */ 428 static void 429 metaslab_group_alloc_update(metaslab_group_t *mg) 430 { 431 vdev_t *vd = mg->mg_vd; 432 metaslab_class_t *mc = mg->mg_class; 433 vdev_stat_t *vs = &vd->vdev_stat; 434 boolean_t was_allocatable; 435 boolean_t was_initialized; 436 437 ASSERT(vd == vd->vdev_top); 438 439 mutex_enter(&mg->mg_lock); 440 was_allocatable = mg->mg_allocatable; 441 was_initialized = mg->mg_initialized; 442 443 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 444 (vs->vs_space + 1); 445 446 mutex_enter(&mc->mc_lock); 447 448 /* 449 * If the metaslab group was just added then it won't 450 * have any space until we finish syncing out this txg. 451 * At that point we will consider it initialized and available 452 * for allocations. We also don't consider non-activated 453 * metaslab groups (e.g. vdevs that are in the middle of being removed) 454 * to be initialized, because they can't be used for allocation. 455 */ 456 mg->mg_initialized = metaslab_group_initialized(mg); 457 if (!was_initialized && mg->mg_initialized) { 458 mc->mc_groups++; 459 } else if (was_initialized && !mg->mg_initialized) { 460 ASSERT3U(mc->mc_groups, >, 0); 461 mc->mc_groups--; 462 } 463 if (mg->mg_initialized) 464 mg->mg_no_free_space = B_FALSE; 465 466 /* 467 * A metaslab group is considered allocatable if it has plenty 468 * of free space or is not heavily fragmented. We only take 469 * fragmentation into account if the metaslab group has a valid 470 * fragmentation metric (i.e. a value between 0 and 100). 471 */ 472 mg->mg_allocatable = (mg->mg_activation_count > 0 && 473 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 474 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 475 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 476 477 /* 478 * The mc_alloc_groups field maintains a count of the number of 479 * groups in this metaslab class that are still above the 480 * zfs_mg_noalloc_threshold. This is used by the allocating 481 * threads to determine if they should avoid allocations to 482 * a given group. The allocator will avoid allocations to a group 483 * if that group has reached or is below the zfs_mg_noalloc_threshold 484 * and there are still other groups that are above the threshold. 485 * When a group transitions from allocatable to non-allocatable or 486 * vice versa we update the metaslab class to reflect that change. 487 * When the mc_alloc_groups value drops to 0 that means that all 488 * groups have reached the zfs_mg_noalloc_threshold making all groups 489 * eligible for allocations. This effectively means that all devices 490 * are balanced again.
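 *
 * As a purely illustrative example: with four groups and
 * zfs_mg_noalloc_threshold = 5, mc_alloc_groups starts at 4; each group that
 * falls to 5% free space or less decrements it, and while the count is still
 * non-zero those groups are skipped. Once the last group also crosses the
 * threshold the count reaches 0 and every group accepts allocations again.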
491 */ 492 if (was_allocatable && !mg->mg_allocatable) 493 mc->mc_alloc_groups--; 494 else if (!was_allocatable && mg->mg_allocatable) 495 mc->mc_alloc_groups++; 496 mutex_exit(&mc->mc_lock); 497 498 mutex_exit(&mg->mg_lock); 499 } 500 501 metaslab_group_t * 502 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 503 { 504 metaslab_group_t *mg; 505 506 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 507 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 508 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 509 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 510 mg->mg_vd = vd; 511 mg->mg_class = mc; 512 mg->mg_activation_count = 0; 513 mg->mg_initialized = B_FALSE; 514 mg->mg_no_free_space = B_TRUE; 515 refcount_create_tracked(&mg->mg_alloc_queue_depth); 516 517 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 518 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 519 520 return (mg); 521 } 522 523 void 524 metaslab_group_destroy(metaslab_group_t *mg) 525 { 526 ASSERT(mg->mg_prev == NULL); 527 ASSERT(mg->mg_next == NULL); 528 /* 529 * We may have gone below zero with the activation count 530 * either because we never activated in the first place or 531 * because we're done, and possibly removing the vdev. 532 */ 533 ASSERT(mg->mg_activation_count <= 0); 534 535 taskq_destroy(mg->mg_taskq); 536 avl_destroy(&mg->mg_metaslab_tree); 537 mutex_destroy(&mg->mg_lock); 538 refcount_destroy(&mg->mg_alloc_queue_depth); 539 kmem_free(mg, sizeof (metaslab_group_t)); 540 } 541 542 void 543 metaslab_group_activate(metaslab_group_t *mg) 544 { 545 metaslab_class_t *mc = mg->mg_class; 546 metaslab_group_t *mgprev, *mgnext; 547 548 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 549 550 ASSERT(mc->mc_rotor != mg); 551 ASSERT(mg->mg_prev == NULL); 552 ASSERT(mg->mg_next == NULL); 553 ASSERT(mg->mg_activation_count <= 0); 554 555 if (++mg->mg_activation_count <= 0) 556 return; 557 558 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 559 metaslab_group_alloc_update(mg); 560 561 if ((mgprev = mc->mc_rotor) == NULL) { 562 mg->mg_prev = mg; 563 mg->mg_next = mg; 564 } else { 565 mgnext = mgprev->mg_next; 566 mg->mg_prev = mgprev; 567 mg->mg_next = mgnext; 568 mgprev->mg_next = mg; 569 mgnext->mg_prev = mg; 570 } 571 mc->mc_rotor = mg; 572 } 573 574 void 575 metaslab_group_passivate(metaslab_group_t *mg) 576 { 577 metaslab_class_t *mc = mg->mg_class; 578 metaslab_group_t *mgprev, *mgnext; 579 580 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 581 582 if (--mg->mg_activation_count != 0) { 583 ASSERT(mc->mc_rotor != mg); 584 ASSERT(mg->mg_prev == NULL); 585 ASSERT(mg->mg_next == NULL); 586 ASSERT(mg->mg_activation_count < 0); 587 return; 588 } 589 590 taskq_wait(mg->mg_taskq); 591 metaslab_group_alloc_update(mg); 592 593 mgprev = mg->mg_prev; 594 mgnext = mg->mg_next; 595 596 if (mg == mgnext) { 597 mc->mc_rotor = NULL; 598 } else { 599 mc->mc_rotor = mgnext; 600 mgprev->mg_next = mgnext; 601 mgnext->mg_prev = mgprev; 602 } 603 604 mg->mg_prev = NULL; 605 mg->mg_next = NULL; 606 } 607 608 boolean_t 609 metaslab_group_initialized(metaslab_group_t *mg) 610 { 611 vdev_t *vd = mg->mg_vd; 612 vdev_stat_t *vs = &vd->vdev_stat; 613 614 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 615 } 616 617 uint64_t 618 metaslab_group_get_space(metaslab_group_t *mg) 619 { 620 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 621 } 622 623 void 624 metaslab_group_histogram_verify(metaslab_group_t *mg) 625 { 
626 uint64_t *mg_hist; 627 vdev_t *vd = mg->mg_vd; 628 uint64_t ashift = vd->vdev_ashift; 629 int i; 630 631 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 632 return; 633 634 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 635 KM_SLEEP); 636 637 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 638 SPACE_MAP_HISTOGRAM_SIZE + ashift); 639 640 for (int m = 0; m < vd->vdev_ms_count; m++) { 641 metaslab_t *msp = vd->vdev_ms[m]; 642 643 if (msp->ms_sm == NULL) 644 continue; 645 646 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 647 mg_hist[i + ashift] += 648 msp->ms_sm->sm_phys->smp_histogram[i]; 649 } 650 651 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 652 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 653 654 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 655 } 656 657 static void 658 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 659 { 660 metaslab_class_t *mc = mg->mg_class; 661 uint64_t ashift = mg->mg_vd->vdev_ashift; 662 663 ASSERT(MUTEX_HELD(&msp->ms_lock)); 664 if (msp->ms_sm == NULL) 665 return; 666 667 mutex_enter(&mg->mg_lock); 668 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 669 mg->mg_histogram[i + ashift] += 670 msp->ms_sm->sm_phys->smp_histogram[i]; 671 mc->mc_histogram[i + ashift] += 672 msp->ms_sm->sm_phys->smp_histogram[i]; 673 } 674 mutex_exit(&mg->mg_lock); 675 } 676 677 void 678 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 679 { 680 metaslab_class_t *mc = mg->mg_class; 681 uint64_t ashift = mg->mg_vd->vdev_ashift; 682 683 ASSERT(MUTEX_HELD(&msp->ms_lock)); 684 if (msp->ms_sm == NULL) 685 return; 686 687 mutex_enter(&mg->mg_lock); 688 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 689 ASSERT3U(mg->mg_histogram[i + ashift], >=, 690 msp->ms_sm->sm_phys->smp_histogram[i]); 691 ASSERT3U(mc->mc_histogram[i + ashift], >=, 692 msp->ms_sm->sm_phys->smp_histogram[i]); 693 694 mg->mg_histogram[i + ashift] -= 695 msp->ms_sm->sm_phys->smp_histogram[i]; 696 mc->mc_histogram[i + ashift] -= 697 msp->ms_sm->sm_phys->smp_histogram[i]; 698 } 699 mutex_exit(&mg->mg_lock); 700 } 701 702 static void 703 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 704 { 705 ASSERT(msp->ms_group == NULL); 706 mutex_enter(&mg->mg_lock); 707 msp->ms_group = mg; 708 msp->ms_weight = 0; 709 avl_add(&mg->mg_metaslab_tree, msp); 710 mutex_exit(&mg->mg_lock); 711 712 mutex_enter(&msp->ms_lock); 713 metaslab_group_histogram_add(mg, msp); 714 mutex_exit(&msp->ms_lock); 715 } 716 717 static void 718 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 719 { 720 mutex_enter(&msp->ms_lock); 721 metaslab_group_histogram_remove(mg, msp); 722 mutex_exit(&msp->ms_lock); 723 724 mutex_enter(&mg->mg_lock); 725 ASSERT(msp->ms_group == mg); 726 avl_remove(&mg->mg_metaslab_tree, msp); 727 msp->ms_group = NULL; 728 mutex_exit(&mg->mg_lock); 729 } 730 731 static void 732 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 733 { 734 /* 735 * Although in principle the weight can be any value, in 736 * practice we do not use values in the range [1, 511]. 737 */ 738 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 739 ASSERT(MUTEX_HELD(&msp->ms_lock)); 740 741 mutex_enter(&mg->mg_lock); 742 ASSERT(msp->ms_group == mg); 743 avl_remove(&mg->mg_metaslab_tree, msp); 744 msp->ms_weight = weight; 745 avl_add(&mg->mg_metaslab_tree, msp); 746 mutex_exit(&mg->mg_lock); 747 } 748 749 /* 750 * Calculate the fragmentation for a given metaslab group. 
We can use 751 * a simple average here since all metaslabs within the group must have 752 * the same size. The return value will be a value between 0 and 100 753 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this 754 * group have a fragmentation metric. 755 */ 756 uint64_t 757 metaslab_group_fragmentation(metaslab_group_t *mg) 758 { 759 vdev_t *vd = mg->mg_vd; 760 uint64_t fragmentation = 0; 761 uint64_t valid_ms = 0; 762 763 for (int m = 0; m < vd->vdev_ms_count; m++) { 764 metaslab_t *msp = vd->vdev_ms[m]; 765 766 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 767 continue; 768 769 valid_ms++; 770 fragmentation += msp->ms_fragmentation; 771 } 772 773 if (valid_ms <= vd->vdev_ms_count / 2) 774 return (ZFS_FRAG_INVALID); 775 776 fragmentation /= valid_ms; 777 ASSERT3U(fragmentation, <=, 100); 778 return (fragmentation); 779 } 780 781 /* 782 * Determine if a given metaslab group should skip allocations. A metaslab 783 * group should avoid allocations if its free capacity is less than the 784 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 785 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 786 * that can still handle allocations. If the allocation throttle is enabled 787 * then we skip allocations to devices that have reached their maximum 788 * allocation queue depth unless the selected metaslab group is the only 789 * eligible group remaining. 790 */ 791 static boolean_t 792 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 793 uint64_t psize) 794 { 795 spa_t *spa = mg->mg_vd->vdev_spa; 796 metaslab_class_t *mc = mg->mg_class; 797 798 /* 799 * We can only consider skipping this metaslab group if it's 800 * in the normal metaslab class and there are other metaslab 801 * groups to select from. Otherwise, we always consider it eligible 802 * for allocations. 803 */ 804 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) 805 return (B_TRUE); 806 807 /* 808 * If the metaslab group's mg_allocatable flag is set (see comments 809 * in metaslab_group_alloc_update() for more information) and 810 * the allocation throttle is disabled then allow allocations to this 811 * device. However, if the allocation throttle is enabled then 812 * check if we have reached our allocation limit (mg_alloc_queue_depth) 813 * to determine if we should allow allocations to this metaslab group. 814 * If all metaslab groups are no longer considered allocatable 815 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 816 * gang block size then we allow allocations on this metaslab group 817 * regardless of the mg_allocatable or throttle settings. 818 */ 819 if (mg->mg_allocatable) { 820 metaslab_group_t *mgp; 821 int64_t qdepth; 822 uint64_t qmax = mg->mg_max_alloc_queue_depth; 823 824 if (!mc->mc_alloc_throttle_enabled) 825 return (B_TRUE); 826 827 /* 828 * If this metaslab group does not have any free space, then 829 * there is no point in looking further. 830 */ 831 if (mg->mg_no_free_space) 832 return (B_FALSE); 833 834 qdepth = refcount_count(&mg->mg_alloc_queue_depth); 835 836 /* 837 * If this metaslab group is below its qmax or it's 838 * the only allocatable metaslab group, then attempt 839 * to allocate from it.
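 *
 * For example (hypothetical numbers): with a qmax of 128, a group with 96
 * queued allocations is used directly, while a group with 140 queued
 * allocations is skipped only if some later group in the rotor is below its
 * own qmax and still has free space.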
840 */ 841 if (qdepth < qmax || mc->mc_alloc_groups == 1) 842 return (B_TRUE); 843 ASSERT3U(mc->mc_alloc_groups, >, 1); 844 845 /* 846 * Since this metaslab group is at or over its qmax, we 847 * need to determine if there are metaslab groups after this 848 * one that might be able to handle this allocation. This is 849 * racy since we can't hold the locks for all metaslab 850 * groups at the same time when we make this check. 851 */ 852 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { 853 qmax = mgp->mg_max_alloc_queue_depth; 854 855 qdepth = refcount_count(&mgp->mg_alloc_queue_depth); 856 857 /* 858 * If there is another metaslab group that 859 * might be able to handle the allocation, then 860 * we return false so that we skip this group. 861 */ 862 if (qdepth < qmax && !mgp->mg_no_free_space) 863 return (B_FALSE); 864 } 865 866 /* 867 * We didn't find another group to handle the allocation 868 * so we can't skip this metaslab group even though 869 * we are at or over our qmax. 870 */ 871 return (B_TRUE); 872 873 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 874 return (B_TRUE); 875 } 876 return (B_FALSE); 877 } 878 879 /* 880 * ========================================================================== 881 * Range tree callbacks 882 * ========================================================================== 883 */ 884 885 /* 886 * Comparison function for the private size-ordered tree. Tree is sorted 887 * by size, larger sizes at the end of the tree. 888 */ 889 static int 890 metaslab_rangesize_compare(const void *x1, const void *x2) 891 { 892 const range_seg_t *r1 = x1; 893 const range_seg_t *r2 = x2; 894 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 895 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 896 897 if (rs_size1 < rs_size2) 898 return (-1); 899 if (rs_size1 > rs_size2) 900 return (1); 901 902 if (r1->rs_start < r2->rs_start) 903 return (-1); 904 905 if (r1->rs_start > r2->rs_start) 906 return (1); 907 908 return (0); 909 } 910 911 /* 912 * Create any block allocator specific components. The current allocators 913 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 914 */ 915 static void 916 metaslab_rt_create(range_tree_t *rt, void *arg) 917 { 918 metaslab_t *msp = arg; 919 920 ASSERT3P(rt->rt_arg, ==, msp); 921 ASSERT(msp->ms_tree == NULL); 922 923 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 924 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 925 } 926 927 /* 928 * Destroy the block allocator specific components. 
929 */ 930 static void 931 metaslab_rt_destroy(range_tree_t *rt, void *arg) 932 { 933 metaslab_t *msp = arg; 934 935 ASSERT3P(rt->rt_arg, ==, msp); 936 ASSERT3P(msp->ms_tree, ==, rt); 937 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 938 939 avl_destroy(&msp->ms_size_tree); 940 } 941 942 static void 943 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 944 { 945 metaslab_t *msp = arg; 946 947 ASSERT3P(rt->rt_arg, ==, msp); 948 ASSERT3P(msp->ms_tree, ==, rt); 949 VERIFY(!msp->ms_condensing); 950 avl_add(&msp->ms_size_tree, rs); 951 } 952 953 static void 954 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 955 { 956 metaslab_t *msp = arg; 957 958 ASSERT3P(rt->rt_arg, ==, msp); 959 ASSERT3P(msp->ms_tree, ==, rt); 960 VERIFY(!msp->ms_condensing); 961 avl_remove(&msp->ms_size_tree, rs); 962 } 963 964 static void 965 metaslab_rt_vacate(range_tree_t *rt, void *arg) 966 { 967 metaslab_t *msp = arg; 968 969 ASSERT3P(rt->rt_arg, ==, msp); 970 ASSERT3P(msp->ms_tree, ==, rt); 971 972 /* 973 * Normally one would walk the tree freeing nodes along the way. 974 * Since the nodes are shared with the range trees we can avoid 975 * walking all nodes and just reinitialize the avl tree. The nodes 976 * will be freed by the range tree, so we don't want to free them here. 977 */ 978 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 979 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 980 } 981 982 static range_tree_ops_t metaslab_rt_ops = { 983 metaslab_rt_create, 984 metaslab_rt_destroy, 985 metaslab_rt_add, 986 metaslab_rt_remove, 987 metaslab_rt_vacate 988 }; 989 990 /* 991 * ========================================================================== 992 * Metaslab block operations 993 * ========================================================================== 994 */ 995 996 /* 997 * Return the maximum contiguous segment within the metaslab. 998 */ 999 uint64_t 1000 metaslab_block_maxsize(metaslab_t *msp) 1001 { 1002 avl_tree_t *t = &msp->ms_size_tree; 1003 range_seg_t *rs; 1004 1005 if (t == NULL || (rs = avl_last(t)) == NULL) 1006 return (0ULL); 1007 1008 return (rs->rs_end - rs->rs_start); 1009 } 1010 1011 uint64_t 1012 metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1013 { 1014 uint64_t start; 1015 range_tree_t *rt = msp->ms_tree; 1016 1017 VERIFY(!msp->ms_condensing); 1018 1019 start = msp->ms_ops->msop_alloc(msp, size); 1020 if (start != -1ULL) { 1021 vdev_t *vd = msp->ms_group->mg_vd; 1022 1023 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1024 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1025 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1026 range_tree_remove(rt, start, size); 1027 } 1028 return (start); 1029 } 1030 1031 /* 1032 * ========================================================================== 1033 * Common allocator routines 1034 * ========================================================================== 1035 */ 1036 1037 /* 1038 * This is a helper function that can be used by the allocator to find 1039 * a suitable block to allocate. This will search the specified AVL 1040 * tree looking for a block that matches the specified criteria. 
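 *
 * Illustrative usage sketch (hypothetical caller and values, not part of
 * this file):
 *
 *	uint64_t cursor = 0;
 *	uint64_t off = metaslab_block_picker(&msp->ms_tree->rt_root,
 *	    &cursor, 8192, 8192);	/. 8K request, 8K alignment ./
 *
 * The search starts at *cursor, walks forward through the offset-ordered
 * segments until one can hold an aligned allocation of the requested size,
 * and advances the cursor past the allocation; if the end of the tree is
 * reached the cursor is reset to 0 and the search is retried once before
 * returning -1ULL.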
1041 */ 1042 static uint64_t 1043 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1044 uint64_t align) 1045 { 1046 range_seg_t *rs, rsearch; 1047 avl_index_t where; 1048 1049 rsearch.rs_start = *cursor; 1050 rsearch.rs_end = *cursor + size; 1051 1052 rs = avl_find(t, &rsearch, &where); 1053 if (rs == NULL) 1054 rs = avl_nearest(t, where, AVL_AFTER); 1055 1056 while (rs != NULL) { 1057 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1058 1059 if (offset + size <= rs->rs_end) { 1060 *cursor = offset + size; 1061 return (offset); 1062 } 1063 rs = AVL_NEXT(t, rs); 1064 } 1065 1066 /* 1067 * If we know we've searched the whole map (*cursor == 0), give up. 1068 * Otherwise, reset the cursor to the beginning and try again. 1069 */ 1070 if (*cursor == 0) 1071 return (-1ULL); 1072 1073 *cursor = 0; 1074 return (metaslab_block_picker(t, cursor, size, align)); 1075 } 1076 1077 /* 1078 * ========================================================================== 1079 * The first-fit block allocator 1080 * ========================================================================== 1081 */ 1082 static uint64_t 1083 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1084 { 1085 /* 1086 * Find the largest power of 2 block size that evenly divides the 1087 * requested size. This is used to try to allocate blocks with similar 1088 * alignment from the same area of the metaslab (i.e. same cursor 1089 * bucket) but it does not guarantee that other allocations sizes 1090 * may exist in the same region. 1091 */ 1092 uint64_t align = size & -size; 1093 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1094 avl_tree_t *t = &msp->ms_tree->rt_root; 1095 1096 return (metaslab_block_picker(t, cursor, size, align)); 1097 } 1098 1099 static metaslab_ops_t metaslab_ff_ops = { 1100 metaslab_ff_alloc 1101 }; 1102 1103 /* 1104 * ========================================================================== 1105 * Dynamic block allocator - 1106 * Uses the first fit allocation scheme until space get low and then 1107 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1108 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1109 * ========================================================================== 1110 */ 1111 static uint64_t 1112 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1113 { 1114 /* 1115 * Find the largest power of 2 block size that evenly divides the 1116 * requested size. This is used to try to allocate blocks with similar 1117 * alignment from the same area of the metaslab (i.e. same cursor 1118 * bucket) but it does not guarantee that other allocations sizes 1119 * may exist in the same region. 1120 */ 1121 uint64_t align = size & -size; 1122 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1123 range_tree_t *rt = msp->ms_tree; 1124 avl_tree_t *t = &rt->rt_root; 1125 uint64_t max_size = metaslab_block_maxsize(msp); 1126 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1127 1128 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1129 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1130 1131 if (max_size < size) 1132 return (-1ULL); 1133 1134 /* 1135 * If we're running low on space switch to using the size 1136 * sorted AVL tree (best-fit). 
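 * With the default tunables this switch happens once the largest free
 * segment is smaller than metaslab_df_alloc_threshold (SPA_OLD_MAXBLOCKSIZE,
 * i.e. 128K) or less than metaslab_df_free_pct (4%) of the metaslab is free.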
1137 */ 1138 if (max_size < metaslab_df_alloc_threshold || 1139 free_pct < metaslab_df_free_pct) { 1140 t = &msp->ms_size_tree; 1141 *cursor = 0; 1142 } 1143 1144 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1145 } 1146 1147 static metaslab_ops_t metaslab_df_ops = { 1148 metaslab_df_alloc 1149 }; 1150 1151 /* 1152 * ========================================================================== 1153 * Cursor fit block allocator - 1154 * Select the largest region in the metaslab, set the cursor to the beginning 1155 * of the range and the cursor_end to the end of the range. As allocations 1156 * are made advance the cursor. Continue allocating from the cursor until 1157 * the range is exhausted and then find a new range. 1158 * ========================================================================== 1159 */ 1160 static uint64_t 1161 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1162 { 1163 range_tree_t *rt = msp->ms_tree; 1164 avl_tree_t *t = &msp->ms_size_tree; 1165 uint64_t *cursor = &msp->ms_lbas[0]; 1166 uint64_t *cursor_end = &msp->ms_lbas[1]; 1167 uint64_t offset = 0; 1168 1169 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1170 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1171 1172 ASSERT3U(*cursor_end, >=, *cursor); 1173 1174 if ((*cursor + size) > *cursor_end) { 1175 range_seg_t *rs; 1176 1177 rs = avl_last(&msp->ms_size_tree); 1178 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1179 return (-1ULL); 1180 1181 *cursor = rs->rs_start; 1182 *cursor_end = rs->rs_end; 1183 } 1184 1185 offset = *cursor; 1186 *cursor += size; 1187 1188 return (offset); 1189 } 1190 1191 static metaslab_ops_t metaslab_cf_ops = { 1192 metaslab_cf_alloc 1193 }; 1194 1195 /* 1196 * ========================================================================== 1197 * New dynamic fit allocator - 1198 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1199 * contiguous blocks. If no region is found then just use the largest segment 1200 * that remains. 1201 * ========================================================================== 1202 */ 1203 1204 /* 1205 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1206 * to request from the allocator. 
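 * For example (illustrative): with the default shift of 4 set below, an
 * allocation of size 2^n first looks for a free region of up to
 * MIN(max_size, 1ULL << (highbit64(size) + metaslab_ndf_clump_shift)) bytes,
 * i.e. room for on the order of 2^4 further allocations of the same size,
 * before falling back to the largest remaining segment.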
1207 */ 1208 uint64_t metaslab_ndf_clump_shift = 4; 1209 1210 static uint64_t 1211 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1212 { 1213 avl_tree_t *t = &msp->ms_tree->rt_root; 1214 avl_index_t where; 1215 range_seg_t *rs, rsearch; 1216 uint64_t hbit = highbit64(size); 1217 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1218 uint64_t max_size = metaslab_block_maxsize(msp); 1219 1220 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1221 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1222 1223 if (max_size < size) 1224 return (-1ULL); 1225 1226 rsearch.rs_start = *cursor; 1227 rsearch.rs_end = *cursor + size; 1228 1229 rs = avl_find(t, &rsearch, &where); 1230 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1231 t = &msp->ms_size_tree; 1232 1233 rsearch.rs_start = 0; 1234 rsearch.rs_end = MIN(max_size, 1235 1ULL << (hbit + metaslab_ndf_clump_shift)); 1236 rs = avl_find(t, &rsearch, &where); 1237 if (rs == NULL) 1238 rs = avl_nearest(t, where, AVL_AFTER); 1239 ASSERT(rs != NULL); 1240 } 1241 1242 if ((rs->rs_end - rs->rs_start) >= size) { 1243 *cursor = rs->rs_start + size; 1244 return (rs->rs_start); 1245 } 1246 return (-1ULL); 1247 } 1248 1249 static metaslab_ops_t metaslab_ndf_ops = { 1250 metaslab_ndf_alloc 1251 }; 1252 1253 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1254 1255 /* 1256 * ========================================================================== 1257 * Metaslabs 1258 * ========================================================================== 1259 */ 1260 1261 /* 1262 * Wait for any in-progress metaslab loads to complete. 1263 */ 1264 void 1265 metaslab_load_wait(metaslab_t *msp) 1266 { 1267 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1268 1269 while (msp->ms_loading) { 1270 ASSERT(!msp->ms_loaded); 1271 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1272 } 1273 } 1274 1275 int 1276 metaslab_load(metaslab_t *msp) 1277 { 1278 int error = 0; 1279 1280 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1281 ASSERT(!msp->ms_loaded); 1282 ASSERT(!msp->ms_loading); 1283 1284 msp->ms_loading = B_TRUE; 1285 1286 /* 1287 * If the space map has not been allocated yet, then treat 1288 * all the space in the metaslab as free and add it to the 1289 * ms_tree. 1290 */ 1291 if (msp->ms_sm != NULL) 1292 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1293 else 1294 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1295 1296 msp->ms_loaded = (error == 0); 1297 msp->ms_loading = B_FALSE; 1298 1299 if (msp->ms_loaded) { 1300 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1301 range_tree_walk(msp->ms_defertree[t], 1302 range_tree_remove, msp->ms_tree); 1303 } 1304 } 1305 cv_broadcast(&msp->ms_load_cv); 1306 return (error); 1307 } 1308 1309 void 1310 metaslab_unload(metaslab_t *msp) 1311 { 1312 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1313 range_tree_vacate(msp->ms_tree, NULL, NULL); 1314 msp->ms_loaded = B_FALSE; 1315 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1316 } 1317 1318 int 1319 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1320 metaslab_t **msp) 1321 { 1322 vdev_t *vd = mg->mg_vd; 1323 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1324 metaslab_t *ms; 1325 int error; 1326 1327 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1328 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1329 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1330 ms->ms_id = id; 1331 ms->ms_start = id << vd->vdev_ms_shift; 1332 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1333 1334 /* 1335 * We only open space map objects that already exist. 
All others 1336 * will be opened when we finally allocate an object for them. 1337 */ 1338 if (object != 0) { 1339 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1340 ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1341 1342 if (error != 0) { 1343 kmem_free(ms, sizeof (metaslab_t)); 1344 return (error); 1345 } 1346 1347 ASSERT(ms->ms_sm != NULL); 1348 } 1349 1350 /* 1351 * We create the main range tree here, but we don't create the 1352 * alloctree and freetree until metaslab_sync_done(). This serves 1353 * two purposes: it allows metaslab_sync_done() to detect the 1354 * addition of new space; and for debugging, it ensures that we'd 1355 * take a data fault on any attempt to use this metaslab before it's ready. 1356 */ 1357 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1358 metaslab_group_add(mg, ms); 1359 1360 ms->ms_fragmentation = metaslab_fragmentation(ms); 1361 ms->ms_ops = mg->mg_class->mc_ops; 1362 1363 /* 1364 * If we're opening an existing pool (txg == 0) or creating 1365 * a new one (txg == TXG_INITIAL), all space is available now. 1366 * If we're adding space to an existing pool, the new space 1367 * does not become available until after this txg has synced. 1368 */ 1369 if (txg <= TXG_INITIAL) 1370 metaslab_sync_done(ms, 0); 1371 1372 /* 1373 * If metaslab_debug_load is set and we're initializing a metaslab 1374 * that has an allocated space_map object then load its space 1375 * map so that we can verify frees. 1376 */ 1377 if (metaslab_debug_load && ms->ms_sm != NULL) { 1378 mutex_enter(&ms->ms_lock); 1379 VERIFY0(metaslab_load(ms)); 1380 mutex_exit(&ms->ms_lock); 1381 } 1382 1383 if (txg != 0) { 1384 vdev_dirty(vd, 0, NULL, txg); 1385 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1386 } 1387 1388 *msp = ms; 1389 1390 return (0); 1391 } 1392 1393 void 1394 metaslab_fini(metaslab_t *msp) 1395 { 1396 metaslab_group_t *mg = msp->ms_group; 1397 1398 metaslab_group_remove(mg, msp); 1399 1400 mutex_enter(&msp->ms_lock); 1401 1402 VERIFY(msp->ms_group == NULL); 1403 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1404 0, -msp->ms_size); 1405 space_map_close(msp->ms_sm); 1406 1407 metaslab_unload(msp); 1408 range_tree_destroy(msp->ms_tree); 1409 1410 for (int t = 0; t < TXG_SIZE; t++) { 1411 range_tree_destroy(msp->ms_alloctree[t]); 1412 range_tree_destroy(msp->ms_freetree[t]); 1413 } 1414 1415 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1416 range_tree_destroy(msp->ms_defertree[t]); 1417 } 1418 1419 ASSERT0(msp->ms_deferspace); 1420 1421 mutex_exit(&msp->ms_lock); 1422 cv_destroy(&msp->ms_load_cv); 1423 mutex_destroy(&msp->ms_lock); 1424 1425 kmem_free(msp, sizeof (metaslab_t)); 1426 } 1427 1428 #define FRAGMENTATION_TABLE_SIZE 17 1429 1430 /* 1431 * This table defines a segment size based fragmentation metric that will 1432 * allow each metaslab to derive its own fragmentation value. This is done 1433 * by calculating the space in each bucket of the spacemap histogram and 1434 * multiplying that by the fragmentation metric in this table. Doing 1435 * this for all buckets and dividing it by the total amount of free 1436 * space in this metaslab (i.e. the total free space in all buckets) gives 1437 * us the fragmentation metric. This means that a high fragmentation metric 1438 * equates to most of the free space being comprised of small segments. 1439 * Conversely, if the metric is low, then most of the free space is in 1440 * large segments. A 10% change in fragmentation equates to approximately 1441 * double the number of segments.
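 *
 * As a worked example (figures chosen for illustration): if half of a
 * metaslab's free space sits in 512K segments (factor 30 in the table below)
 * and the other half in 8K segments (factor 90), the space-weighted metric
 * is (50 * 30 + 50 * 90) / 100 = 60% fragmented.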
1442 * 1443 * This table defines 0% fragmented space using 16MB segments. Testing has 1444 * shown that segments that are greater than or equal to 16MB do not suffer 1445 * from drastic performance problems. Using this value, we derive the rest 1446 * of the table. Since the fragmentation value is never stored on disk, it 1447 * is possible to change these calculations in the future. 1448 */ 1449 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1450 100, /* 512B */ 1451 100, /* 1K */ 1452 98, /* 2K */ 1453 95, /* 4K */ 1454 90, /* 8K */ 1455 80, /* 16K */ 1456 70, /* 32K */ 1457 60, /* 64K */ 1458 50, /* 128K */ 1459 40, /* 256K */ 1460 30, /* 512K */ 1461 20, /* 1M */ 1462 15, /* 2M */ 1463 10, /* 4M */ 1464 5, /* 8M */ 1465 0 /* 16M */ 1466 }; 1467 1468 /* 1469 * Calculate the metaslab's fragmentation metric. A return value 1470 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1471 * not support this metric. Otherwise, the return value should be in the 1472 * range [0, 100]. 1473 */ 1474 static uint64_t 1475 metaslab_fragmentation(metaslab_t *msp) 1476 { 1477 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1478 uint64_t fragmentation = 0; 1479 uint64_t total = 0; 1480 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1481 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1482 1483 if (!feature_enabled) 1484 return (ZFS_FRAG_INVALID); 1485 1486 /* 1487 * A null space map means that the entire metaslab is free 1488 * and thus is not fragmented. 1489 */ 1490 if (msp->ms_sm == NULL) 1491 return (0); 1492 1493 /* 1494 * If this metaslab's space_map has not been upgraded, flag it 1495 * so that we upgrade next time we encounter it. 1496 */ 1497 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1498 uint64_t txg = spa_syncing_txg(spa); 1499 vdev_t *vd = msp->ms_group->mg_vd; 1500 1501 if (spa_writeable(spa)) { 1502 msp->ms_condense_wanted = B_TRUE; 1503 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1504 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1505 "msp %p, vd %p", txg, msp, vd); 1506 } 1507 return (ZFS_FRAG_INVALID); 1508 } 1509 1510 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1511 uint64_t space = 0; 1512 uint8_t shift = msp->ms_sm->sm_shift; 1513 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1514 FRAGMENTATION_TABLE_SIZE - 1); 1515 1516 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1517 continue; 1518 1519 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1520 total += space; 1521 1522 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1523 fragmentation += space * zfs_frag_table[idx]; 1524 } 1525 1526 if (total > 0) 1527 fragmentation /= total; 1528 ASSERT3U(fragmentation, <=, 100); 1529 return (fragmentation); 1530 } 1531 1532 /* 1533 * Compute a weight -- a selection preference value -- for the given metaslab. 1534 * This is based on the amount of free space, the level of fragmentation, 1535 * the LBA range, and whether the metaslab is loaded. 1536 */ 1537 static uint64_t 1538 metaslab_weight(metaslab_t *msp) 1539 { 1540 metaslab_group_t *mg = msp->ms_group; 1541 vdev_t *vd = mg->mg_vd; 1542 uint64_t weight, space; 1543 1544 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1545 1546 /* 1547 * This vdev is in the process of being removed so there is nothing 1548 * for us to do here. 1549 */ 1550 if (vd->vdev_removing) { 1551 ASSERT0(space_map_allocated(msp->ms_sm)); 1552 ASSERT0(vd->vdev_ms_shift); 1553 return (0); 1554 } 1555 1556 /* 1557 * The baseline weight is the metaslab's free space.
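 * (Purely illustrative: an 8GB metaslab with 2GB allocated starts from a
 * weight of 6GB, which the fragmentation and LBA adjustments below then
 * scale.)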
1558 */ 1559 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1560 1561 msp->ms_fragmentation = metaslab_fragmentation(msp); 1562 if (metaslab_fragmentation_factor_enabled && 1563 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1564 /* 1565 * Use the fragmentation information to inversely scale 1566 * down the baseline weight. We need to ensure that we 1567 * don't exclude this metaslab completely when it's 100% 1568 * fragmented. To avoid this we reduce the fragmentation value 1569 * by 1. 1570 */ 1571 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1572 1573 /* 1574 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1575 * this metaslab again. The fragmentation metric may have 1576 * decreased the space to something smaller than 1577 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1578 * so that we can consume any remaining space. 1579 */ 1580 if (space > 0 && space < SPA_MINBLOCKSIZE) 1581 space = SPA_MINBLOCKSIZE; 1582 } 1583 weight = space; 1584 1585 /* 1586 * Modern disks have uniform bit density and constant angular velocity. 1587 * Therefore, the outer recording zones are faster (higher bandwidth) 1588 * than the inner zones by the ratio of outer to inner track diameter, 1589 * which is typically around 2:1. We account for this by assigning 1590 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1591 * In effect, this means that we'll select the metaslab with the most 1592 * free bandwidth rather than simply the one with the most free space. 1593 */ 1594 if (metaslab_lba_weighting_enabled) { 1595 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1596 ASSERT(weight >= space && weight <= 2 * space); 1597 } 1598 1599 /* 1600 * If this metaslab is one we're actively using, adjust its 1601 * weight to make it preferable to any inactive metaslab so 1602 * we'll polish it off. If the fragmentation on this metaslab 1603 * has exceeded our threshold, then don't mark it active. 1604 */ 1605 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1606 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1607 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1608 } 1609 1610 return (weight); 1611 } 1612 1613 static int 1614 metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1615 { 1616 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1617 1618 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1619 metaslab_load_wait(msp); 1620 if (!msp->ms_loaded) { 1621 int error = metaslab_load(msp); 1622 if (error) { 1623 metaslab_group_sort(msp->ms_group, msp, 0); 1624 return (error); 1625 } 1626 } 1627 1628 metaslab_group_sort(msp->ms_group, msp, 1629 msp->ms_weight | activation_weight); 1630 } 1631 ASSERT(msp->ms_loaded); 1632 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1633 1634 return (0); 1635 } 1636 1637 static void 1638 metaslab_passivate(metaslab_t *msp, uint64_t size) 1639 { 1640 /* 1641 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1642 * this metaslab again. In that case, it had better be empty, 1643 * or we would be leaving space on the table.
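 * Sorting with MIN(ms_weight, size) also clears METASLAB_WEIGHT_PRIMARY and
 * METASLAB_WEIGHT_SECONDARY, since any valid segment size is far smaller
 * than those bits, which is what actually deactivates the metaslab.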
1644 */ 1645 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1646 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1647 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1648 } 1649 1650 static void 1651 metaslab_preload(void *arg) 1652 { 1653 metaslab_t *msp = arg; 1654 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1655 1656 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1657 1658 mutex_enter(&msp->ms_lock); 1659 metaslab_load_wait(msp); 1660 if (!msp->ms_loaded) 1661 (void) metaslab_load(msp); 1662 1663 /* 1664 * Set the ms_access_txg value so that we don't unload it right away. 1665 */ 1666 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1667 mutex_exit(&msp->ms_lock); 1668 } 1669 1670 static void 1671 metaslab_group_preload(metaslab_group_t *mg) 1672 { 1673 spa_t *spa = mg->mg_vd->vdev_spa; 1674 metaslab_t *msp; 1675 avl_tree_t *t = &mg->mg_metaslab_tree; 1676 int m = 0; 1677 1678 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1679 taskq_wait(mg->mg_taskq); 1680 return; 1681 } 1682 1683 mutex_enter(&mg->mg_lock); 1684 /* 1685 * Load the next potential metaslabs 1686 */ 1687 msp = avl_first(t); 1688 while (msp != NULL) { 1689 metaslab_t *msp_next = AVL_NEXT(t, msp); 1690 1691 /* 1692 * We preload only the maximum number of metaslabs specified 1693 * by metaslab_preload_limit. If a metaslab is being forced 1694 * to condense then we preload it too. This will ensure 1695 * that force condensing happens in the next txg. 1696 */ 1697 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1698 msp = msp_next; 1699 continue; 1700 } 1701 1702 /* 1703 * We must drop the metaslab group lock here to preserve 1704 * lock ordering with the ms_lock (when grabbing both 1705 * the mg_lock and the ms_lock, the ms_lock must be taken 1706 * first). As a result, it is possible that the ordering 1707 * of the metaslabs within the avl tree may change before 1708 * we reacquire the lock. The metaslab cannot be removed from 1709 * the tree while we're in syncing context so it is safe to 1710 * drop the mg_lock here. If the metaslabs are reordered 1711 * nothing will break -- we just may end up loading a 1712 * less than optimal one. 1713 */ 1714 mutex_exit(&mg->mg_lock); 1715 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1716 msp, TQ_SLEEP) != NULL); 1717 mutex_enter(&mg->mg_lock); 1718 msp = msp_next; 1719 } 1720 mutex_exit(&mg->mg_lock); 1721 } 1722 1723 /* 1724 * Determine if the space map's on-disk footprint is past our tolerance 1725 * for inefficiency. We would like to use the following criteria to make 1726 * our decision: 1727 * 1728 * 1. The size of the space map object should not dramatically increase as a 1729 * result of writing out the free space range tree. 1730 * 1731 * 2. The on-disk space map representation should be at least zfs_condense_pct/100 1732 * times the size of its minimal form (i.e. zfs_condense_pct = 110 and a 1733 * minimal form of 1MB means the on-disk map must be at least 1.1MB). 1734 * 1735 * 3. The on-disk size of the space map should actually decrease. 1736 * 1737 * Checking the first condition is tricky since we don't want to walk 1738 * the entire AVL tree calculating the estimated on-disk size. Instead we 1739 * use the size-ordered range tree in the metaslab and calculate the 1740 * size required to write out the largest segment in our free tree. If the 1741 * size required to represent that segment on disk is larger than the space 1742 * map object then we avoid condensing this map.
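 *
 * For example (illustrative, and assuming the traditional space map entry
 * format in which one 64-bit entry describes at most SM_RUN_MAX blocks): a
 * largest free segment of 64MB on a space map with an sm_shift of 9 covers
 * 131072 blocks and so needs only a handful of entries, a few dozen bytes in
 * all, so this condition only vetoes condensing when the existing space map
 * object is smaller still.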
1743 * 1744 * To determine the second criterion we use a best-case estimate and assume 1745 * each segment can be represented on-disk as a single 64-bit entry. We refer 1746 * to this best-case estimate as the space map's minimal form. 1747 * 1748 * Unfortunately, we cannot compute the on-disk size of the space map in this 1749 * context because we cannot accurately compute the effects of compression, etc. 1750 * Instead, we apply the heuristic described in the block comment for 1751 * zfs_metaslab_condense_block_threshold - we only condense if the space used 1752 * is greater than a threshold number of blocks. 1753 */ 1754 static boolean_t 1755 metaslab_should_condense(metaslab_t *msp) 1756 { 1757 space_map_t *sm = msp->ms_sm; 1758 range_seg_t *rs; 1759 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1760 dmu_object_info_t doi; 1761 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1762 1763 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1764 ASSERT(msp->ms_loaded); 1765 1766 /* 1767 * Use the ms_size_tree range tree, which is ordered by size, to 1768 * obtain the largest segment in the free tree. We always condense 1769 * metaslabs that are empty and metaslabs for which a condense 1770 * request has been made. 1771 */ 1772 rs = avl_last(&msp->ms_size_tree); 1773 if (rs == NULL || msp->ms_condense_wanted) 1774 return (B_TRUE); 1775 1776 /* 1777 * Calculate the number of 64-bit entries this segment would 1778 * require when written to disk. If this single segment would be 1779 * larger on-disk than the entire current on-disk structure, then 1780 * clearly condensing will increase the on-disk structure size. 1781 */ 1782 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1783 entries = size / (MIN(size, SM_RUN_MAX)); 1784 segsz = entries * sizeof (uint64_t); 1785 1786 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1787 object_size = space_map_length(msp->ms_sm); 1788 1789 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1790 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1791 1792 return (segsz <= object_size && 1793 object_size >= (optimal_size * zfs_condense_pct / 100) && 1794 object_size > zfs_metaslab_condense_block_threshold * record_size); 1795 } 1796 1797 /* 1798 * Condense the on-disk space map representation to its minimized form. 1799 * The minimized form consists of a small number of allocations followed by 1800 * the entries of the free range tree. 1801 */ 1802 static void 1803 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1804 { 1805 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1806 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1807 range_tree_t *condense_tree; 1808 space_map_t *sm = msp->ms_sm; 1809 1810 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1811 ASSERT3U(spa_sync_pass(spa), ==, 1); 1812 ASSERT(msp->ms_loaded); 1813 1814 1815 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 1816 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 1817 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 1818 msp->ms_group->mg_vd->vdev_spa->spa_name, 1819 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), 1820 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1821 1822 msp->ms_condense_wanted = B_FALSE; 1823 1824 /* 1825 * Create an range tree that is 100% allocated. We remove segments 1826 * that have been freed in this txg, any deferred frees that exist, 1827 * and any allocation in the future. 
Removing segments should be 1828 * a relatively inexpensive operation since we expect these trees to 1829 * have a small number of nodes. 1830 */ 1831 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1832 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1833 1834 /* 1835 * Remove what's been freed in this txg from the condense_tree. 1836 * Since we're in sync_pass 1, we know that all the frees from 1837 * this txg are in the freetree. 1838 */ 1839 range_tree_walk(freetree, range_tree_remove, condense_tree); 1840 1841 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1842 range_tree_walk(msp->ms_defertree[t], 1843 range_tree_remove, condense_tree); 1844 } 1845 1846 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1847 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1848 range_tree_remove, condense_tree); 1849 } 1850 1851 /* 1852 * We're about to drop the metaslab's lock thus allowing 1853 * other consumers to change its content. Set the 1854 * metaslab's ms_condensing flag to ensure that 1855 * allocations on this metaslab do not occur while we're 1856 * in the middle of committing it to disk. This is only critical 1857 * for the ms_tree as all other range trees use per txg 1858 * views of their content. 1859 */ 1860 msp->ms_condensing = B_TRUE; 1861 1862 mutex_exit(&msp->ms_lock); 1863 space_map_truncate(sm, tx); 1864 mutex_enter(&msp->ms_lock); 1865 1866 /* 1867 * While we would ideally like to create a space_map representation 1868 * that consists only of allocation records, doing so can be 1869 * prohibitively expensive because the in-core free tree can be 1870 * large, and therefore computationally expensive to subtract 1871 * from the condense_tree. Instead we sync out two trees, a cheap 1872 * allocation only tree followed by the in-core free tree. While not 1873 * optimal, this is typically close to optimal, and much cheaper to 1874 * compute. 1875 */ 1876 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1877 range_tree_vacate(condense_tree, NULL, NULL); 1878 range_tree_destroy(condense_tree); 1879 1880 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1881 msp->ms_condensing = B_FALSE; 1882 } 1883 1884 /* 1885 * Write a metaslab to disk in the context of the specified transaction group. 1886 */ 1887 void 1888 metaslab_sync(metaslab_t *msp, uint64_t txg) 1889 { 1890 metaslab_group_t *mg = msp->ms_group; 1891 vdev_t *vd = mg->mg_vd; 1892 spa_t *spa = vd->vdev_spa; 1893 objset_t *mos = spa_meta_objset(spa); 1894 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1895 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1896 range_tree_t **freed_tree = 1897 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1898 dmu_tx_t *tx; 1899 uint64_t object = space_map_object(msp->ms_sm); 1900 1901 ASSERT(!vd->vdev_ishole); 1902 1903 /* 1904 * This metaslab has just been added so there's no work to do now. 1905 */ 1906 if (*freetree == NULL) { 1907 ASSERT3P(alloctree, ==, NULL); 1908 return; 1909 } 1910 1911 ASSERT3P(alloctree, !=, NULL); 1912 ASSERT3P(*freetree, !=, NULL); 1913 ASSERT3P(*freed_tree, !=, NULL); 1914 1915 /* 1916 * Normally, we don't want to process a metaslab if there 1917 * are no allocations or frees to perform. However, if the metaslab 1918 * is being forced to condense we need to let it through.
1919 */ 1920 if (range_tree_space(alloctree) == 0 && 1921 range_tree_space(*freetree) == 0 && 1922 !msp->ms_condense_wanted) 1923 return; 1924 1925 /* 1926 * The only state that can actually be changing concurrently with 1927 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1928 * be modifying this txg's alloctree, freetree, freed_tree, or 1929 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy 1930 * space_map ASSERTs. We drop it whenever we call into the DMU, 1931 * because the DMU can call down to us (e.g. via zio_free()) at 1932 * any time. 1933 */ 1934 1935 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1936 1937 if (msp->ms_sm == NULL) { 1938 uint64_t new_object; 1939 1940 new_object = space_map_alloc(mos, tx); 1941 VERIFY3U(new_object, !=, 0); 1942 1943 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1944 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1945 &msp->ms_lock)); 1946 ASSERT(msp->ms_sm != NULL); 1947 } 1948 1949 mutex_enter(&msp->ms_lock); 1950 1951 /* 1952 * Note: metaslab_condense() clears the space_map's histogram. 1953 * Therefore we must verify and remove this histogram before 1954 * condensing. 1955 */ 1956 metaslab_group_histogram_verify(mg); 1957 metaslab_class_histogram_verify(mg->mg_class); 1958 metaslab_group_histogram_remove(mg, msp); 1959 1960 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1961 metaslab_should_condense(msp)) { 1962 metaslab_condense(msp, txg, tx); 1963 } else { 1964 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1965 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1966 } 1967 1968 if (msp->ms_loaded) { 1969 /* 1970 * When the space map is loaded, we have an accurate 1971 * histogram in the range tree. This gives us an opportunity 1972 * to bring the space map's histogram up-to-date so we clear 1973 * it first before updating it. 1974 */ 1975 space_map_histogram_clear(msp->ms_sm); 1976 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1977 } else { 1978 /* 1979 * Since the space map is not loaded we simply update the 1980 * existing histogram with what was freed in this txg. This 1981 * means that the on-disk histogram may not have an accurate 1982 * view of the free space but it's close enough to allow 1983 * us to make allocation decisions. 1984 */ 1985 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1986 } 1987 metaslab_group_histogram_add(mg, msp); 1988 metaslab_group_histogram_verify(mg); 1989 metaslab_class_histogram_verify(mg->mg_class); 1990 1991 /* 1992 * For sync pass 1, we avoid traversing this txg's free range tree 1993 * and instead will just swap the pointers for freetree and 1994 * freed_tree. We can safely do this since the freed_tree is 1995 * guaranteed to be empty on the initial pass. 1996 */ 1997 if (spa_sync_pass(spa) == 1) { 1998 range_tree_swap(freetree, freed_tree); 1999 } else { 2000 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 2001 } 2002 range_tree_vacate(alloctree, NULL, NULL); 2003 2004 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2005 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2006 2007 mutex_exit(&msp->ms_lock); 2008 2009 if (object != space_map_object(msp->ms_sm)) { 2010 object = space_map_object(msp->ms_sm); 2011 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2012 msp->ms_id, sizeof (uint64_t), &object, tx); 2013 } 2014 dmu_tx_commit(tx); 2015 } 2016 2017 /* 2018 * Called after a transaction group has completely synced to mark 2019 * all of the metaslab's free space as usable.
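 *
 * Illustrative sketch of the defer cycle (wording added here, not taken from
 * the original comments): a block freed in txg T accumulates in
 * ms_freetree[T & TXG_MASK], is moved into ms_defertree[T % TXG_DEFER_SIZE]
 * by this function, and only re-enters ms_tree (becoming allocatable again)
 * when that defer slot comes around again, TXG_DEFER_SIZE txgs later.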
2020 */ 2021 void 2022 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2023 { 2024 metaslab_group_t *mg = msp->ms_group; 2025 vdev_t *vd = mg->mg_vd; 2026 range_tree_t **freed_tree; 2027 range_tree_t **defer_tree; 2028 int64_t alloc_delta, defer_delta; 2029 2030 ASSERT(!vd->vdev_ishole); 2031 2032 mutex_enter(&msp->ms_lock); 2033 2034 /* 2035 * If this metaslab is just becoming available, initialize its 2036 * alloctrees, freetrees, and defertree and add its capacity to 2037 * the vdev. 2038 */ 2039 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2040 for (int t = 0; t < TXG_SIZE; t++) { 2041 ASSERT(msp->ms_alloctree[t] == NULL); 2042 ASSERT(msp->ms_freetree[t] == NULL); 2043 2044 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2045 &msp->ms_lock); 2046 msp->ms_freetree[t] = range_tree_create(NULL, msp, 2047 &msp->ms_lock); 2048 } 2049 2050 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2051 ASSERT(msp->ms_defertree[t] == NULL); 2052 2053 msp->ms_defertree[t] = range_tree_create(NULL, msp, 2054 &msp->ms_lock); 2055 } 2056 2057 vdev_space_update(vd, 0, 0, msp->ms_size); 2058 } 2059 2060 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2061 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2062 2063 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2064 defer_delta = range_tree_space(*freed_tree) - 2065 range_tree_space(*defer_tree); 2066 2067 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2068 2069 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2070 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2071 2072 /* 2073 * If there's a metaslab_load() in progress, wait for it to complete 2074 * so that we have a consistent view of the in-core space map. 2075 */ 2076 metaslab_load_wait(msp); 2077 2078 /* 2079 * Move the frees from the defer_tree back to the free 2080 * range tree (if it's loaded). Swap the freed_tree and the 2081 * defer_tree -- this is safe to do because we've just emptied out 2082 * the defer_tree. 2083 */ 2084 range_tree_vacate(*defer_tree, 2085 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2086 range_tree_swap(freed_tree, defer_tree); 2087 2088 space_map_update(msp->ms_sm); 2089 2090 msp->ms_deferspace += defer_delta; 2091 ASSERT3S(msp->ms_deferspace, >=, 0); 2092 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2093 if (msp->ms_deferspace != 0) { 2094 /* 2095 * Keep syncing this metaslab until all deferred frees 2096 * are back in circulation. 
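 *
 * Illustrative example (hypothetical numbers): if 3MB of this txg's frees
 * just moved into the defer tree while only 1MB of older deferred frees
 * returned to ms_tree, then defer_delta above was +2MB, ms_deferspace is
 * still nonzero, and the metaslab is dirtied again for the following txg.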
2097 */ 2098 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2099 } 2100 2101 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2102 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2103 VERIFY0(range_tree_space( 2104 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2105 } 2106 2107 if (!metaslab_debug_unload) 2108 metaslab_unload(msp); 2109 } 2110 2111 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2112 mutex_exit(&msp->ms_lock); 2113 } 2114 2115 void 2116 metaslab_sync_reassess(metaslab_group_t *mg) 2117 { 2118 metaslab_group_alloc_update(mg); 2119 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2120 2121 /* 2122 * Preload the next potential metaslabs 2123 */ 2124 metaslab_group_preload(mg); 2125 } 2126 2127 static uint64_t 2128 metaslab_distance(metaslab_t *msp, dva_t *dva) 2129 { 2130 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2131 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2132 uint64_t start = msp->ms_id; 2133 2134 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2135 return (1ULL << 63); 2136 2137 if (offset < start) 2138 return ((start - offset) << ms_shift); 2139 if (offset > start) 2140 return ((offset - start) << ms_shift); 2141 return (0); 2142 } 2143 2144 /* 2145 * ========================================================================== 2146 * Metaslab block operations 2147 * ========================================================================== 2148 */ 2149 2150 static void 2151 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) 2152 { 2153 if (!(flags & METASLAB_ASYNC_ALLOC) || 2154 flags & METASLAB_DONT_THROTTLE) 2155 return; 2156 2157 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2158 if (!mg->mg_class->mc_alloc_throttle_enabled) 2159 return; 2160 2161 (void) refcount_add(&mg->mg_alloc_queue_depth, tag); 2162 } 2163 2164 void 2165 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) 2166 { 2167 if (!(flags & METASLAB_ASYNC_ALLOC) || 2168 flags & METASLAB_DONT_THROTTLE) 2169 return; 2170 2171 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2172 if (!mg->mg_class->mc_alloc_throttle_enabled) 2173 return; 2174 2175 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); 2176 } 2177 2178 void 2179 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) 2180 { 2181 #ifdef ZFS_DEBUG 2182 const dva_t *dva = bp->blk_dva; 2183 int ndvas = BP_GET_NDVAS(bp); 2184 2185 for (int d = 0; d < ndvas; d++) { 2186 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2187 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2188 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); 2189 } 2190 #endif 2191 } 2192 2193 static uint64_t 2194 metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, 2195 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2196 { 2197 spa_t *spa = mg->mg_vd->vdev_spa; 2198 metaslab_t *msp = NULL; 2199 uint64_t offset = -1ULL; 2200 avl_tree_t *t = &mg->mg_metaslab_tree; 2201 uint64_t activation_weight; 2202 uint64_t target_distance; 2203 int i; 2204 2205 activation_weight = METASLAB_WEIGHT_PRIMARY; 2206 for (i = 0; i < d; i++) { 2207 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2208 activation_weight = METASLAB_WEIGHT_SECONDARY; 2209 break; 2210 } 2211 } 2212 2213 for (;;) { 2214 boolean_t was_active; 2215 2216 mutex_enter(&mg->mg_lock); 2217 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2218 if (msp->ms_weight < asize) { 2219 spa_dbgmsg(spa, "%s: failed to meet weight " 2220 "requirement: vdev %llu, txg %llu, mg 
%p, " 2221 "msp %p, asize %llu, " 2222 "weight %llu", spa_name(spa), 2223 mg->mg_vd->vdev_id, txg, 2224 mg, msp, asize, msp->ms_weight); 2225 mutex_exit(&mg->mg_lock); 2226 return (-1ULL); 2227 } 2228 2229 /* 2230 * If the selected metaslab is condensing, skip it. 2231 */ 2232 if (msp->ms_condensing) 2233 continue; 2234 2235 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2236 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2237 break; 2238 2239 target_distance = min_distance + 2240 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2241 min_distance >> 1); 2242 2243 for (i = 0; i < d; i++) 2244 if (metaslab_distance(msp, &dva[i]) < 2245 target_distance) 2246 break; 2247 if (i == d) 2248 break; 2249 } 2250 mutex_exit(&mg->mg_lock); 2251 if (msp == NULL) 2252 return (-1ULL); 2253 2254 mutex_enter(&msp->ms_lock); 2255 2256 /* 2257 * Ensure that the metaslab we have selected is still 2258 * capable of handling our request. It's possible that 2259 * another thread may have changed the weight while we 2260 * were blocked on the metaslab lock. 2261 */ 2262 if (msp->ms_weight < asize || (was_active && 2263 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2264 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2265 mutex_exit(&msp->ms_lock); 2266 continue; 2267 } 2268 2269 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2270 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2271 metaslab_passivate(msp, 2272 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2273 mutex_exit(&msp->ms_lock); 2274 continue; 2275 } 2276 2277 if (metaslab_activate(msp, activation_weight) != 0) { 2278 mutex_exit(&msp->ms_lock); 2279 continue; 2280 } 2281 2282 /* 2283 * If this metaslab is currently condensing then pick again as 2284 * we can't manipulate this metaslab until it's committed 2285 * to disk. 2286 */ 2287 if (msp->ms_condensing) { 2288 mutex_exit(&msp->ms_lock); 2289 continue; 2290 } 2291 2292 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2293 break; 2294 2295 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2296 mutex_exit(&msp->ms_lock); 2297 } 2298 2299 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2300 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2301 2302 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2303 msp->ms_access_txg = txg + metaslab_unload_delay; 2304 2305 mutex_exit(&msp->ms_lock); 2306 return (offset); 2307 } 2308 2309 /* 2310 * Allocate a block for the specified i/o. 2311 */ 2312 static int 2313 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2314 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2315 { 2316 metaslab_group_t *mg, *rotor; 2317 vdev_t *vd; 2318 int dshift = 3; 2319 int all_zero; 2320 int zio_lock = B_FALSE; 2321 boolean_t allocatable; 2322 uint64_t asize; 2323 uint64_t distance; 2324 2325 ASSERT(!DVA_IS_VALID(&dva[d])); 2326 2327 /* 2328 * For testing, make some blocks above a certain size be gang blocks. 2329 */ 2330 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2331 return (SET_ERROR(ENOSPC)); 2332 2333 /* 2334 * Start at the rotor and loop through all mgs until we find something. 2335 * Note that there's no locking on mc_rotor or mc_aliquot because 2336 * nothing actually breaks if we miss a few updates -- we just won't 2337 * allocate quite as evenly. It all balances out over time. 2338 * 2339 * If we are doing ditto or log blocks, try to spread them across 2340 * consecutive vdevs. 
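 * (Illustrative example, not from the original text: with copies=2 on a
 * four-vdev pool, if the first DVA lands on vdev 2 then the second DVA is
 * steered toward the next group in the rotor, typically vdev 3, rather than
 * an arbitrary vdev.)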
If we're forced to reuse a vdev before we've 2341 * allocated all of our ditto blocks, then try and spread them out on 2342 * that vdev as much as possible. If it turns out to not be possible, 2343 * gradually lower our standards until anything becomes acceptable. 2344 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2345 * gives us hope of containing our fault domains to something we're 2346 * able to reason about. Otherwise, any two top-level vdev failures 2347 * will guarantee the loss of data. With consecutive allocation, 2348 * only two adjacent top-level vdev failures will result in data loss. 2349 * 2350 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2351 * ourselves on the same vdev as our gang block header. That 2352 * way, we can hope for locality in vdev_cache, plus it makes our 2353 * fault domains something tractable. 2354 */ 2355 if (hintdva) { 2356 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2357 2358 /* 2359 * It's possible the vdev we're using as the hint no 2360 * longer exists (i.e. removed). Consult the rotor when 2361 * all else fails. 2362 */ 2363 if (vd != NULL) { 2364 mg = vd->vdev_mg; 2365 2366 if (flags & METASLAB_HINTBP_AVOID && 2367 mg->mg_next != NULL) 2368 mg = mg->mg_next; 2369 } else { 2370 mg = mc->mc_rotor; 2371 } 2372 } else if (d != 0) { 2373 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2374 mg = vd->vdev_mg->mg_next; 2375 } else { 2376 mg = mc->mc_rotor; 2377 } 2378 2379 /* 2380 * If the hint put us into the wrong metaslab class, or into a 2381 * metaslab group that has been passivated, just follow the rotor. 2382 */ 2383 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2384 mg = mc->mc_rotor; 2385 2386 rotor = mg; 2387 top: 2388 all_zero = B_TRUE; 2389 do { 2390 ASSERT(mg->mg_activation_count == 1); 2391 vd = mg->mg_vd; 2392 2393 /* 2394 * Don't allocate from faulted devices. 2395 */ 2396 if (zio_lock) { 2397 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2398 allocatable = vdev_allocatable(vd); 2399 spa_config_exit(spa, SCL_ZIO, FTAG); 2400 } else { 2401 allocatable = vdev_allocatable(vd); 2402 } 2403 2404 /* 2405 * Determine if the selected metaslab group is eligible 2406 * for allocations. If we're ganging then don't allow 2407 * this metaslab group to skip allocations since that would 2408 * inadvertently return ENOSPC and suspend the pool 2409 * even though space is still available. 2410 */ 2411 if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { 2412 allocatable = metaslab_group_allocatable(mg, rotor, 2413 psize); 2414 } 2415 2416 if (!allocatable) 2417 goto next; 2418 2419 ASSERT(mg->mg_initialized); 2420 2421 /* 2422 * Avoid writing single-copy data to a failing vdev. 
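 * (Illustrative: a degraded single-disk top-level vdev is skipped for the
 * first DVA on the first pass through the rotor; if the whole pass fails,
 * this skip has cleared all_zero, dshift is bumped, and on the retry the
 * check below no longer applies, so the write can still land on that vdev
 * as a last resort.)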
2423 */ 2424 if ((vd->vdev_stat.vs_write_errors > 0 || 2425 vd->vdev_state < VDEV_STATE_HEALTHY) && 2426 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2427 all_zero = B_FALSE; 2428 goto next; 2429 } 2430 2431 ASSERT(mg->mg_class == mc); 2432 2433 distance = vd->vdev_asize >> dshift; 2434 if (distance <= (1ULL << vd->vdev_ms_shift)) 2435 distance = 0; 2436 else 2437 all_zero = B_FALSE; 2438 2439 asize = vdev_psize_to_asize(vd, psize); 2440 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2441 2442 uint64_t offset = metaslab_group_alloc(mg, asize, txg, 2443 distance, dva, d); 2444 2445 mutex_enter(&mg->mg_lock); 2446 if (offset == -1ULL) { 2447 mg->mg_failed_allocations++; 2448 if (asize == SPA_GANGBLOCKSIZE) { 2449 /* 2450 * This metaslab group was unable to allocate 2451 * the minimum gang block size so it must be 2452 * out of space. We must notify the allocation 2453 * throttle to start skipping allocation 2454 * attempts to this metaslab group until more 2455 * space becomes available. 2456 * 2457 * Note: this failure cannot be caused by the 2458 * allocation throttle since the allocation 2459 * throttle is only responsible for skipping 2460 * devices and not failing block allocations. 2461 */ 2462 mg->mg_no_free_space = B_TRUE; 2463 } 2464 } 2465 mg->mg_allocations++; 2466 mutex_exit(&mg->mg_lock); 2467 2468 if (offset != -1ULL) { 2469 /* 2470 * If we've just selected this metaslab group, 2471 * figure out whether the corresponding vdev is 2472 * over- or under-used relative to the pool, 2473 * and set an allocation bias to even it out. 2474 */ 2475 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2476 vdev_stat_t *vs = &vd->vdev_stat; 2477 int64_t vu, cu; 2478 2479 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2480 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2481 2482 /* 2483 * Calculate how much more or less we should 2484 * try to allocate from this device during 2485 * this iteration around the rotor. 2486 * For example, if a device is 80% full 2487 * and the pool is 20% full then we should 2488 * reduce allocations by 60% on this device. 2489 * 2490 * mg_bias = (20 - 80) * 512K / 100 = -307K 2491 * 2492 * This reduces allocations by 307K for this 2493 * iteration. 2494 */ 2495 mg->mg_bias = ((cu - vu) * 2496 (int64_t)mg->mg_aliquot) / 100; 2497 } else if (!metaslab_bias_enabled) { 2498 mg->mg_bias = 0; 2499 } 2500 2501 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2502 mg->mg_aliquot + mg->mg_bias) { 2503 mc->mc_rotor = mg->mg_next; 2504 mc->mc_aliquot = 0; 2505 } 2506 2507 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2508 DVA_SET_OFFSET(&dva[d], offset); 2509 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2510 DVA_SET_ASIZE(&dva[d], asize); 2511 2512 return (0); 2513 } 2514 next: 2515 mc->mc_rotor = mg->mg_next; 2516 mc->mc_aliquot = 0; 2517 } while ((mg = mg->mg_next) != rotor); 2518 2519 if (!all_zero) { 2520 dshift++; 2521 ASSERT(dshift < 64); 2522 goto top; 2523 } 2524 2525 if (!allocatable && !zio_lock) { 2526 dshift = 3; 2527 zio_lock = B_TRUE; 2528 goto top; 2529 } 2530 2531 bzero(&dva[d], sizeof (dva_t)); 2532 2533 return (SET_ERROR(ENOSPC)); 2534 } 2535 2536 /* 2537 * Free the block represented by DVA in the context of the specified 2538 * transaction group. 
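 *
 * Illustrative sketch of the two paths below (wording added here): with
 * now == B_TRUE the free is undoing an allocation made in this same txg,
 * so the range is pulled out of ms_alloctree[txg] and handed straight back
 * to ms_tree; otherwise the range is queued in ms_freetree[txg] and picked
 * up by metaslab_sync()/metaslab_sync_done(), where it goes through the
 * deferred-free cycle.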
2539 */ 2540 static void 2541 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2542 { 2543 uint64_t vdev = DVA_GET_VDEV(dva); 2544 uint64_t offset = DVA_GET_OFFSET(dva); 2545 uint64_t size = DVA_GET_ASIZE(dva); 2546 vdev_t *vd; 2547 metaslab_t *msp; 2548 2549 ASSERT(DVA_IS_VALID(dva)); 2550 2551 if (txg > spa_freeze_txg(spa)) 2552 return; 2553 2554 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2555 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2556 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2557 (u_longlong_t)vdev, (u_longlong_t)offset); 2558 ASSERT(0); 2559 return; 2560 } 2561 2562 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2563 2564 if (DVA_GET_GANG(dva)) 2565 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2566 2567 mutex_enter(&msp->ms_lock); 2568 2569 if (now) { 2570 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2571 offset, size); 2572 2573 VERIFY(!msp->ms_condensing); 2574 VERIFY3U(offset, >=, msp->ms_start); 2575 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2576 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2577 msp->ms_size); 2578 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2579 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2580 range_tree_add(msp->ms_tree, offset, size); 2581 } else { 2582 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2583 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2584 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2585 offset, size); 2586 } 2587 2588 mutex_exit(&msp->ms_lock); 2589 } 2590 2591 /* 2592 * Intent log support: upon opening the pool after a crash, notify the SPA 2593 * of blocks that the intent log has allocated for immediate write, but 2594 * which are still considered free by the SPA because the last transaction 2595 * group didn't commit yet. 2596 */ 2597 static int 2598 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2599 { 2600 uint64_t vdev = DVA_GET_VDEV(dva); 2601 uint64_t offset = DVA_GET_OFFSET(dva); 2602 uint64_t size = DVA_GET_ASIZE(dva); 2603 vdev_t *vd; 2604 metaslab_t *msp; 2605 int error = 0; 2606 2607 ASSERT(DVA_IS_VALID(dva)); 2608 2609 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2610 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2611 return (SET_ERROR(ENXIO)); 2612 2613 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2614 2615 if (DVA_GET_GANG(dva)) 2616 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2617 2618 mutex_enter(&msp->ms_lock); 2619 2620 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2621 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2622 2623 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2624 error = SET_ERROR(ENOENT); 2625 2626 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2627 mutex_exit(&msp->ms_lock); 2628 return (error); 2629 } 2630 2631 VERIFY(!msp->ms_condensing); 2632 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2633 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2634 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2635 range_tree_remove(msp->ms_tree, offset, size); 2636 2637 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2638 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2639 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2640 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2641 } 2642 2643 mutex_exit(&msp->ms_lock); 2644 2645 return (0); 2646 } 2647 2648 /* 2649 * Reserve some allocation slots. 
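 * (Illustrative, hypothetical numbers: with mc_alloc_max_slots = 64 and 60
 * slots already held, a three-DVA write can still reserve its slots, while
 * a request for eight would be refused and the I/O throttled, unless it is
 * a gang allocation, which is always allowed through.)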
The reservation system must be called 2650 * before we call into the allocator. If there aren't any available slots 2651 * then the I/O will be throttled until an I/O completes and its slots are 2652 * freed up. The function returns true if it was successful in placing 2653 * the reservation. 2654 */ 2655 boolean_t 2656 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, 2657 int flags) 2658 { 2659 uint64_t available_slots = 0; 2660 boolean_t slot_reserved = B_FALSE; 2661 2662 ASSERT(mc->mc_alloc_throttle_enabled); 2663 mutex_enter(&mc->mc_lock); 2664 2665 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); 2666 if (reserved_slots < mc->mc_alloc_max_slots) 2667 available_slots = mc->mc_alloc_max_slots - reserved_slots; 2668 2669 if (slots <= available_slots || GANG_ALLOCATION(flags)) { 2670 /* 2671 * We reserve the slots individually so that we can unreserve 2672 * them individually when an I/O completes. 2673 */ 2674 for (int d = 0; d < slots; d++) { 2675 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); 2676 } 2677 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 2678 slot_reserved = B_TRUE; 2679 } 2680 2681 mutex_exit(&mc->mc_lock); 2682 return (slot_reserved); 2683 } 2684 2685 void 2686 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) 2687 { 2688 ASSERT(mc->mc_alloc_throttle_enabled); 2689 mutex_enter(&mc->mc_lock); 2690 for (int d = 0; d < slots; d++) { 2691 (void) refcount_remove(&mc->mc_alloc_slots, zio); 2692 } 2693 mutex_exit(&mc->mc_lock); 2694 } 2695 2696 int 2697 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2698 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) 2699 { 2700 dva_t *dva = bp->blk_dva; 2701 dva_t *hintdva = hintbp->blk_dva; 2702 int error = 0; 2703 2704 ASSERT(bp->blk_birth == 0); 2705 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2706 2707 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2708 2709 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2710 spa_config_exit(spa, SCL_ALLOC, FTAG); 2711 return (SET_ERROR(ENOSPC)); 2712 } 2713 2714 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2715 ASSERT(BP_GET_NDVAS(bp) == 0); 2716 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2717 2718 for (int d = 0; d < ndvas; d++) { 2719 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2720 txg, flags); 2721 if (error != 0) { 2722 for (d--; d >= 0; d--) { 2723 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2724 metaslab_group_alloc_decrement(spa, 2725 DVA_GET_VDEV(&dva[d]), zio, flags); 2726 bzero(&dva[d], sizeof (dva_t)); 2727 } 2728 spa_config_exit(spa, SCL_ALLOC, FTAG); 2729 return (error); 2730 } else { 2731 /* 2732 * Update the metaslab group's queue depth 2733 * based on the newly allocated dva. 
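 * For async, throttled allocations this takes a reference on the group's
 * mg_alloc_queue_depth tagged with the zio; metaslab_group_alloc_decrement()
 * drops it again, either when the allocating I/O finishes or in the error
 * unwind a few lines up. (Descriptive note added for clarity.)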
2734 */ 2735 metaslab_group_alloc_increment(spa, 2736 DVA_GET_VDEV(&dva[d]), zio, flags); 2737 } 2738 2739 } 2740 ASSERT(error == 0); 2741 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2742 2743 spa_config_exit(spa, SCL_ALLOC, FTAG); 2744 2745 BP_SET_BIRTH(bp, txg, txg); 2746 2747 return (0); 2748 } 2749 2750 void 2751 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2752 { 2753 const dva_t *dva = bp->blk_dva; 2754 int ndvas = BP_GET_NDVAS(bp); 2755 2756 ASSERT(!BP_IS_HOLE(bp)); 2757 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2758 2759 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2760 2761 for (int d = 0; d < ndvas; d++) 2762 metaslab_free_dva(spa, &dva[d], txg, now); 2763 2764 spa_config_exit(spa, SCL_FREE, FTAG); 2765 } 2766 2767 int 2768 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2769 { 2770 const dva_t *dva = bp->blk_dva; 2771 int ndvas = BP_GET_NDVAS(bp); 2772 int error = 0; 2773 2774 ASSERT(!BP_IS_HOLE(bp)); 2775 2776 if (txg != 0) { 2777 /* 2778 * First do a dry run to make sure all DVAs are claimable, 2779 * so we don't have to unwind from partial failures below. 2780 */ 2781 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2782 return (error); 2783 } 2784 2785 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2786 2787 for (int d = 0; d < ndvas; d++) 2788 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2789 break; 2790 2791 spa_config_exit(spa, SCL_ALLOC, FTAG); 2792 2793 ASSERT(error == 0 || txg == 0); 2794 2795 return (error); 2796 } 2797 2798 void 2799 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2800 { 2801 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2802 return; 2803 2804 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2805 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2806 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2807 vdev_t *vd = vdev_lookup_top(spa, vdev); 2808 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2809 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2810 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2811 2812 if (msp->ms_loaded) 2813 range_tree_verify(msp->ms_tree, offset, size); 2814 2815 for (int j = 0; j < TXG_SIZE; j++) 2816 range_tree_verify(msp->ms_freetree[j], offset, size); 2817 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2818 range_tree_verify(msp->ms_defertree[j], offset, size); 2819 } 2820 spa_config_exit(spa, SCL_VDEV, FTAG); 2821 } 2822