1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/space_map.h> 32 #include <sys/metaslab_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio.h> 35 #include <sys/spa_impl.h> 36 #include <sys/zfeature.h> 37 38 /* 39 * Allow allocations to switch to gang blocks quickly. We do this to 40 * avoid having to load lots of space_maps in a given txg. There are, 41 * however, some cases where we want to avoid "fast" ganging and instead 42 * we want to do an exhaustive search of all metaslabs on this device. 43 * Currently we don't allow any gang, slog, or dump device related allocations 44 * to "fast" gang. 45 */ 46 #define CAN_FASTGANG(flags) \ 47 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 48 METASLAB_GANG_AVOID))) 49 50 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 51 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 52 #define METASLAB_ACTIVE_MASK \ 53 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 54 55 uint64_t metaslab_aliquot = 512ULL << 10; 56 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 57 58 /* 59 * The in-core space map representation is more compact than its on-disk form. 60 * The zfs_condense_pct determines how much more compact the in-core 61 * space_map representation must be before we compact it on-disk. 62 * Values should be greater than or equal to 100. 63 */ 64 int zfs_condense_pct = 200; 65 66 /* 67 * Condensing a metaslab is not guaranteed to actually reduce the amount of 68 * space used on disk. In particular, a space map uses data in increments of 69 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 70 * same number of blocks after condensing. Since the goal of condensing is to 71 * reduce the number of IOPs required to read the space map, we only want to 72 * condense when we can be sure we will reduce the number of blocks used by the 73 * space map. Unfortunately, we cannot precisely compute whether or not this is 74 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 75 * we apply the following heuristic: do not condense a spacemap unless the 76 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 77 * blocks. 78 */ 79 int zfs_metaslab_condense_block_threshold = 4; 80 81 /* 82 * The zfs_mg_noalloc_threshold defines which metaslab groups should 83 * be eligible for allocation. 
The value is defined as a percentage of 84 * free space. Metaslab groups that have more free space than 85 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 86 * a metaslab group's free space is less than or equal to the 87 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 88 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 89 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 90 * groups are allowed to accept allocations. Gang blocks are always 91 * eligible to allocate on any metaslab group. The default value of 0 means 92 * no metaslab group will be excluded based on this criterion. 93 */ 94 int zfs_mg_noalloc_threshold = 0; 95 96 /* 97 * Metaslab groups are considered eligible for allocations if their 98 * fragmentation metric (measured as a percentage) is less than or equal to 99 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 100 * then it will be skipped unless all metaslab groups within the metaslab 101 * class have also crossed this threshold. 102 */ 103 int zfs_mg_fragmentation_threshold = 85; 104 105 /* 106 * Allow metaslabs to keep their active state as long as their fragmentation 107 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 108 * active metaslab that exceeds this threshold will no longer keep its active 109 * status allowing better metaslabs to be selected. 110 */ 111 int zfs_metaslab_fragmentation_threshold = 70; 112 113 /* 114 * When set will load all metaslabs when pool is first opened. 115 */ 116 int metaslab_debug_load = 0; 117 118 /* 119 * When set will prevent metaslabs from being unloaded. 120 */ 121 int metaslab_debug_unload = 0; 122 123 /* 124 * Minimum size which forces the dynamic allocator to change 125 * its allocation strategy. Once the space map cannot satisfy 126 * an allocation of this size then it switches to using a more 127 * aggressive strategy (i.e. search by size rather than offset). 128 */ 129 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 130 131 /* 132 * The minimum free space, in percent, which must be available 133 * in a space map to continue allocations in a first-fit fashion. 134 * Once the space_map's free space drops below this level we dynamically 135 * switch to using best-fit allocations. 136 */ 137 int metaslab_df_free_pct = 4; 138 139 /* 140 * A metaslab is considered "free" if it contains a contiguous 141 * segment which is greater than metaslab_min_alloc_size. 142 */ 143 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 144 145 /* 146 * Percentage of all cpus that can be used by the metaslab taskq. 147 */ 148 int metaslab_load_pct = 50; 149 150 /* 151 * Determines how many txgs a metaslab may remain loaded without having any 152 * allocations from it. As long as a metaslab continues to be used we will 153 * keep it loaded. 154 */ 155 int metaslab_unload_delay = TXG_SIZE * 2; 156 157 /* 158 * Max number of metaslabs per group to preload. 159 */ 160 int metaslab_preload_limit = SPA_DVAS_PER_BP; 161 162 /* 163 * Enable/disable preloading of metaslabs. 164 */ 165 boolean_t metaslab_preload_enabled = B_TRUE; 166 167 /* 168 * Enable/disable fragmentation weighting on metaslabs. 169 */ 170 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 171 172 /* 173 * Enable/disable lba weighting (i.e. outer tracks are given preference). 174 */ 175 boolean_t metaslab_lba_weighting_enabled = B_TRUE; 176 177 /* 178 * Enable/disable metaslab group biasing.
179 */ 180 boolean_t metaslab_bias_enabled = B_TRUE; 181 182 static uint64_t metaslab_fragmentation(metaslab_t *); 183 184 /* 185 * ========================================================================== 186 * Metaslab classes 187 * ========================================================================== 188 */ 189 metaslab_class_t * 190 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 191 { 192 metaslab_class_t *mc; 193 194 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 195 196 mc->mc_spa = spa; 197 mc->mc_rotor = NULL; 198 mc->mc_ops = ops; 199 200 return (mc); 201 } 202 203 void 204 metaslab_class_destroy(metaslab_class_t *mc) 205 { 206 ASSERT(mc->mc_rotor == NULL); 207 ASSERT(mc->mc_alloc == 0); 208 ASSERT(mc->mc_deferred == 0); 209 ASSERT(mc->mc_space == 0); 210 ASSERT(mc->mc_dspace == 0); 211 212 kmem_free(mc, sizeof (metaslab_class_t)); 213 } 214 215 int 216 metaslab_class_validate(metaslab_class_t *mc) 217 { 218 metaslab_group_t *mg; 219 vdev_t *vd; 220 221 /* 222 * Must hold one of the spa_config locks. 223 */ 224 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 225 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 226 227 if ((mg = mc->mc_rotor) == NULL) 228 return (0); 229 230 do { 231 vd = mg->mg_vd; 232 ASSERT(vd->vdev_mg != NULL); 233 ASSERT3P(vd->vdev_top, ==, vd); 234 ASSERT3P(mg->mg_class, ==, mc); 235 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 236 } while ((mg = mg->mg_next) != mc->mc_rotor); 237 238 return (0); 239 } 240 241 void 242 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 243 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 244 { 245 atomic_add_64(&mc->mc_alloc, alloc_delta); 246 atomic_add_64(&mc->mc_deferred, defer_delta); 247 atomic_add_64(&mc->mc_space, space_delta); 248 atomic_add_64(&mc->mc_dspace, dspace_delta); 249 } 250 251 uint64_t 252 metaslab_class_get_alloc(metaslab_class_t *mc) 253 { 254 return (mc->mc_alloc); 255 } 256 257 uint64_t 258 metaslab_class_get_deferred(metaslab_class_t *mc) 259 { 260 return (mc->mc_deferred); 261 } 262 263 uint64_t 264 metaslab_class_get_space(metaslab_class_t *mc) 265 { 266 return (mc->mc_space); 267 } 268 269 uint64_t 270 metaslab_class_get_dspace(metaslab_class_t *mc) 271 { 272 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 273 } 274 275 void 276 metaslab_class_histogram_verify(metaslab_class_t *mc) 277 { 278 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 279 uint64_t *mc_hist; 280 int i; 281 282 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 283 return; 284 285 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 286 KM_SLEEP); 287 288 for (int c = 0; c < rvd->vdev_children; c++) { 289 vdev_t *tvd = rvd->vdev_child[c]; 290 metaslab_group_t *mg = tvd->vdev_mg; 291 292 /* 293 * Skip any holes, uninitialized top-levels, or 294 * vdevs that are not in this metalab class. 295 */ 296 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 297 mg->mg_class != mc) { 298 continue; 299 } 300 301 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 302 mc_hist[i] += mg->mg_histogram[i]; 303 } 304 305 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 306 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 307 308 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 309 } 310 311 /* 312 * Calculate the metaslab class's fragmentation metric. The metric 313 * is weighted based on the space contribution of each metaslab group. 
314 * The return value will be a number between 0 and 100 (inclusive), or 315 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 316 * zfs_frag_table for more information about the metric. 317 */ 318 uint64_t 319 metaslab_class_fragmentation(metaslab_class_t *mc) 320 { 321 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 322 uint64_t fragmentation = 0; 323 324 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 325 326 for (int c = 0; c < rvd->vdev_children; c++) { 327 vdev_t *tvd = rvd->vdev_child[c]; 328 metaslab_group_t *mg = tvd->vdev_mg; 329 330 /* 331 * Skip any holes, uninitialized top-levels, or 332 * vdevs that are not in this metaslab class. 333 */ 334 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 335 mg->mg_class != mc) { 336 continue; 337 } 338 339 /* 340 * If a metaslab group does not contain a fragmentation 341 * metric then just bail out. 342 */ 343 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 344 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 345 return (ZFS_FRAG_INVALID); 346 } 347 348 /* 349 * Determine how much this metaslab_group is contributing 350 * to the overall pool fragmentation metric. 351 */ 352 fragmentation += mg->mg_fragmentation * 353 metaslab_group_get_space(mg); 354 } 355 fragmentation /= metaslab_class_get_space(mc); 356 357 ASSERT3U(fragmentation, <=, 100); 358 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 359 return (fragmentation); 360 } 361 362 /* 363 * Calculate the amount of expandable space that is available in 364 * this metaslab class. If a device is expanded then its expandable 365 * space will be the amount of allocatable space that is currently not 366 * part of this metaslab class. 367 */ 368 uint64_t 369 metaslab_class_expandable_space(metaslab_class_t *mc) 370 { 371 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 372 uint64_t space = 0; 373 374 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 375 for (int c = 0; c < rvd->vdev_children; c++) { 376 vdev_t *tvd = rvd->vdev_child[c]; 377 metaslab_group_t *mg = tvd->vdev_mg; 378 379 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 380 mg->mg_class != mc) { 381 continue; 382 } 383 384 space += tvd->vdev_max_asize - tvd->vdev_asize; 385 } 386 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 387 return (space); 388 } 389 390 /* 391 * ========================================================================== 392 * Metaslab groups 393 * ========================================================================== 394 */ 395 static int 396 metaslab_compare(const void *x1, const void *x2) 397 { 398 const metaslab_t *m1 = x1; 399 const metaslab_t *m2 = x2; 400 401 if (m1->ms_weight < m2->ms_weight) 402 return (1); 403 if (m1->ms_weight > m2->ms_weight) 404 return (-1); 405 406 /* 407 * If the weights are identical, use the offset to force uniqueness. 408 */ 409 if (m1->ms_start < m2->ms_start) 410 return (-1); 411 if (m1->ms_start > m2->ms_start) 412 return (1); 413 414 ASSERT3P(m1, ==, m2); 415 416 return (0); 417 } 418 419 /* 420 * Update the allocatable flag and the metaslab group's capacity. 421 * The allocatable flag is set to true if the group's free capacity is 422 * greater than zfs_mg_noalloc_threshold and the group is not heavily 423 * fragmented. If a metaslab group transitions from allocatable to 424 * non-allocatable or vice versa then the metaslab group's class is updated to reflect the transition.
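 *
 * For example, with zfs_mg_noalloc_threshold set to 30, a group whose
 * free capacity drops to 25% is avoided by the allocator for as long as
 * some other group in the class is still above 30%; once every group in
 * the class is at or below 30%, all of them accept allocations again
 * (see the mc_alloc_groups accounting below).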
425 */ 426 static void 427 metaslab_group_alloc_update(metaslab_group_t *mg) 428 { 429 vdev_t *vd = mg->mg_vd; 430 metaslab_class_t *mc = mg->mg_class; 431 vdev_stat_t *vs = &vd->vdev_stat; 432 boolean_t was_allocatable; 433 434 ASSERT(vd == vd->vdev_top); 435 436 mutex_enter(&mg->mg_lock); 437 was_allocatable = mg->mg_allocatable; 438 439 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 440 (vs->vs_space + 1); 441 442 /* 443 * A metaslab group is considered allocatable if it has plenty 444 * of free space or is not heavily fragmented. We only take 445 * fragmentation into account if the metaslab group has a valid 446 * fragmentation metric (i.e. a value between 0 and 100). 447 */ 448 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 449 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 450 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 451 452 /* 453 * The mc_alloc_groups maintains a count of the number of 454 * groups in this metaslab class that are still above the 455 * zfs_mg_noalloc_threshold. This is used by the allocating 456 * threads to determine if they should avoid allocations to 457 * a given group. The allocator will avoid allocations to a group 458 * if that group has reached or is below the zfs_mg_noalloc_threshold 459 * and there are still other groups that are above the threshold. 460 * When a group transitions from allocatable to non-allocatable or 461 * vice versa we update the metaslab class to reflect that change. 462 * When the mc_alloc_groups value drops to 0 that means that all 463 * groups have reached the zfs_mg_noalloc_threshold making all groups 464 * eligible for allocations. This effectively means that all devices 465 * are balanced again. 466 */ 467 if (was_allocatable && !mg->mg_allocatable) 468 mc->mc_alloc_groups--; 469 else if (!was_allocatable && mg->mg_allocatable) 470 mc->mc_alloc_groups++; 471 472 mutex_exit(&mg->mg_lock); 473 } 474 475 metaslab_group_t * 476 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 477 { 478 metaslab_group_t *mg; 479 480 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 481 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 482 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 483 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 484 mg->mg_vd = vd; 485 mg->mg_class = mc; 486 mg->mg_activation_count = 0; 487 488 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 489 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 490 491 return (mg); 492 } 493 494 void 495 metaslab_group_destroy(metaslab_group_t *mg) 496 { 497 ASSERT(mg->mg_prev == NULL); 498 ASSERT(mg->mg_next == NULL); 499 /* 500 * We may have gone below zero with the activation count 501 * either because we never activated in the first place or 502 * because we're done, and possibly removing the vdev. 
503 */ 504 ASSERT(mg->mg_activation_count <= 0); 505 506 taskq_destroy(mg->mg_taskq); 507 avl_destroy(&mg->mg_metaslab_tree); 508 mutex_destroy(&mg->mg_lock); 509 kmem_free(mg, sizeof (metaslab_group_t)); 510 } 511 512 void 513 metaslab_group_activate(metaslab_group_t *mg) 514 { 515 metaslab_class_t *mc = mg->mg_class; 516 metaslab_group_t *mgprev, *mgnext; 517 518 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 519 520 ASSERT(mc->mc_rotor != mg); 521 ASSERT(mg->mg_prev == NULL); 522 ASSERT(mg->mg_next == NULL); 523 ASSERT(mg->mg_activation_count <= 0); 524 525 if (++mg->mg_activation_count <= 0) 526 return; 527 528 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 529 metaslab_group_alloc_update(mg); 530 531 if ((mgprev = mc->mc_rotor) == NULL) { 532 mg->mg_prev = mg; 533 mg->mg_next = mg; 534 } else { 535 mgnext = mgprev->mg_next; 536 mg->mg_prev = mgprev; 537 mg->mg_next = mgnext; 538 mgprev->mg_next = mg; 539 mgnext->mg_prev = mg; 540 } 541 mc->mc_rotor = mg; 542 } 543 544 void 545 metaslab_group_passivate(metaslab_group_t *mg) 546 { 547 metaslab_class_t *mc = mg->mg_class; 548 metaslab_group_t *mgprev, *mgnext; 549 550 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 551 552 if (--mg->mg_activation_count != 0) { 553 ASSERT(mc->mc_rotor != mg); 554 ASSERT(mg->mg_prev == NULL); 555 ASSERT(mg->mg_next == NULL); 556 ASSERT(mg->mg_activation_count < 0); 557 return; 558 } 559 560 taskq_wait(mg->mg_taskq); 561 metaslab_group_alloc_update(mg); 562 563 mgprev = mg->mg_prev; 564 mgnext = mg->mg_next; 565 566 if (mg == mgnext) { 567 mc->mc_rotor = NULL; 568 } else { 569 mc->mc_rotor = mgnext; 570 mgprev->mg_next = mgnext; 571 mgnext->mg_prev = mgprev; 572 } 573 574 mg->mg_prev = NULL; 575 mg->mg_next = NULL; 576 } 577 578 uint64_t 579 metaslab_group_get_space(metaslab_group_t *mg) 580 { 581 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 582 } 583 584 void 585 metaslab_group_histogram_verify(metaslab_group_t *mg) 586 { 587 uint64_t *mg_hist; 588 vdev_t *vd = mg->mg_vd; 589 uint64_t ashift = vd->vdev_ashift; 590 int i; 591 592 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 593 return; 594 595 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 596 KM_SLEEP); 597 598 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 599 SPACE_MAP_HISTOGRAM_SIZE + ashift); 600 601 for (int m = 0; m < vd->vdev_ms_count; m++) { 602 metaslab_t *msp = vd->vdev_ms[m]; 603 604 if (msp->ms_sm == NULL) 605 continue; 606 607 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 608 mg_hist[i + ashift] += 609 msp->ms_sm->sm_phys->smp_histogram[i]; 610 } 611 612 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 613 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 614 615 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 616 } 617 618 static void 619 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 620 { 621 metaslab_class_t *mc = mg->mg_class; 622 uint64_t ashift = mg->mg_vd->vdev_ashift; 623 624 ASSERT(MUTEX_HELD(&msp->ms_lock)); 625 if (msp->ms_sm == NULL) 626 return; 627 628 mutex_enter(&mg->mg_lock); 629 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 630 mg->mg_histogram[i + ashift] += 631 msp->ms_sm->sm_phys->smp_histogram[i]; 632 mc->mc_histogram[i + ashift] += 633 msp->ms_sm->sm_phys->smp_histogram[i]; 634 } 635 mutex_exit(&mg->mg_lock); 636 } 637 638 void 639 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 640 { 641 metaslab_class_t *mc = mg->mg_class; 642 uint64_t ashift = 
mg->mg_vd->vdev_ashift; 643 644 ASSERT(MUTEX_HELD(&msp->ms_lock)); 645 if (msp->ms_sm == NULL) 646 return; 647 648 mutex_enter(&mg->mg_lock); 649 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 650 ASSERT3U(mg->mg_histogram[i + ashift], >=, 651 msp->ms_sm->sm_phys->smp_histogram[i]); 652 ASSERT3U(mc->mc_histogram[i + ashift], >=, 653 msp->ms_sm->sm_phys->smp_histogram[i]); 654 655 mg->mg_histogram[i + ashift] -= 656 msp->ms_sm->sm_phys->smp_histogram[i]; 657 mc->mc_histogram[i + ashift] -= 658 msp->ms_sm->sm_phys->smp_histogram[i]; 659 } 660 mutex_exit(&mg->mg_lock); 661 } 662 663 static void 664 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 665 { 666 ASSERT(msp->ms_group == NULL); 667 mutex_enter(&mg->mg_lock); 668 msp->ms_group = mg; 669 msp->ms_weight = 0; 670 avl_add(&mg->mg_metaslab_tree, msp); 671 mutex_exit(&mg->mg_lock); 672 673 mutex_enter(&msp->ms_lock); 674 metaslab_group_histogram_add(mg, msp); 675 mutex_exit(&msp->ms_lock); 676 } 677 678 static void 679 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 680 { 681 mutex_enter(&msp->ms_lock); 682 metaslab_group_histogram_remove(mg, msp); 683 mutex_exit(&msp->ms_lock); 684 685 mutex_enter(&mg->mg_lock); 686 ASSERT(msp->ms_group == mg); 687 avl_remove(&mg->mg_metaslab_tree, msp); 688 msp->ms_group = NULL; 689 mutex_exit(&mg->mg_lock); 690 } 691 692 static void 693 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 694 { 695 /* 696 * Although in principle the weight can be any value, in 697 * practice we do not use values in the range [1, 511]. 698 */ 699 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 700 ASSERT(MUTEX_HELD(&msp->ms_lock)); 701 702 mutex_enter(&mg->mg_lock); 703 ASSERT(msp->ms_group == mg); 704 avl_remove(&mg->mg_metaslab_tree, msp); 705 msp->ms_weight = weight; 706 avl_add(&mg->mg_metaslab_tree, msp); 707 mutex_exit(&mg->mg_lock); 708 } 709 710 /* 711 * Calculate the fragmentation for a given metaslab group. We can use 712 * a simple average here since all metaslabs within the group must have 713 * the same size. The return value will be a value between 0 and 100 714 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 715 * group have a fragmentation metric. 716 */ 717 uint64_t 718 metaslab_group_fragmentation(metaslab_group_t *mg) 719 { 720 vdev_t *vd = mg->mg_vd; 721 uint64_t fragmentation = 0; 722 uint64_t valid_ms = 0; 723 724 for (int m = 0; m < vd->vdev_ms_count; m++) { 725 metaslab_t *msp = vd->vdev_ms[m]; 726 727 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 728 continue; 729 730 valid_ms++; 731 fragmentation += msp->ms_fragmentation; 732 } 733 734 if (valid_ms <= vd->vdev_ms_count / 2) 735 return (ZFS_FRAG_INVALID); 736 737 fragmentation /= valid_ms; 738 ASSERT3U(fragmentation, <=, 100); 739 return (fragmentation); 740 } 741 742 /* 743 * Determine if a given metaslab group should skip allocations. A metaslab 744 * group should avoid allocations if its free capacity is less than the 745 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 746 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 747 * that can still handle allocations. 748 */ 749 static boolean_t 750 metaslab_group_allocatable(metaslab_group_t *mg) 751 { 752 vdev_t *vd = mg->mg_vd; 753 spa_t *spa = vd->vdev_spa; 754 metaslab_class_t *mc = mg->mg_class; 755 756 /* 757 * We use two key metrics to determine if a metaslab group is 758 * considered allocatable -- free space and fragmentation. 
If 759 * the free space is greater than the free space threshold and 760 * the fragmentation is less than the fragmentation threshold then 761 * consider the group allocatable. There are two case when we will 762 * not consider these key metrics. The first is if the group is 763 * associated with a slog device and the second is if all groups 764 * in this metaslab class have already been consider ineligible 765 * for allocations. 766 */ 767 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 768 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 769 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 770 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 771 } 772 773 /* 774 * ========================================================================== 775 * Range tree callbacks 776 * ========================================================================== 777 */ 778 779 /* 780 * Comparison function for the private size-ordered tree. Tree is sorted 781 * by size, larger sizes at the end of the tree. 782 */ 783 static int 784 metaslab_rangesize_compare(const void *x1, const void *x2) 785 { 786 const range_seg_t *r1 = x1; 787 const range_seg_t *r2 = x2; 788 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 789 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 790 791 if (rs_size1 < rs_size2) 792 return (-1); 793 if (rs_size1 > rs_size2) 794 return (1); 795 796 if (r1->rs_start < r2->rs_start) 797 return (-1); 798 799 if (r1->rs_start > r2->rs_start) 800 return (1); 801 802 return (0); 803 } 804 805 /* 806 * Create any block allocator specific components. The current allocators 807 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 808 */ 809 static void 810 metaslab_rt_create(range_tree_t *rt, void *arg) 811 { 812 metaslab_t *msp = arg; 813 814 ASSERT3P(rt->rt_arg, ==, msp); 815 ASSERT(msp->ms_tree == NULL); 816 817 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 818 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 819 } 820 821 /* 822 * Destroy the block allocator specific components. 823 */ 824 static void 825 metaslab_rt_destroy(range_tree_t *rt, void *arg) 826 { 827 metaslab_t *msp = arg; 828 829 ASSERT3P(rt->rt_arg, ==, msp); 830 ASSERT3P(msp->ms_tree, ==, rt); 831 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 832 833 avl_destroy(&msp->ms_size_tree); 834 } 835 836 static void 837 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 838 { 839 metaslab_t *msp = arg; 840 841 ASSERT3P(rt->rt_arg, ==, msp); 842 ASSERT3P(msp->ms_tree, ==, rt); 843 VERIFY(!msp->ms_condensing); 844 avl_add(&msp->ms_size_tree, rs); 845 } 846 847 static void 848 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 849 { 850 metaslab_t *msp = arg; 851 852 ASSERT3P(rt->rt_arg, ==, msp); 853 ASSERT3P(msp->ms_tree, ==, rt); 854 VERIFY(!msp->ms_condensing); 855 avl_remove(&msp->ms_size_tree, rs); 856 } 857 858 static void 859 metaslab_rt_vacate(range_tree_t *rt, void *arg) 860 { 861 metaslab_t *msp = arg; 862 863 ASSERT3P(rt->rt_arg, ==, msp); 864 ASSERT3P(msp->ms_tree, ==, rt); 865 866 /* 867 * Normally one would walk the tree freeing nodes along the way. 868 * Since the nodes are shared with the range trees we can avoid 869 * walking all nodes and just reinitialize the avl tree. The nodes 870 * will be freed by the range tree, so we don't want to free them here. 
871 */ 872 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 873 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 874 } 875 876 static range_tree_ops_t metaslab_rt_ops = { 877 metaslab_rt_create, 878 metaslab_rt_destroy, 879 metaslab_rt_add, 880 metaslab_rt_remove, 881 metaslab_rt_vacate 882 }; 883 884 /* 885 * ========================================================================== 886 * Metaslab block operations 887 * ========================================================================== 888 */ 889 890 /* 891 * Return the maximum contiguous segment within the metaslab. 892 */ 893 uint64_t 894 metaslab_block_maxsize(metaslab_t *msp) 895 { 896 avl_tree_t *t = &msp->ms_size_tree; 897 range_seg_t *rs; 898 899 if (t == NULL || (rs = avl_last(t)) == NULL) 900 return (0ULL); 901 902 return (rs->rs_end - rs->rs_start); 903 } 904 905 uint64_t 906 metaslab_block_alloc(metaslab_t *msp, uint64_t size) 907 { 908 uint64_t start; 909 range_tree_t *rt = msp->ms_tree; 910 911 VERIFY(!msp->ms_condensing); 912 913 start = msp->ms_ops->msop_alloc(msp, size); 914 if (start != -1ULL) { 915 vdev_t *vd = msp->ms_group->mg_vd; 916 917 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 918 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 919 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 920 range_tree_remove(rt, start, size); 921 } 922 return (start); 923 } 924 925 /* 926 * ========================================================================== 927 * Common allocator routines 928 * ========================================================================== 929 */ 930 931 /* 932 * This is a helper function that can be used by the allocator to find 933 * a suitable block to allocate. This will search the specified AVL 934 * tree looking for a block that matches the specified criteria. 935 */ 936 static uint64_t 937 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 938 uint64_t align) 939 { 940 range_seg_t *rs, rsearch; 941 avl_index_t where; 942 943 rsearch.rs_start = *cursor; 944 rsearch.rs_end = *cursor + size; 945 946 rs = avl_find(t, &rsearch, &where); 947 if (rs == NULL) 948 rs = avl_nearest(t, where, AVL_AFTER); 949 950 while (rs != NULL) { 951 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 952 953 if (offset + size <= rs->rs_end) { 954 *cursor = offset + size; 955 return (offset); 956 } 957 rs = AVL_NEXT(t, rs); 958 } 959 960 /* 961 * If we know we've searched the whole map (*cursor == 0), give up. 962 * Otherwise, reset the cursor to the beginning and try again. 963 */ 964 if (*cursor == 0) 965 return (-1ULL); 966 967 *cursor = 0; 968 return (metaslab_block_picker(t, cursor, size, align)); 969 } 970 971 /* 972 * ========================================================================== 973 * The first-fit block allocator 974 * ========================================================================== 975 */ 976 static uint64_t 977 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 978 { 979 /* 980 * Find the largest power of 2 block size that evenly divides the 981 * requested size. This is used to try to allocate blocks with similar 982 * alignment from the same area of the metaslab (i.e. same cursor 983 * bucket) but it does not guarantee that other allocations sizes 984 * may exist in the same region. 
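 *
 * As a concrete illustration: a 24K (0x6000) request yields
 * align = size & -size = 0x2000, so it advances the 8K cursor
 * (ms_lbas[highbit64(0x2000) - 1]) together with any other request
 * whose size is an odd multiple of 8K.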
985 */ 986 uint64_t align = size & -size; 987 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 988 avl_tree_t *t = &msp->ms_tree->rt_root; 989 990 return (metaslab_block_picker(t, cursor, size, align)); 991 } 992 993 static metaslab_ops_t metaslab_ff_ops = { 994 metaslab_ff_alloc 995 }; 996 997 /* 998 * ========================================================================== 999 * Dynamic block allocator - 1000 * Uses the first fit allocation scheme until space get low and then 1001 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1002 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1003 * ========================================================================== 1004 */ 1005 static uint64_t 1006 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1007 { 1008 /* 1009 * Find the largest power of 2 block size that evenly divides the 1010 * requested size. This is used to try to allocate blocks with similar 1011 * alignment from the same area of the metaslab (i.e. same cursor 1012 * bucket) but it does not guarantee that other allocations sizes 1013 * may exist in the same region. 1014 */ 1015 uint64_t align = size & -size; 1016 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1017 range_tree_t *rt = msp->ms_tree; 1018 avl_tree_t *t = &rt->rt_root; 1019 uint64_t max_size = metaslab_block_maxsize(msp); 1020 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1021 1022 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1023 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1024 1025 if (max_size < size) 1026 return (-1ULL); 1027 1028 /* 1029 * If we're running low on space switch to using the size 1030 * sorted AVL tree (best-fit). 1031 */ 1032 if (max_size < metaslab_df_alloc_threshold || 1033 free_pct < metaslab_df_free_pct) { 1034 t = &msp->ms_size_tree; 1035 *cursor = 0; 1036 } 1037 1038 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1039 } 1040 1041 static metaslab_ops_t metaslab_df_ops = { 1042 metaslab_df_alloc 1043 }; 1044 1045 /* 1046 * ========================================================================== 1047 * Cursor fit block allocator - 1048 * Select the largest region in the metaslab, set the cursor to the beginning 1049 * of the range and the cursor_end to the end of the range. As allocations 1050 * are made advance the cursor. Continue allocating from the cursor until 1051 * the range is exhausted and then find a new range. 
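 *
 * For example, once a 1M segment has been selected, [cursor, cursor_end)
 * spans that 1M; a stream of 128K allocations then advances the cursor
 * in 128K steps, and a new (largest remaining) segment is selected only
 * when less than 128K is left between the cursors.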
1052 * ========================================================================== 1053 */ 1054 static uint64_t 1055 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1056 { 1057 range_tree_t *rt = msp->ms_tree; 1058 avl_tree_t *t = &msp->ms_size_tree; 1059 uint64_t *cursor = &msp->ms_lbas[0]; 1060 uint64_t *cursor_end = &msp->ms_lbas[1]; 1061 uint64_t offset = 0; 1062 1063 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1064 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1065 1066 ASSERT3U(*cursor_end, >=, *cursor); 1067 1068 if ((*cursor + size) > *cursor_end) { 1069 range_seg_t *rs; 1070 1071 rs = avl_last(&msp->ms_size_tree); 1072 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1073 return (-1ULL); 1074 1075 *cursor = rs->rs_start; 1076 *cursor_end = rs->rs_end; 1077 } 1078 1079 offset = *cursor; 1080 *cursor += size; 1081 1082 return (offset); 1083 } 1084 1085 static metaslab_ops_t metaslab_cf_ops = { 1086 metaslab_cf_alloc 1087 }; 1088 1089 /* 1090 * ========================================================================== 1091 * New dynamic fit allocator - 1092 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1093 * contiguous blocks. If no region is found then just use the largest segment 1094 * that remains. 1095 * ========================================================================== 1096 */ 1097 1098 /* 1099 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1100 * to request from the allocator. 1101 */ 1102 uint64_t metaslab_ndf_clump_shift = 4; 1103 1104 static uint64_t 1105 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1106 { 1107 avl_tree_t *t = &msp->ms_tree->rt_root; 1108 avl_index_t where; 1109 range_seg_t *rs, rsearch; 1110 uint64_t hbit = highbit64(size); 1111 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1112 uint64_t max_size = metaslab_block_maxsize(msp); 1113 1114 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1115 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1116 1117 if (max_size < size) 1118 return (-1ULL); 1119 1120 rsearch.rs_start = *cursor; 1121 rsearch.rs_end = *cursor + size; 1122 1123 rs = avl_find(t, &rsearch, &where); 1124 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1125 t = &msp->ms_size_tree; 1126 1127 rsearch.rs_start = 0; 1128 rsearch.rs_end = MIN(max_size, 1129 1ULL << (hbit + metaslab_ndf_clump_shift)); 1130 rs = avl_find(t, &rsearch, &where); 1131 if (rs == NULL) 1132 rs = avl_nearest(t, where, AVL_AFTER); 1133 ASSERT(rs != NULL); 1134 } 1135 1136 if ((rs->rs_end - rs->rs_start) >= size) { 1137 *cursor = rs->rs_start + size; 1138 return (rs->rs_start); 1139 } 1140 return (-1ULL); 1141 } 1142 1143 static metaslab_ops_t metaslab_ndf_ops = { 1144 metaslab_ndf_alloc 1145 }; 1146 1147 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1148 1149 /* 1150 * ========================================================================== 1151 * Metaslabs 1152 * ========================================================================== 1153 */ 1154 1155 /* 1156 * Wait for any in-progress metaslab loads to complete. 
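 *
 * Callers hold ms_lock and typically pair this with metaslab_load(),
 * as metaslab_activate() and metaslab_preload() do below:
 *
 *	metaslab_load_wait(msp);
 *	if (!msp->ms_loaded)
 *		error = metaslab_load(msp);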
1157 */ 1158 void 1159 metaslab_load_wait(metaslab_t *msp) 1160 { 1161 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1162 1163 while (msp->ms_loading) { 1164 ASSERT(!msp->ms_loaded); 1165 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1166 } 1167 } 1168 1169 int 1170 metaslab_load(metaslab_t *msp) 1171 { 1172 int error = 0; 1173 1174 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1175 ASSERT(!msp->ms_loaded); 1176 ASSERT(!msp->ms_loading); 1177 1178 msp->ms_loading = B_TRUE; 1179 1180 /* 1181 * If the space map has not been allocated yet, then treat 1182 * all the space in the metaslab as free and add it to the 1183 * ms_tree. 1184 */ 1185 if (msp->ms_sm != NULL) 1186 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1187 else 1188 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1189 1190 msp->ms_loaded = (error == 0); 1191 msp->ms_loading = B_FALSE; 1192 1193 if (msp->ms_loaded) { 1194 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1195 range_tree_walk(msp->ms_defertree[t], 1196 range_tree_remove, msp->ms_tree); 1197 } 1198 } 1199 cv_broadcast(&msp->ms_load_cv); 1200 return (error); 1201 } 1202 1203 void 1204 metaslab_unload(metaslab_t *msp) 1205 { 1206 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1207 range_tree_vacate(msp->ms_tree, NULL, NULL); 1208 msp->ms_loaded = B_FALSE; 1209 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1210 } 1211 1212 int 1213 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1214 metaslab_t **msp) 1215 { 1216 vdev_t *vd = mg->mg_vd; 1217 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1218 metaslab_t *ms; 1219 int error; 1220 1221 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1222 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1223 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1224 ms->ms_id = id; 1225 ms->ms_start = id << vd->vdev_ms_shift; 1226 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1227 1228 /* 1229 * We only open space map objects that already exist. All others 1230 * will be opened when we finally allocate an object for it. 1231 */ 1232 if (object != 0) { 1233 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1234 ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1235 1236 if (error != 0) { 1237 kmem_free(ms, sizeof (metaslab_t)); 1238 return (error); 1239 } 1240 1241 ASSERT(ms->ms_sm != NULL); 1242 } 1243 1244 /* 1245 * We create the main range tree here, but we don't create the 1246 * alloctree and freetree until metaslab_sync_done(). This serves 1247 * two purposes: it allows metaslab_sync_done() to detect the 1248 * addition of new space; and for debugging, it ensures that we'd 1249 * data fault on any attempt to use this metaslab before it's ready. 1250 */ 1251 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1252 metaslab_group_add(mg, ms); 1253 1254 ms->ms_fragmentation = metaslab_fragmentation(ms); 1255 ms->ms_ops = mg->mg_class->mc_ops; 1256 1257 /* 1258 * If we're opening an existing pool (txg == 0) or creating 1259 * a new one (txg == TXG_INITIAL), all space is available now. 1260 * If we're adding space to an existing pool, the new space 1261 * does not become available until after this txg has synced. 1262 */ 1263 if (txg <= TXG_INITIAL) 1264 metaslab_sync_done(ms, 0); 1265 1266 /* 1267 * If metaslab_debug_load is set and we're initializing a metaslab 1268 * that has an allocated space_map object then load the its space 1269 * map so that can verify frees. 
1270 */ 1271 if (metaslab_debug_load && ms->ms_sm != NULL) { 1272 mutex_enter(&ms->ms_lock); 1273 VERIFY0(metaslab_load(ms)); 1274 mutex_exit(&ms->ms_lock); 1275 } 1276 1277 if (txg != 0) { 1278 vdev_dirty(vd, 0, NULL, txg); 1279 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1280 } 1281 1282 *msp = ms; 1283 1284 return (0); 1285 } 1286 1287 void 1288 metaslab_fini(metaslab_t *msp) 1289 { 1290 metaslab_group_t *mg = msp->ms_group; 1291 1292 metaslab_group_remove(mg, msp); 1293 1294 mutex_enter(&msp->ms_lock); 1295 1296 VERIFY(msp->ms_group == NULL); 1297 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1298 0, -msp->ms_size); 1299 space_map_close(msp->ms_sm); 1300 1301 metaslab_unload(msp); 1302 range_tree_destroy(msp->ms_tree); 1303 1304 for (int t = 0; t < TXG_SIZE; t++) { 1305 range_tree_destroy(msp->ms_alloctree[t]); 1306 range_tree_destroy(msp->ms_freetree[t]); 1307 } 1308 1309 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1310 range_tree_destroy(msp->ms_defertree[t]); 1311 } 1312 1313 ASSERT0(msp->ms_deferspace); 1314 1315 mutex_exit(&msp->ms_lock); 1316 cv_destroy(&msp->ms_load_cv); 1317 mutex_destroy(&msp->ms_lock); 1318 1319 kmem_free(msp, sizeof (metaslab_t)); 1320 } 1321 1322 #define FRAGMENTATION_TABLE_SIZE 17 1323 1324 /* 1325 * This table defines a segment size based fragmentation metric that will 1326 * allow each metaslab to derive its own fragmentation value. This is done 1327 * by calculating the space in each bucket of the spacemap histogram and 1328 * multiplying that by the fragmentation metric in this table. Doing 1329 * this for all buckets and dividing it by the total amount of free 1330 * space in this metaslab (i.e. the total free space in all buckets) gives 1331 * us the fragmentation metric. This means that a high fragmentation metric 1332 * equates to most of the free space being comprised of small segments. 1333 * Conversely, if the metric is low, then most of the free space is in 1334 * large segments. A 10% change in fragmentation equates to approximately 1335 * double the number of segments. 1336 * 1337 * This table defines 0% fragmented space using 16MB segments. Testing has 1338 * shown that segments that are greater than or equal to 16MB do not suffer 1339 * from drastic performance problems. Using this value, we derive the rest 1340 * of the table. Since the fragmentation value is never stored on disk, it 1341 * is possible to change these calculations in the future. 1342 */ 1343 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1344 100, /* 512B */ 1345 100, /* 1K */ 1346 98, /* 2K */ 1347 95, /* 4K */ 1348 90, /* 8K */ 1349 80, /* 16K */ 1350 70, /* 32K */ 1351 60, /* 64K */ 1352 50, /* 128K */ 1353 40, /* 256K */ 1354 30, /* 512K */ 1355 20, /* 1M */ 1356 15, /* 2M */ 1357 10, /* 4M */ 1358 5, /* 8M */ 1359 0 /* 16M */ 1360 }; 1361 1362 /* 1363 * Calculate the metaslab's fragmentation metric. A return value 1364 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1365 * not support this metric. Otherwise, the return value should be in the 1366 * range [0, 100]. 1367 */ 1368 static uint64_t 1369 metaslab_fragmentation(metaslab_t *msp) 1370 { 1371 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1372 uint64_t fragmentation = 0; 1373 uint64_t total = 0; 1374 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1375 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1376 1377 if (!feature_enabled) 1378 return (ZFS_FRAG_INVALID); 1379 1380 /* 1381 * A null space map means that the entire metaslab is free 1382 * and thus is not fragmented.
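 *
 * At the other extreme, a worked example of the table above with
 * illustrative numbers: a loaded metaslab with 60% of its free space in
 * 4K segments (factor 95) and 40% in 1M segments (factor 20) scores
 * roughly 0.6 * 95 + 0.4 * 20 = 65, while free space made up entirely
 * of segments of 16M or more scores 0.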
1383 */ 1384 if (msp->ms_sm == NULL) 1385 return (0); 1386 1387 /* 1388 * If this metaslab's space_map has not been upgraded, flag it 1389 * so that we upgrade next time we encounter it. 1390 */ 1391 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1392 uint64_t txg = spa_syncing_txg(spa); 1393 vdev_t *vd = msp->ms_group->mg_vd; 1394 1395 if (spa_writeable(spa)) { 1396 msp->ms_condense_wanted = B_TRUE; 1397 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1398 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1399 "msp %p, vd %p", txg, msp, vd); 1400 } 1401 return (ZFS_FRAG_INVALID); 1402 } 1403 1404 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1405 uint64_t space = 0; 1406 uint8_t shift = msp->ms_sm->sm_shift; 1407 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1408 FRAGMENTATION_TABLE_SIZE - 1); 1409 1410 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1411 continue; 1412 1413 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1414 total += space; 1415 1416 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1417 fragmentation += space * zfs_frag_table[idx]; 1418 } 1419 1420 if (total > 0) 1421 fragmentation /= total; 1422 ASSERT3U(fragmentation, <=, 100); 1423 return (fragmentation); 1424 } 1425 1426 /* 1427 * Compute a weight -- a selection preference value -- for the given metaslab. 1428 * This is based on the amount of free space, the level of fragmentation, 1429 * the LBA range, and whether the metaslab is loaded. 1430 */ 1431 static uint64_t 1432 metaslab_weight(metaslab_t *msp) 1433 { 1434 metaslab_group_t *mg = msp->ms_group; 1435 vdev_t *vd = mg->mg_vd; 1436 uint64_t weight, space; 1437 1438 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1439 1440 /* 1441 * This vdev is in the process of being removed so there is nothing 1442 * for us to do here. 1443 */ 1444 if (vd->vdev_removing) { 1445 ASSERT0(space_map_allocated(msp->ms_sm)); 1446 ASSERT0(vd->vdev_ms_shift); 1447 return (0); 1448 } 1449 1450 /* 1451 * The baseline weight is the metaslab's free space. 1452 */ 1453 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1454 1455 msp->ms_fragmentation = metaslab_fragmentation(msp); 1456 if (metaslab_fragmentation_factor_enabled && 1457 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1458 /* 1459 * Use the fragmentation information to inversely scale 1460 * down the baseline weight. We need to ensure that we 1461 * don't exclude this metaslab completely when it's 100% 1462 * fragmented. To avoid this we reduce the fragmented value 1463 * by 1. 1464 */ 1465 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1466 1467 /* 1468 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1469 * this metaslab again. The fragmentation metric may have 1470 * decreased the space to something smaller than 1471 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1472 * so that we can consume any remaining space. 1473 */ 1474 if (space > 0 && space < SPA_MINBLOCKSIZE) 1475 space = SPA_MINBLOCKSIZE; 1476 } 1477 weight = space; 1478 1479 /* 1480 * Modern disks have uniform bit density and constant angular velocity. 1481 * Therefore, the outer recording zones are faster (higher bandwidth) 1482 * than the inner zones by the ratio of outer to inner track diameter, 1483 * which is typically around 2:1. We account for this by assigning 1484 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1485 * In effect, this means that we'll select the metaslab with the most 1486 * free bandwidth rather than simply the one with the most free space. 
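 *
 * Concretely, with the formula below: on a vdev with 200 metaslabs,
 * metaslab 0 is weighted at 2x its free space, metaslab 100 at 1.5x,
 * and the last metaslab at just over 1x.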
1487 */ 1488 if (metaslab_lba_weighting_enabled) { 1489 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1490 ASSERT(weight >= space && weight <= 2 * space); 1491 } 1492 1493 /* 1494 * If this metaslab is one we're actively using, adjust its 1495 * weight to make it preferable to any inactive metaslab so 1496 * we'll polish it off. If the fragmentation on this metaslab 1497 * has exceed our threshold, then don't mark it active. 1498 */ 1499 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1500 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1501 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1502 } 1503 1504 return (weight); 1505 } 1506 1507 static int 1508 metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1509 { 1510 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1511 1512 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1513 metaslab_load_wait(msp); 1514 if (!msp->ms_loaded) { 1515 int error = metaslab_load(msp); 1516 if (error) { 1517 metaslab_group_sort(msp->ms_group, msp, 0); 1518 return (error); 1519 } 1520 } 1521 1522 metaslab_group_sort(msp->ms_group, msp, 1523 msp->ms_weight | activation_weight); 1524 } 1525 ASSERT(msp->ms_loaded); 1526 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1527 1528 return (0); 1529 } 1530 1531 static void 1532 metaslab_passivate(metaslab_t *msp, uint64_t size) 1533 { 1534 /* 1535 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1536 * this metaslab again. In that case, it had better be empty, 1537 * or we would be leaving space on the table. 1538 */ 1539 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1540 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1541 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1542 } 1543 1544 static void 1545 metaslab_preload(void *arg) 1546 { 1547 metaslab_t *msp = arg; 1548 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1549 1550 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1551 1552 mutex_enter(&msp->ms_lock); 1553 metaslab_load_wait(msp); 1554 if (!msp->ms_loaded) 1555 (void) metaslab_load(msp); 1556 1557 /* 1558 * Set the ms_access_txg value so that we don't unload it right away. 1559 */ 1560 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1561 mutex_exit(&msp->ms_lock); 1562 } 1563 1564 static void 1565 metaslab_group_preload(metaslab_group_t *mg) 1566 { 1567 spa_t *spa = mg->mg_vd->vdev_spa; 1568 metaslab_t *msp; 1569 avl_tree_t *t = &mg->mg_metaslab_tree; 1570 int m = 0; 1571 1572 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1573 taskq_wait(mg->mg_taskq); 1574 return; 1575 } 1576 1577 mutex_enter(&mg->mg_lock); 1578 /* 1579 * Load the next potential metaslabs 1580 */ 1581 msp = avl_first(t); 1582 while (msp != NULL) { 1583 metaslab_t *msp_next = AVL_NEXT(t, msp); 1584 1585 /* 1586 * We preload only the maximum number of metaslabs specified 1587 * by metaslab_preload_limit. If a metaslab is being forced 1588 * to condense then we preload it too. This will ensure 1589 * that force condensing happens in the next txg. 1590 */ 1591 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1592 msp = msp_next; 1593 continue; 1594 } 1595 1596 /* 1597 * We must drop the metaslab group lock here to preserve 1598 * lock ordering with the ms_lock (when grabbing both 1599 * the mg_lock and the ms_lock, the ms_lock must be taken 1600 * first). 
As a result, it is possible that the ordering 1601 * of the metaslabs within the avl tree may change before 1602 * we reacquire the lock. The metaslab cannot be removed from 1603 * the tree while we're in syncing context so it is safe to 1604 * drop the mg_lock here. If the metaslabs are reordered 1605 * nothing will break -- we just may end up loading a 1606 * less than optimal one. 1607 */ 1608 mutex_exit(&mg->mg_lock); 1609 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1610 msp, TQ_SLEEP) != NULL); 1611 mutex_enter(&mg->mg_lock); 1612 msp = msp_next; 1613 } 1614 mutex_exit(&mg->mg_lock); 1615 } 1616 1617 /* 1618 * Determine if the space map's on-disk footprint is past our tolerance 1619 * for inefficiency. We would like to use the following criteria to make 1620 * our decision: 1621 * 1622 * 1. The size of the space map object should not dramatically increase as a 1623 * result of writing out the free space range tree. 1624 * 1625 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 1626 * times the size of the free space range tree representation 1627 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 1628 * 1629 * 3. The on-disk size of the space map should actually decrease. 1630 * 1631 * Checking the first condition is tricky since we don't want to walk 1632 * the entire AVL tree calculating the estimated on-disk size. Instead we 1633 * use the size-ordered range tree in the metaslab and calculate the 1634 * size required to write out the largest segment in our free tree. If the 1635 * size required to represent that segment on disk is larger than the space 1636 * map object then we avoid condensing this map. 1637 * 1638 * To determine the second criterion we use a best-case estimate and assume 1639 * each segment can be represented on-disk as a single 64-bit entry. We refer 1640 * to this best-case estimate as the space map's minimal form. 1641 * 1642 * Unfortunately, we cannot compute the on-disk size of the space map in this 1643 * context because we cannot accurately compute the effects of compression, etc. 1644 * Instead, we apply the heuristic described in the block comment for 1645 * zfs_metaslab_condense_block_threshold - we only condense if the space used 1646 * is greater than a threshold number of blocks. 1647 */ 1648 static boolean_t 1649 metaslab_should_condense(metaslab_t *msp) 1650 { 1651 space_map_t *sm = msp->ms_sm; 1652 range_seg_t *rs; 1653 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1654 dmu_object_info_t doi; 1655 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1656 1657 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1658 ASSERT(msp->ms_loaded); 1659 1660 /* 1661 * Use the ms_size_tree range tree, which is ordered by size, to 1662 * obtain the largest segment in the free tree. We always condense 1663 * metaslabs that are empty and metaslabs for which a condense 1664 * request has been made. 1665 */ 1666 rs = avl_last(&msp->ms_size_tree); 1667 if (rs == NULL || msp->ms_condense_wanted) 1668 return (B_TRUE); 1669 1670 /* 1671 * Calculate the number of 64-bit entries this segment would 1672 * require when written to disk. If this single segment would be 1673 * larger on-disk than the entire current on-disk structure, then 1674 * clearly condensing will increase the on-disk structure size.
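 *
 * To make the thresholds below concrete (assuming a 4K space map block
 * size, which is not spelled out here): with the default
 * zfs_condense_pct of 200, the current on-disk space map must be at
 * least twice the size of its minimal one-entry-per-segment form and
 * must occupy more than zfs_metaslab_condense_block_threshold (4)
 * blocks, i.e. more than 16K, before we bother condensing it.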
1675 */ 1676 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1677 entries = size / (MIN(size, SM_RUN_MAX)); 1678 segsz = entries * sizeof (uint64_t); 1679 1680 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1681 object_size = space_map_length(msp->ms_sm); 1682 1683 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1684 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1685 1686 return (segsz <= object_size && 1687 object_size >= (optimal_size * zfs_condense_pct / 100) && 1688 object_size > zfs_metaslab_condense_block_threshold * record_size); 1689 } 1690 1691 /* 1692 * Condense the on-disk space map representation to its minimized form. 1693 * The minimized form consists of a small number of allocations followed by 1694 * the entries of the free range tree. 1695 */ 1696 static void 1697 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1698 { 1699 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1700 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1701 range_tree_t *condense_tree; 1702 space_map_t *sm = msp->ms_sm; 1703 1704 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1705 ASSERT3U(spa_sync_pass(spa), ==, 1); 1706 ASSERT(msp->ms_loaded); 1707 1708 1709 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 1710 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 1711 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 1712 msp->ms_group->mg_vd->vdev_spa->spa_name, 1713 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), 1714 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1715 1716 msp->ms_condense_wanted = B_FALSE; 1717 1718 /* 1719 * Create a range tree that is 100% allocated. We remove segments 1720 * that have been freed in this txg, any deferred frees that exist, 1721 * and any allocation in the future. Removing segments should be 1722 * a relatively inexpensive operation since we expect these trees to 1723 * have a small number of nodes. 1724 */ 1725 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1726 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1727 1728 /* 1729 * Remove what's been freed in this txg from the condense_tree. 1730 * Since we're in sync_pass 1, we know that all the frees from 1731 * this txg are in the freetree. 1732 */ 1733 range_tree_walk(freetree, range_tree_remove, condense_tree); 1734 1735 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1736 range_tree_walk(msp->ms_defertree[t], 1737 range_tree_remove, condense_tree); 1738 } 1739 1740 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1741 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1742 range_tree_remove, condense_tree); 1743 } 1744 1745 /* 1746 * We're about to drop the metaslab's lock thus allowing 1747 * other consumers to change its content. Set the 1748 * metaslab's ms_condensing flag to ensure that 1749 * allocations on this metaslab do not occur while we're 1750 * in the middle of committing it to disk. This is only critical 1751 * for the ms_tree as all other range trees use per txg 1752 * views of their content. 1753 */ 1754 msp->ms_condensing = B_TRUE; 1755 1756 mutex_exit(&msp->ms_lock); 1757 space_map_truncate(sm, tx); 1758 mutex_enter(&msp->ms_lock); 1759 1760 /* 1761 * While we would ideally like to create a space_map representation 1762 * that consists only of allocation records, doing so can be 1763 * prohibitively expensive because the in-core free tree can be 1764 * large, and therefore computationally expensive to subtract 1765 * from the condense_tree.
Instead we sync out two trees, a cheap 1766 * allocation only tree followed by the in-core free tree. While not 1767 * optimal, this is typically close to optimal, and much cheaper to 1768 * compute. 1769 */ 1770 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1771 range_tree_vacate(condense_tree, NULL, NULL); 1772 range_tree_destroy(condense_tree); 1773 1774 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1775 msp->ms_condensing = B_FALSE; 1776 } 1777 1778 /* 1779 * Write a metaslab to disk in the context of the specified transaction group. 1780 */ 1781 void 1782 metaslab_sync(metaslab_t *msp, uint64_t txg) 1783 { 1784 metaslab_group_t *mg = msp->ms_group; 1785 vdev_t *vd = mg->mg_vd; 1786 spa_t *spa = vd->vdev_spa; 1787 objset_t *mos = spa_meta_objset(spa); 1788 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1789 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1790 range_tree_t **freed_tree = 1791 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1792 dmu_tx_t *tx; 1793 uint64_t object = space_map_object(msp->ms_sm); 1794 1795 ASSERT(!vd->vdev_ishole); 1796 1797 /* 1798 * This metaslab has just been added so there's no work to do now. 1799 */ 1800 if (*freetree == NULL) { 1801 ASSERT3P(alloctree, ==, NULL); 1802 return; 1803 } 1804 1805 ASSERT3P(alloctree, !=, NULL); 1806 ASSERT3P(*freetree, !=, NULL); 1807 ASSERT3P(*freed_tree, !=, NULL); 1808 1809 /* 1810 * Normally, we don't want to process a metaslab if there 1811 * are no allocations or frees to perform. However, if the metaslab 1812 * is being forced to condense we need to let it through. 1813 */ 1814 if (range_tree_space(alloctree) == 0 && 1815 range_tree_space(*freetree) == 0 && 1816 !msp->ms_condense_wanted) 1817 return; 1818 1819 /* 1820 * The only state that can actually be changing concurrently with 1821 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1822 * be modifying this txg's alloctree, freetree, freed_tree, or 1823 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy 1824 * space_map ASSERTs. We drop it whenever we call into the DMU, 1825 * because the DMU can call down to us (e.g. via zio_free()) at 1826 * any time. 1827 */ 1828 1829 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1830 1831 if (msp->ms_sm == NULL) { 1832 uint64_t new_object; 1833 1834 new_object = space_map_alloc(mos, tx); 1835 VERIFY3U(new_object, !=, 0); 1836 1837 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1838 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1839 &msp->ms_lock)); 1840 ASSERT(msp->ms_sm != NULL); 1841 } 1842 1843 mutex_enter(&msp->ms_lock); 1844 1845 /* 1846 * Note: metaslab_condense() clears the space_map's histogram. 1847 * Therefore we must verify and remove this histogram before 1848 * condensing. 1849 */ 1850 metaslab_group_histogram_verify(mg); 1851 metaslab_class_histogram_verify(mg->mg_class); 1852 metaslab_group_histogram_remove(mg, msp); 1853 1854 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1855 metaslab_should_condense(msp)) { 1856 metaslab_condense(msp, txg, tx); 1857 } else { 1858 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1859 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1860 } 1861 1862 if (msp->ms_loaded) { 1863 /* 1864 * When the space map is loaded, we have an accurate 1865 * histogram in the range tree. This gives us an opportunity 1866 * to bring the space map's histogram up-to-date so we clear 1867 * it first before updating it.
1868 */ 1869 space_map_histogram_clear(msp->ms_sm); 1870 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1871 } else { 1872 /* 1873 * Since the space map is not loaded, we simply update the 1874 * existing histogram with what was freed in this txg. This 1875 * means that the on-disk histogram may not have an accurate 1876 * view of the free space, but it's close enough to allow 1877 * us to make allocation decisions. 1878 */ 1879 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1880 } 1881 metaslab_group_histogram_add(mg, msp); 1882 metaslab_group_histogram_verify(mg); 1883 metaslab_class_histogram_verify(mg->mg_class); 1884 1885 /* 1886 * For sync pass 1, we avoid traversing this txg's free range tree 1887 * and instead just swap the pointers for freetree and 1888 * freed_tree. We can safely do this since the freed_tree is 1889 * guaranteed to be empty on the initial pass. 1890 */ 1891 if (spa_sync_pass(spa) == 1) { 1892 range_tree_swap(freetree, freed_tree); 1893 } else { 1894 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1895 } 1896 range_tree_vacate(alloctree, NULL, NULL); 1897 1898 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1899 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1900 1901 mutex_exit(&msp->ms_lock); 1902 1903 if (object != space_map_object(msp->ms_sm)) { 1904 object = space_map_object(msp->ms_sm); 1905 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 1906 msp->ms_id, sizeof (uint64_t), &object, tx); 1907 } 1908 dmu_tx_commit(tx); 1909 } 1910 1911 /* 1912 * Called after a transaction group has completely synced to mark 1913 * all of the metaslab's free space as usable. 1914 */ 1915 void 1916 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 1917 { 1918 metaslab_group_t *mg = msp->ms_group; 1919 vdev_t *vd = mg->mg_vd; 1920 range_tree_t **freed_tree; 1921 range_tree_t **defer_tree; 1922 int64_t alloc_delta, defer_delta; 1923 1924 ASSERT(!vd->vdev_ishole); 1925 1926 mutex_enter(&msp->ms_lock); 1927 1928 /* 1929 * If this metaslab is just becoming available, initialize its 1930 * alloctrees, freetrees, and defertrees, and add its capacity to 1931 * the vdev. 1932 */ 1933 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 1934 for (int t = 0; t < TXG_SIZE; t++) { 1935 ASSERT(msp->ms_alloctree[t] == NULL); 1936 ASSERT(msp->ms_freetree[t] == NULL); 1937 1938 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 1939 &msp->ms_lock); 1940 msp->ms_freetree[t] = range_tree_create(NULL, msp, 1941 &msp->ms_lock); 1942 } 1943 1944 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1945 ASSERT(msp->ms_defertree[t] == NULL); 1946 1947 msp->ms_defertree[t] = range_tree_create(NULL, msp, 1948 &msp->ms_lock); 1949 } 1950 1951 vdev_space_update(vd, 0, 0, msp->ms_size); 1952 } 1953 1954 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1955 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 1956 1957 alloc_delta = space_map_alloc_delta(msp->ms_sm); 1958 defer_delta = range_tree_space(*freed_tree) - 1959 range_tree_space(*defer_tree); 1960 1961 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 1962 1963 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1964 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1965 1966 /* 1967 * If there's a metaslab_load() in progress, wait for it to complete 1968 * so that we have a consistent view of the in-core space map.
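 * (A load may be in flight here even though we are in syncing
 * context, because metaslab preloading runs asynchronously from the
 * metaslab group's taskq; see metaslab_group_preload().)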
1969 */ 1970 metaslab_load_wait(msp); 1971 1972 /* 1973 * Move the frees from the defer_tree back to the free 1974 * range tree (if it's loaded). Swap the freed_tree and the 1975 * defer_tree -- this is safe to do because we've just emptied out 1976 * the defer_tree. 1977 */ 1978 range_tree_vacate(*defer_tree, 1979 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 1980 range_tree_swap(freed_tree, defer_tree); 1981 1982 space_map_update(msp->ms_sm); 1983 1984 msp->ms_deferspace += defer_delta; 1985 ASSERT3S(msp->ms_deferspace, >=, 0); 1986 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 1987 if (msp->ms_deferspace != 0) { 1988 /* 1989 * Keep syncing this metaslab until all deferred frees 1990 * are back in circulation. 1991 */ 1992 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1993 } 1994 1995 if (msp->ms_loaded && msp->ms_access_txg < txg) { 1996 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1997 VERIFY0(range_tree_space( 1998 msp->ms_alloctree[(txg + t) & TXG_MASK])); 1999 } 2000 2001 if (!metaslab_debug_unload) 2002 metaslab_unload(msp); 2003 } 2004 2005 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2006 mutex_exit(&msp->ms_lock); 2007 } 2008 2009 void 2010 metaslab_sync_reassess(metaslab_group_t *mg) 2011 { 2012 metaslab_group_alloc_update(mg); 2013 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2014 2015 /* 2016 * Preload the next potential metaslabs 2017 */ 2018 metaslab_group_preload(mg); 2019 } 2020 2021 static uint64_t 2022 metaslab_distance(metaslab_t *msp, dva_t *dva) 2023 { 2024 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2025 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2026 uint64_t start = msp->ms_id; 2027 2028 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2029 return (1ULL << 63); 2030 2031 if (offset < start) 2032 return ((start - offset) << ms_shift); 2033 if (offset > start) 2034 return ((offset - start) << ms_shift); 2035 return (0); 2036 } 2037 2038 static uint64_t 2039 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2040 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2041 { 2042 spa_t *spa = mg->mg_vd->vdev_spa; 2043 metaslab_t *msp = NULL; 2044 uint64_t offset = -1ULL; 2045 avl_tree_t *t = &mg->mg_metaslab_tree; 2046 uint64_t activation_weight; 2047 uint64_t target_distance; 2048 int i; 2049 2050 activation_weight = METASLAB_WEIGHT_PRIMARY; 2051 for (i = 0; i < d; i++) { 2052 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2053 activation_weight = METASLAB_WEIGHT_SECONDARY; 2054 break; 2055 } 2056 } 2057 2058 for (;;) { 2059 boolean_t was_active; 2060 2061 mutex_enter(&mg->mg_lock); 2062 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2063 if (msp->ms_weight < asize) { 2064 spa_dbgmsg(spa, "%s: failed to meet weight " 2065 "requirement: vdev %llu, txg %llu, mg %p, " 2066 "msp %p, psize %llu, asize %llu, " 2067 "weight %llu", spa_name(spa), 2068 mg->mg_vd->vdev_id, txg, 2069 mg, msp, psize, asize, msp->ms_weight); 2070 mutex_exit(&mg->mg_lock); 2071 return (-1ULL); 2072 } 2073 2074 /* 2075 * If the selected metaslab is condensing, skip it. 2076 */ 2077 if (msp->ms_condensing) 2078 continue; 2079 2080 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2081 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2082 break; 2083 2084 target_distance = min_distance + 2085 (space_map_allocated(msp->ms_sm) != 0 ? 
0 : 2086 min_distance >> 1); 2087 2088 for (i = 0; i < d; i++) 2089 if (metaslab_distance(msp, &dva[i]) < 2090 target_distance) 2091 break; 2092 if (i == d) 2093 break; 2094 } 2095 mutex_exit(&mg->mg_lock); 2096 if (msp == NULL) 2097 return (-1ULL); 2098 2099 mutex_enter(&msp->ms_lock); 2100 2101 /* 2102 * Ensure that the metaslab we have selected is still 2103 * capable of handling our request. It's possible that 2104 * another thread may have changed the weight while we 2105 * were blocked on the metaslab lock. 2106 */ 2107 if (msp->ms_weight < asize || (was_active && 2108 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2109 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2110 mutex_exit(&msp->ms_lock); 2111 continue; 2112 } 2113 2114 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2115 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2116 metaslab_passivate(msp, 2117 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2118 mutex_exit(&msp->ms_lock); 2119 continue; 2120 } 2121 2122 if (metaslab_activate(msp, activation_weight) != 0) { 2123 mutex_exit(&msp->ms_lock); 2124 continue; 2125 } 2126 2127 /* 2128 * If this metaslab is currently condensing then pick again as 2129 * we can't manipulate this metaslab until it's committed 2130 * to disk. 2131 */ 2132 if (msp->ms_condensing) { 2133 mutex_exit(&msp->ms_lock); 2134 continue; 2135 } 2136 2137 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2138 break; 2139 2140 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2141 mutex_exit(&msp->ms_lock); 2142 } 2143 2144 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2145 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2146 2147 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2148 msp->ms_access_txg = txg + metaslab_unload_delay; 2149 2150 mutex_exit(&msp->ms_lock); 2151 2152 return (offset); 2153 } 2154 2155 /* 2156 * Allocate a block for the specified i/o. 2157 */ 2158 static int 2159 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2160 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2161 { 2162 metaslab_group_t *mg, *rotor; 2163 vdev_t *vd; 2164 int dshift = 3; 2165 int all_zero; 2166 int zio_lock = B_FALSE; 2167 boolean_t allocatable; 2168 uint64_t offset = -1ULL; 2169 uint64_t asize; 2170 uint64_t distance; 2171 2172 ASSERT(!DVA_IS_VALID(&dva[d])); 2173 2174 /* 2175 * For testing, make some blocks above a certain size be gang blocks. 2176 */ 2177 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2178 return (SET_ERROR(ENOSPC)); 2179 2180 /* 2181 * Start at the rotor and loop through all mgs until we find something. 2182 * Note that there's no locking on mc_rotor or mc_aliquot because 2183 * nothing actually breaks if we miss a few updates -- we just won't 2184 * allocate quite as evenly. It all balances out over time. 2185 * 2186 * If we are doing ditto or log blocks, try to spread them across 2187 * consecutive vdevs. If we're forced to reuse a vdev before we've 2188 * allocated all of our ditto blocks, then try and spread them out on 2189 * that vdev as much as possible. If it turns out to not be possible, 2190 * gradually lower our standards until anything becomes acceptable. 2191 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2192 * gives us hope of containing our fault domains to something we're 2193 * able to reason about. Otherwise, any two top-level vdev failures 2194 * will guarantee the loss of data. 
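 * (For example, with 10 top-level vdevs and two copies of a block,
 * random placement leaves some block exposed to any of the
 * C(10,2) = 45 possible pairs of vdev failures, while consecutive
 * placement exposes only the 10 adjacent pairs.)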
With consecutive allocation, 2195 * only two adjacent top-level vdev failures will result in data loss. 2196 * 2197 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2198 * ourselves on the same vdev as our gang block header. That 2199 * way, we can hope for locality in vdev_cache, plus it makes our 2200 * fault domains something tractable. 2201 */ 2202 if (hintdva) { 2203 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2204 2205 /* 2206 * It's possible the vdev we're using as the hint no 2207 * longer exists (i.e. removed). Consult the rotor when 2208 * all else fails. 2209 */ 2210 if (vd != NULL) { 2211 mg = vd->vdev_mg; 2212 2213 if (flags & METASLAB_HINTBP_AVOID && 2214 mg->mg_next != NULL) 2215 mg = mg->mg_next; 2216 } else { 2217 mg = mc->mc_rotor; 2218 } 2219 } else if (d != 0) { 2220 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2221 mg = vd->vdev_mg->mg_next; 2222 } else { 2223 mg = mc->mc_rotor; 2224 } 2225 2226 /* 2227 * If the hint put us into the wrong metaslab class, or into a 2228 * metaslab group that has been passivated, just follow the rotor. 2229 */ 2230 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2231 mg = mc->mc_rotor; 2232 2233 rotor = mg; 2234 top: 2235 all_zero = B_TRUE; 2236 do { 2237 ASSERT(mg->mg_activation_count == 1); 2238 2239 vd = mg->mg_vd; 2240 2241 /* 2242 * Don't allocate from faulted devices. 2243 */ 2244 if (zio_lock) { 2245 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2246 allocatable = vdev_allocatable(vd); 2247 spa_config_exit(spa, SCL_ZIO, FTAG); 2248 } else { 2249 allocatable = vdev_allocatable(vd); 2250 } 2251 2252 /* 2253 * Determine if the selected metaslab group is eligible 2254 * for allocations. If we're ganging or have requested 2255 * an allocation for the smallest gang block size, 2256 * then we don't want to avoid allocating to this 2257 * metaslab group. If we're in this condition, we should 2258 * try to allocate from any device possible so that we 2259 * don't inadvertently return ENOSPC and suspend the pool 2260 * even though space is still available. 2261 */ 2262 if (allocatable && CAN_FASTGANG(flags) && 2263 psize > SPA_GANGBLOCKSIZE) 2264 allocatable = metaslab_group_allocatable(mg); 2265 2266 if (!allocatable) 2267 goto next; 2268 2269 /* 2270 * Avoid writing single-copy data to a failing vdev 2271 * unless the user instructs us that it is okay. 2272 */ 2273 if ((vd->vdev_stat.vs_write_errors > 0 || 2274 vd->vdev_state < VDEV_STATE_HEALTHY) && 2275 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2276 all_zero = B_FALSE; 2277 goto next; 2278 } 2279 2280 ASSERT(mg->mg_class == mc); 2281 2282 distance = vd->vdev_asize >> dshift; 2283 if (distance <= (1ULL << vd->vdev_ms_shift)) 2284 distance = 0; 2285 else 2286 all_zero = B_FALSE; 2287 2288 asize = vdev_psize_to_asize(vd, psize); 2289 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2290 2291 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2292 dva, d); 2293 if (offset != -1ULL) { 2294 /* 2295 * If we've just selected this metaslab group, 2296 * figure out whether the corresponding vdev is 2297 * over- or under-used relative to the pool, 2298 * and set an allocation bias to even it out.
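 * (The bias only changes how long we keep allocating from this
 * group: mc_aliquot accumulates the bytes allocated here, and once
 * it reaches mg_aliquot + mg_bias the rotor advances to the next
 * group.)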
2299 */ 2300 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2301 vdev_stat_t *vs = &vd->vdev_stat; 2302 int64_t vu, cu; 2303 2304 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2305 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2306 2307 /* 2308 * Calculate how much more or less we should 2309 * try to allocate from this device during 2310 * this iteration around the rotor. 2311 * For example, if a device is 80% full 2312 * and the pool is 20% full then we should 2313 * reduce allocations by 60% on this device. 2314 * 2315 * mg_bias = (20 - 80) * 512K / 100 = -307K 2316 * 2317 * This reduces allocations by 307K for this 2318 * iteration. 2319 */ 2320 mg->mg_bias = ((cu - vu) * 2321 (int64_t)mg->mg_aliquot) / 100; 2322 } else if (!metaslab_bias_enabled) { 2323 mg->mg_bias = 0; 2324 } 2325 2326 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2327 mg->mg_aliquot + mg->mg_bias) { 2328 mc->mc_rotor = mg->mg_next; 2329 mc->mc_aliquot = 0; 2330 } 2331 2332 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2333 DVA_SET_OFFSET(&dva[d], offset); 2334 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2335 DVA_SET_ASIZE(&dva[d], asize); 2336 2337 return (0); 2338 } 2339 next: 2340 mc->mc_rotor = mg->mg_next; 2341 mc->mc_aliquot = 0; 2342 } while ((mg = mg->mg_next) != rotor); 2343 2344 if (!all_zero) { 2345 dshift++; 2346 ASSERT(dshift < 64); 2347 goto top; 2348 } 2349 2350 if (!allocatable && !zio_lock) { 2351 dshift = 3; 2352 zio_lock = B_TRUE; 2353 goto top; 2354 } 2355 2356 bzero(&dva[d], sizeof (dva_t)); 2357 2358 return (SET_ERROR(ENOSPC)); 2359 } 2360 2361 /* 2362 * Free the block represented by DVA in the context of the specified 2363 * transaction group. 2364 */ 2365 static void 2366 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2367 { 2368 uint64_t vdev = DVA_GET_VDEV(dva); 2369 uint64_t offset = DVA_GET_OFFSET(dva); 2370 uint64_t size = DVA_GET_ASIZE(dva); 2371 vdev_t *vd; 2372 metaslab_t *msp; 2373 2374 ASSERT(DVA_IS_VALID(dva)); 2375 2376 if (txg > spa_freeze_txg(spa)) 2377 return; 2378 2379 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2380 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2381 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2382 (u_longlong_t)vdev, (u_longlong_t)offset); 2383 ASSERT(0); 2384 return; 2385 } 2386 2387 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2388 2389 if (DVA_GET_GANG(dva)) 2390 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2391 2392 mutex_enter(&msp->ms_lock); 2393 2394 if (now) { 2395 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2396 offset, size); 2397 2398 VERIFY(!msp->ms_condensing); 2399 VERIFY3U(offset, >=, msp->ms_start); 2400 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2401 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2402 msp->ms_size); 2403 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2404 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2405 range_tree_add(msp->ms_tree, offset, size); 2406 } else { 2407 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2408 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2409 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2410 offset, size); 2411 } 2412 2413 mutex_exit(&msp->ms_lock); 2414 } 2415 2416 /* 2417 * Intent log support: upon opening the pool after a crash, notify the SPA 2418 * of blocks that the intent log has allocated for immediate write, but 2419 * which are still considered free by the SPA because the last transaction 2420 * group didn't commit yet. 
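 * (Claiming such a block removes it from ms_tree and, if the pool is
 * writeable, re-adds it to the current txg's alloctree as if it had
 * just been allocated, so the next sync records it in the space map.
 * A txg of 0 requests a dry run that only checks whether the range
 * is still free.)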
2421 */ 2422 static int 2423 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2424 { 2425 uint64_t vdev = DVA_GET_VDEV(dva); 2426 uint64_t offset = DVA_GET_OFFSET(dva); 2427 uint64_t size = DVA_GET_ASIZE(dva); 2428 vdev_t *vd; 2429 metaslab_t *msp; 2430 int error = 0; 2431 2432 ASSERT(DVA_IS_VALID(dva)); 2433 2434 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2435 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2436 return (SET_ERROR(ENXIO)); 2437 2438 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2439 2440 if (DVA_GET_GANG(dva)) 2441 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2442 2443 mutex_enter(&msp->ms_lock); 2444 2445 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2446 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2447 2448 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2449 error = SET_ERROR(ENOENT); 2450 2451 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2452 mutex_exit(&msp->ms_lock); 2453 return (error); 2454 } 2455 2456 VERIFY(!msp->ms_condensing); 2457 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2458 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2459 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2460 range_tree_remove(msp->ms_tree, offset, size); 2461 2462 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2463 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2464 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2465 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2466 } 2467 2468 mutex_exit(&msp->ms_lock); 2469 2470 return (0); 2471 } 2472 2473 int 2474 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2475 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2476 { 2477 dva_t *dva = bp->blk_dva; 2478 dva_t *hintdva = hintbp->blk_dva; 2479 int error = 0; 2480 2481 ASSERT(bp->blk_birth == 0); 2482 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2483 2484 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2485 2486 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2487 spa_config_exit(spa, SCL_ALLOC, FTAG); 2488 return (SET_ERROR(ENOSPC)); 2489 } 2490 2491 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2492 ASSERT(BP_GET_NDVAS(bp) == 0); 2493 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2494 2495 for (int d = 0; d < ndvas; d++) { 2496 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2497 txg, flags); 2498 if (error != 0) { 2499 for (d--; d >= 0; d--) { 2500 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2501 bzero(&dva[d], sizeof (dva_t)); 2502 } 2503 spa_config_exit(spa, SCL_ALLOC, FTAG); 2504 return (error); 2505 } 2506 } 2507 ASSERT(error == 0); 2508 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2509 2510 spa_config_exit(spa, SCL_ALLOC, FTAG); 2511 2512 BP_SET_BIRTH(bp, txg, txg); 2513 2514 return (0); 2515 } 2516 2517 void 2518 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2519 { 2520 const dva_t *dva = bp->blk_dva; 2521 int ndvas = BP_GET_NDVAS(bp); 2522 2523 ASSERT(!BP_IS_HOLE(bp)); 2524 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2525 2526 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2527 2528 for (int d = 0; d < ndvas; d++) 2529 metaslab_free_dva(spa, &dva[d], txg, now); 2530 2531 spa_config_exit(spa, SCL_FREE, FTAG); 2532 } 2533 2534 int 2535 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2536 { 2537 const dva_t *dva = bp->blk_dva; 2538 int ndvas = BP_GET_NDVAS(bp); 2539 int error = 0; 2540 2541 ASSERT(!BP_IS_HOLE(bp)); 
2542 2543 if (txg != 0) { 2544 /* 2545 * First do a dry run to make sure all DVAs are claimable, 2546 * so we don't have to unwind from partial failures below. 2547 */ 2548 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2549 return (error); 2550 } 2551 2552 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2553 2554 for (int d = 0; d < ndvas; d++) 2555 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2556 break; 2557 2558 spa_config_exit(spa, SCL_ALLOC, FTAG); 2559 2560 ASSERT(error == 0 || txg == 0); 2561 2562 return (error); 2563 } 2564 2565 void 2566 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2567 { 2568 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2569 return; 2570 2571 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2572 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2573 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2574 vdev_t *vd = vdev_lookup_top(spa, vdev); 2575 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2576 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2577 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2578 2579 if (msp->ms_loaded) 2580 range_tree_verify(msp->ms_tree, offset, size); 2581 2582 for (int j = 0; j < TXG_SIZE; j++) 2583 range_tree_verify(msp->ms_freetree[j], offset, size); 2584 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2585 range_tree_verify(msp->ms_defertree[j], offset, size); 2586 } 2587 spa_config_exit(spa, SCL_VDEV, FTAG); 2588 } 2589
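/*
 * Worked example of the free-space bookkeeping above, assuming
 * TXG_DEFER_SIZE == 2: a block freed in txg T (metaslab_free_dva()
 * with now == B_FALSE) is added to ms_freetree[T & TXG_MASK].
 * metaslab_sync() for txg T writes that tree to the space map as
 * free space and, in sync pass 1, swaps it into the freed slot.
 * metaslab_sync_done() for txg T then moves it into
 * ms_defertree[T % TXG_DEFER_SIZE], and the space only returns to
 * ms_tree -- and so becomes allocatable again -- when that defer
 * slot is vacated during metaslab_sync_done() for txg T + 2.
 */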