1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/space_map.h> 32 #include <sys/metaslab_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio.h> 35 #include <sys/spa_impl.h> 36 #include <sys/zfeature.h> 37 38 /* 39 * Allow allocations to switch to gang blocks quickly. We do this to 40 * avoid having to load lots of space_maps in a given txg. There are, 41 * however, some cases where we want to avoid "fast" ganging and instead 42 * we want to do an exhaustive search of all metaslabs on this device. 43 * Currently we don't allow any gang, slog, or dump device related allocations 44 * to "fast" gang. 45 */ 46 #define CAN_FASTGANG(flags) \ 47 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 48 METASLAB_GANG_AVOID))) 49 50 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 51 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 52 #define METASLAB_ACTIVE_MASK \ 53 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 54 55 uint64_t metaslab_aliquot = 512ULL << 10; 56 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 57 58 /* 59 * The in-core space map representation is more compact than its on-disk form. 60 * The zfs_condense_pct determines how much more compact the in-core 61 * space_map representation must be before we compact it on-disk. 62 * Values should be greater than or equal to 100. 63 */ 64 int zfs_condense_pct = 200; 65 66 /* 67 * Condensing a metaslab is not guaranteed to actually reduce the amount of 68 * space used on disk. In particular, a space map uses data in increments of 69 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 70 * same number of blocks after condensing. Since the goal of condensing is to 71 * reduce the number of IOPs required to read the space map, we only want to 72 * condense when we can be sure we will reduce the number of blocks used by the 73 * space map. Unfortunately, we cannot precisely compute whether or not this is 74 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 75 * we apply the following heuristic: do not condense a spacemap unless the 76 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 77 * blocks. 78 */ 79 int zfs_metaslab_condense_block_threshold = 4; 80 81 /* 82 * The zfs_mg_noalloc_threshold defines which metaslab groups should 83 * be eligible for allocation. 
 * The value is defined as a percentage of free space. Metaslab groups
 * that have more free space than zfs_mg_noalloc_threshold are always
 * eligible for allocations. Once a metaslab group's free space is less
 * than or equal to the zfs_mg_noalloc_threshold the allocator will avoid
 * allocating to that group unless all groups in the pool have reached
 * zfs_mg_noalloc_threshold. Once all groups in the pool reach
 * zfs_mg_noalloc_threshold then all groups are allowed to accept
 * allocations. Gang blocks are always eligible to allocate on any
 * metaslab group. The default value of 0 means no metaslab group will
 * be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set, load all metaslabs when the pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set, prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
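 * When enabled, allocations are biased toward metaslab groups whose vdevs
 * have proportionally more free space, so that devices in the pool fill at
 * roughly the same rate.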
179 */ 180 boolean_t metaslab_bias_enabled = B_TRUE; 181 182 static uint64_t metaslab_fragmentation(metaslab_t *); 183 184 /* 185 * ========================================================================== 186 * Metaslab classes 187 * ========================================================================== 188 */ 189 metaslab_class_t * 190 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 191 { 192 metaslab_class_t *mc; 193 194 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 195 196 mc->mc_spa = spa; 197 mc->mc_rotor = NULL; 198 mc->mc_ops = ops; 199 200 return (mc); 201 } 202 203 void 204 metaslab_class_destroy(metaslab_class_t *mc) 205 { 206 ASSERT(mc->mc_rotor == NULL); 207 ASSERT(mc->mc_alloc == 0); 208 ASSERT(mc->mc_deferred == 0); 209 ASSERT(mc->mc_space == 0); 210 ASSERT(mc->mc_dspace == 0); 211 212 kmem_free(mc, sizeof (metaslab_class_t)); 213 } 214 215 int 216 metaslab_class_validate(metaslab_class_t *mc) 217 { 218 metaslab_group_t *mg; 219 vdev_t *vd; 220 221 /* 222 * Must hold one of the spa_config locks. 223 */ 224 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 225 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 226 227 if ((mg = mc->mc_rotor) == NULL) 228 return (0); 229 230 do { 231 vd = mg->mg_vd; 232 ASSERT(vd->vdev_mg != NULL); 233 ASSERT3P(vd->vdev_top, ==, vd); 234 ASSERT3P(mg->mg_class, ==, mc); 235 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 236 } while ((mg = mg->mg_next) != mc->mc_rotor); 237 238 return (0); 239 } 240 241 void 242 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 243 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 244 { 245 atomic_add_64(&mc->mc_alloc, alloc_delta); 246 atomic_add_64(&mc->mc_deferred, defer_delta); 247 atomic_add_64(&mc->mc_space, space_delta); 248 atomic_add_64(&mc->mc_dspace, dspace_delta); 249 } 250 251 uint64_t 252 metaslab_class_get_alloc(metaslab_class_t *mc) 253 { 254 return (mc->mc_alloc); 255 } 256 257 uint64_t 258 metaslab_class_get_deferred(metaslab_class_t *mc) 259 { 260 return (mc->mc_deferred); 261 } 262 263 uint64_t 264 metaslab_class_get_space(metaslab_class_t *mc) 265 { 266 return (mc->mc_space); 267 } 268 269 uint64_t 270 metaslab_class_get_dspace(metaslab_class_t *mc) 271 { 272 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 273 } 274 275 void 276 metaslab_class_histogram_verify(metaslab_class_t *mc) 277 { 278 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 279 uint64_t *mc_hist; 280 int i; 281 282 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 283 return; 284 285 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 286 KM_SLEEP); 287 288 for (int c = 0; c < rvd->vdev_children; c++) { 289 vdev_t *tvd = rvd->vdev_child[c]; 290 metaslab_group_t *mg = tvd->vdev_mg; 291 292 /* 293 * Skip any holes, uninitialized top-levels, or 294 * vdevs that are not in this metalab class. 295 */ 296 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 297 mg->mg_class != mc) { 298 continue; 299 } 300 301 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 302 mc_hist[i] += mg->mg_histogram[i]; 303 } 304 305 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 306 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 307 308 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 309 } 310 311 /* 312 * Calculate the metaslab class's fragmentation metric. The metric 313 * is weighted based on the space contribution of each metaslab group. 
314 * The return value will be a number between 0 and 100 (inclusive), or 315 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 316 * zfs_frag_table for more information about the metric. 317 */ 318 uint64_t 319 metaslab_class_fragmentation(metaslab_class_t *mc) 320 { 321 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 322 uint64_t fragmentation = 0; 323 324 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 325 326 for (int c = 0; c < rvd->vdev_children; c++) { 327 vdev_t *tvd = rvd->vdev_child[c]; 328 metaslab_group_t *mg = tvd->vdev_mg; 329 330 /* 331 * Skip any holes, uninitialized top-levels, or 332 * vdevs that are not in this metalab class. 333 */ 334 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 335 mg->mg_class != mc) { 336 continue; 337 } 338 339 /* 340 * If a metaslab group does not contain a fragmentation 341 * metric then just bail out. 342 */ 343 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 344 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 345 return (ZFS_FRAG_INVALID); 346 } 347 348 /* 349 * Determine how much this metaslab_group is contributing 350 * to the overall pool fragmentation metric. 351 */ 352 fragmentation += mg->mg_fragmentation * 353 metaslab_group_get_space(mg); 354 } 355 fragmentation /= metaslab_class_get_space(mc); 356 357 ASSERT3U(fragmentation, <=, 100); 358 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 359 return (fragmentation); 360 } 361 362 /* 363 * Calculate the amount of expandable space that is available in 364 * this metaslab class. If a device is expanded then its expandable 365 * space will be the amount of allocatable space that is currently not 366 * part of this metaslab class. 367 */ 368 uint64_t 369 metaslab_class_expandable_space(metaslab_class_t *mc) 370 { 371 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 372 uint64_t space = 0; 373 374 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 375 for (int c = 0; c < rvd->vdev_children; c++) { 376 vdev_t *tvd = rvd->vdev_child[c]; 377 metaslab_group_t *mg = tvd->vdev_mg; 378 379 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 380 mg->mg_class != mc) { 381 continue; 382 } 383 384 /* 385 * Calculate if we have enough space to add additional 386 * metaslabs. We report the expandable space in terms 387 * of the metaslab size since that's the unit of expansion. 388 */ 389 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, 390 1ULL << tvd->vdev_ms_shift); 391 } 392 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 393 return (space); 394 } 395 396 /* 397 * ========================================================================== 398 * Metaslab groups 399 * ========================================================================== 400 */ 401 static int 402 metaslab_compare(const void *x1, const void *x2) 403 { 404 const metaslab_t *m1 = x1; 405 const metaslab_t *m2 = x2; 406 407 if (m1->ms_weight < m2->ms_weight) 408 return (1); 409 if (m1->ms_weight > m2->ms_weight) 410 return (-1); 411 412 /* 413 * If the weights are identical, use the offset to force uniqueness. 414 */ 415 if (m1->ms_start < m2->ms_start) 416 return (-1); 417 if (m1->ms_start > m2->ms_start) 418 return (1); 419 420 ASSERT3P(m1, ==, m2); 421 422 return (0); 423 } 424 425 /* 426 * Update the allocatable flag and the metaslab group's capacity. 427 * The allocatable flag is set to true if the capacity is below 428 * the zfs_mg_noalloc_threshold. 
If a metaslab group transitions 429 * from allocatable to non-allocatable or vice versa then the metaslab 430 * group's class is updated to reflect the transition. 431 */ 432 static void 433 metaslab_group_alloc_update(metaslab_group_t *mg) 434 { 435 vdev_t *vd = mg->mg_vd; 436 metaslab_class_t *mc = mg->mg_class; 437 vdev_stat_t *vs = &vd->vdev_stat; 438 boolean_t was_allocatable; 439 440 ASSERT(vd == vd->vdev_top); 441 442 mutex_enter(&mg->mg_lock); 443 was_allocatable = mg->mg_allocatable; 444 445 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 446 (vs->vs_space + 1); 447 448 /* 449 * A metaslab group is considered allocatable if it has plenty 450 * of free space or is not heavily fragmented. We only take 451 * fragmentation into account if the metaslab group has a valid 452 * fragmentation metric (i.e. a value between 0 and 100). 453 */ 454 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 455 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 456 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 457 458 /* 459 * The mc_alloc_groups maintains a count of the number of 460 * groups in this metaslab class that are still above the 461 * zfs_mg_noalloc_threshold. This is used by the allocating 462 * threads to determine if they should avoid allocations to 463 * a given group. The allocator will avoid allocations to a group 464 * if that group has reached or is below the zfs_mg_noalloc_threshold 465 * and there are still other groups that are above the threshold. 466 * When a group transitions from allocatable to non-allocatable or 467 * vice versa we update the metaslab class to reflect that change. 468 * When the mc_alloc_groups value drops to 0 that means that all 469 * groups have reached the zfs_mg_noalloc_threshold making all groups 470 * eligible for allocations. This effectively means that all devices 471 * are balanced again. 472 */ 473 if (was_allocatable && !mg->mg_allocatable) 474 mc->mc_alloc_groups--; 475 else if (!was_allocatable && mg->mg_allocatable) 476 mc->mc_alloc_groups++; 477 478 mutex_exit(&mg->mg_lock); 479 } 480 481 metaslab_group_t * 482 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 483 { 484 metaslab_group_t *mg; 485 486 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 487 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 488 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 489 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 490 mg->mg_vd = vd; 491 mg->mg_class = mc; 492 mg->mg_activation_count = 0; 493 494 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 495 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 496 497 return (mg); 498 } 499 500 void 501 metaslab_group_destroy(metaslab_group_t *mg) 502 { 503 ASSERT(mg->mg_prev == NULL); 504 ASSERT(mg->mg_next == NULL); 505 /* 506 * We may have gone below zero with the activation count 507 * either because we never activated in the first place or 508 * because we're done, and possibly removing the vdev. 
509 */ 510 ASSERT(mg->mg_activation_count <= 0); 511 512 taskq_destroy(mg->mg_taskq); 513 avl_destroy(&mg->mg_metaslab_tree); 514 mutex_destroy(&mg->mg_lock); 515 kmem_free(mg, sizeof (metaslab_group_t)); 516 } 517 518 void 519 metaslab_group_activate(metaslab_group_t *mg) 520 { 521 metaslab_class_t *mc = mg->mg_class; 522 metaslab_group_t *mgprev, *mgnext; 523 524 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 525 526 ASSERT(mc->mc_rotor != mg); 527 ASSERT(mg->mg_prev == NULL); 528 ASSERT(mg->mg_next == NULL); 529 ASSERT(mg->mg_activation_count <= 0); 530 531 if (++mg->mg_activation_count <= 0) 532 return; 533 534 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 535 metaslab_group_alloc_update(mg); 536 537 if ((mgprev = mc->mc_rotor) == NULL) { 538 mg->mg_prev = mg; 539 mg->mg_next = mg; 540 } else { 541 mgnext = mgprev->mg_next; 542 mg->mg_prev = mgprev; 543 mg->mg_next = mgnext; 544 mgprev->mg_next = mg; 545 mgnext->mg_prev = mg; 546 } 547 mc->mc_rotor = mg; 548 } 549 550 void 551 metaslab_group_passivate(metaslab_group_t *mg) 552 { 553 metaslab_class_t *mc = mg->mg_class; 554 metaslab_group_t *mgprev, *mgnext; 555 556 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 557 558 if (--mg->mg_activation_count != 0) { 559 ASSERT(mc->mc_rotor != mg); 560 ASSERT(mg->mg_prev == NULL); 561 ASSERT(mg->mg_next == NULL); 562 ASSERT(mg->mg_activation_count < 0); 563 return; 564 } 565 566 taskq_wait(mg->mg_taskq); 567 metaslab_group_alloc_update(mg); 568 569 mgprev = mg->mg_prev; 570 mgnext = mg->mg_next; 571 572 if (mg == mgnext) { 573 mc->mc_rotor = NULL; 574 } else { 575 mc->mc_rotor = mgnext; 576 mgprev->mg_next = mgnext; 577 mgnext->mg_prev = mgprev; 578 } 579 580 mg->mg_prev = NULL; 581 mg->mg_next = NULL; 582 } 583 584 uint64_t 585 metaslab_group_get_space(metaslab_group_t *mg) 586 { 587 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 588 } 589 590 void 591 metaslab_group_histogram_verify(metaslab_group_t *mg) 592 { 593 uint64_t *mg_hist; 594 vdev_t *vd = mg->mg_vd; 595 uint64_t ashift = vd->vdev_ashift; 596 int i; 597 598 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 599 return; 600 601 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 602 KM_SLEEP); 603 604 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 605 SPACE_MAP_HISTOGRAM_SIZE + ashift); 606 607 for (int m = 0; m < vd->vdev_ms_count; m++) { 608 metaslab_t *msp = vd->vdev_ms[m]; 609 610 if (msp->ms_sm == NULL) 611 continue; 612 613 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 614 mg_hist[i + ashift] += 615 msp->ms_sm->sm_phys->smp_histogram[i]; 616 } 617 618 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 619 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 620 621 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 622 } 623 624 static void 625 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 626 { 627 metaslab_class_t *mc = mg->mg_class; 628 uint64_t ashift = mg->mg_vd->vdev_ashift; 629 630 ASSERT(MUTEX_HELD(&msp->ms_lock)); 631 if (msp->ms_sm == NULL) 632 return; 633 634 mutex_enter(&mg->mg_lock); 635 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 636 mg->mg_histogram[i + ashift] += 637 msp->ms_sm->sm_phys->smp_histogram[i]; 638 mc->mc_histogram[i + ashift] += 639 msp->ms_sm->sm_phys->smp_histogram[i]; 640 } 641 mutex_exit(&mg->mg_lock); 642 } 643 644 void 645 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 646 { 647 metaslab_class_t *mc = mg->mg_class; 648 uint64_t ashift = 
mg->mg_vd->vdev_ashift; 649 650 ASSERT(MUTEX_HELD(&msp->ms_lock)); 651 if (msp->ms_sm == NULL) 652 return; 653 654 mutex_enter(&mg->mg_lock); 655 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 656 ASSERT3U(mg->mg_histogram[i + ashift], >=, 657 msp->ms_sm->sm_phys->smp_histogram[i]); 658 ASSERT3U(mc->mc_histogram[i + ashift], >=, 659 msp->ms_sm->sm_phys->smp_histogram[i]); 660 661 mg->mg_histogram[i + ashift] -= 662 msp->ms_sm->sm_phys->smp_histogram[i]; 663 mc->mc_histogram[i + ashift] -= 664 msp->ms_sm->sm_phys->smp_histogram[i]; 665 } 666 mutex_exit(&mg->mg_lock); 667 } 668 669 static void 670 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 671 { 672 ASSERT(msp->ms_group == NULL); 673 mutex_enter(&mg->mg_lock); 674 msp->ms_group = mg; 675 msp->ms_weight = 0; 676 avl_add(&mg->mg_metaslab_tree, msp); 677 mutex_exit(&mg->mg_lock); 678 679 mutex_enter(&msp->ms_lock); 680 metaslab_group_histogram_add(mg, msp); 681 mutex_exit(&msp->ms_lock); 682 } 683 684 static void 685 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 686 { 687 mutex_enter(&msp->ms_lock); 688 metaslab_group_histogram_remove(mg, msp); 689 mutex_exit(&msp->ms_lock); 690 691 mutex_enter(&mg->mg_lock); 692 ASSERT(msp->ms_group == mg); 693 avl_remove(&mg->mg_metaslab_tree, msp); 694 msp->ms_group = NULL; 695 mutex_exit(&mg->mg_lock); 696 } 697 698 static void 699 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 700 { 701 /* 702 * Although in principle the weight can be any value, in 703 * practice we do not use values in the range [1, 511]. 704 */ 705 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 706 ASSERT(MUTEX_HELD(&msp->ms_lock)); 707 708 mutex_enter(&mg->mg_lock); 709 ASSERT(msp->ms_group == mg); 710 avl_remove(&mg->mg_metaslab_tree, msp); 711 msp->ms_weight = weight; 712 avl_add(&mg->mg_metaslab_tree, msp); 713 mutex_exit(&mg->mg_lock); 714 } 715 716 /* 717 * Calculate the fragmentation for a given metaslab group. We can use 718 * a simple average here since all metaslabs within the group must have 719 * the same size. The return value will be a value between 0 and 100 720 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 721 * group have a fragmentation metric. 722 */ 723 uint64_t 724 metaslab_group_fragmentation(metaslab_group_t *mg) 725 { 726 vdev_t *vd = mg->mg_vd; 727 uint64_t fragmentation = 0; 728 uint64_t valid_ms = 0; 729 730 for (int m = 0; m < vd->vdev_ms_count; m++) { 731 metaslab_t *msp = vd->vdev_ms[m]; 732 733 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 734 continue; 735 736 valid_ms++; 737 fragmentation += msp->ms_fragmentation; 738 } 739 740 if (valid_ms <= vd->vdev_ms_count / 2) 741 return (ZFS_FRAG_INVALID); 742 743 fragmentation /= valid_ms; 744 ASSERT3U(fragmentation, <=, 100); 745 return (fragmentation); 746 } 747 748 /* 749 * Determine if a given metaslab group should skip allocations. A metaslab 750 * group should avoid allocations if its free capacity is less than the 751 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 752 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 753 * that can still handle allocations. 754 */ 755 static boolean_t 756 metaslab_group_allocatable(metaslab_group_t *mg) 757 { 758 vdev_t *vd = mg->mg_vd; 759 spa_t *spa = vd->vdev_spa; 760 metaslab_class_t *mc = mg->mg_class; 761 762 /* 763 * We use two key metrics to determine if a metaslab group is 764 * considered allocatable -- free space and fragmentation. 
If 765 * the free space is greater than the free space threshold and 766 * the fragmentation is less than the fragmentation threshold then 767 * consider the group allocatable. There are two case when we will 768 * not consider these key metrics. The first is if the group is 769 * associated with a slog device and the second is if all groups 770 * in this metaslab class have already been consider ineligible 771 * for allocations. 772 */ 773 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 774 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 775 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 776 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 777 } 778 779 /* 780 * ========================================================================== 781 * Range tree callbacks 782 * ========================================================================== 783 */ 784 785 /* 786 * Comparison function for the private size-ordered tree. Tree is sorted 787 * by size, larger sizes at the end of the tree. 788 */ 789 static int 790 metaslab_rangesize_compare(const void *x1, const void *x2) 791 { 792 const range_seg_t *r1 = x1; 793 const range_seg_t *r2 = x2; 794 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 795 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 796 797 if (rs_size1 < rs_size2) 798 return (-1); 799 if (rs_size1 > rs_size2) 800 return (1); 801 802 if (r1->rs_start < r2->rs_start) 803 return (-1); 804 805 if (r1->rs_start > r2->rs_start) 806 return (1); 807 808 return (0); 809 } 810 811 /* 812 * Create any block allocator specific components. The current allocators 813 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 814 */ 815 static void 816 metaslab_rt_create(range_tree_t *rt, void *arg) 817 { 818 metaslab_t *msp = arg; 819 820 ASSERT3P(rt->rt_arg, ==, msp); 821 ASSERT(msp->ms_tree == NULL); 822 823 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 824 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 825 } 826 827 /* 828 * Destroy the block allocator specific components. 829 */ 830 static void 831 metaslab_rt_destroy(range_tree_t *rt, void *arg) 832 { 833 metaslab_t *msp = arg; 834 835 ASSERT3P(rt->rt_arg, ==, msp); 836 ASSERT3P(msp->ms_tree, ==, rt); 837 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 838 839 avl_destroy(&msp->ms_size_tree); 840 } 841 842 static void 843 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 844 { 845 metaslab_t *msp = arg; 846 847 ASSERT3P(rt->rt_arg, ==, msp); 848 ASSERT3P(msp->ms_tree, ==, rt); 849 VERIFY(!msp->ms_condensing); 850 avl_add(&msp->ms_size_tree, rs); 851 } 852 853 static void 854 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 855 { 856 metaslab_t *msp = arg; 857 858 ASSERT3P(rt->rt_arg, ==, msp); 859 ASSERT3P(msp->ms_tree, ==, rt); 860 VERIFY(!msp->ms_condensing); 861 avl_remove(&msp->ms_size_tree, rs); 862 } 863 864 static void 865 metaslab_rt_vacate(range_tree_t *rt, void *arg) 866 { 867 metaslab_t *msp = arg; 868 869 ASSERT3P(rt->rt_arg, ==, msp); 870 ASSERT3P(msp->ms_tree, ==, rt); 871 872 /* 873 * Normally one would walk the tree freeing nodes along the way. 874 * Since the nodes are shared with the range trees we can avoid 875 * walking all nodes and just reinitialize the avl tree. The nodes 876 * will be freed by the range tree, so we don't want to free them here. 
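	 * Reinitializing the tree with avl_create() below only resets the
	 * AVL bookkeeping; it does not visit or free the range_seg_t nodes
	 * themselves.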
877 */ 878 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 879 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 880 } 881 882 static range_tree_ops_t metaslab_rt_ops = { 883 metaslab_rt_create, 884 metaslab_rt_destroy, 885 metaslab_rt_add, 886 metaslab_rt_remove, 887 metaslab_rt_vacate 888 }; 889 890 /* 891 * ========================================================================== 892 * Metaslab block operations 893 * ========================================================================== 894 */ 895 896 /* 897 * Return the maximum contiguous segment within the metaslab. 898 */ 899 uint64_t 900 metaslab_block_maxsize(metaslab_t *msp) 901 { 902 avl_tree_t *t = &msp->ms_size_tree; 903 range_seg_t *rs; 904 905 if (t == NULL || (rs = avl_last(t)) == NULL) 906 return (0ULL); 907 908 return (rs->rs_end - rs->rs_start); 909 } 910 911 uint64_t 912 metaslab_block_alloc(metaslab_t *msp, uint64_t size) 913 { 914 uint64_t start; 915 range_tree_t *rt = msp->ms_tree; 916 917 VERIFY(!msp->ms_condensing); 918 919 start = msp->ms_ops->msop_alloc(msp, size); 920 if (start != -1ULL) { 921 vdev_t *vd = msp->ms_group->mg_vd; 922 923 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 924 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 925 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 926 range_tree_remove(rt, start, size); 927 } 928 return (start); 929 } 930 931 /* 932 * ========================================================================== 933 * Common allocator routines 934 * ========================================================================== 935 */ 936 937 /* 938 * This is a helper function that can be used by the allocator to find 939 * a suitable block to allocate. This will search the specified AVL 940 * tree looking for a block that matches the specified criteria. 941 */ 942 static uint64_t 943 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 944 uint64_t align) 945 { 946 range_seg_t *rs, rsearch; 947 avl_index_t where; 948 949 rsearch.rs_start = *cursor; 950 rsearch.rs_end = *cursor + size; 951 952 rs = avl_find(t, &rsearch, &where); 953 if (rs == NULL) 954 rs = avl_nearest(t, where, AVL_AFTER); 955 956 while (rs != NULL) { 957 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 958 959 if (offset + size <= rs->rs_end) { 960 *cursor = offset + size; 961 return (offset); 962 } 963 rs = AVL_NEXT(t, rs); 964 } 965 966 /* 967 * If we know we've searched the whole map (*cursor == 0), give up. 968 * Otherwise, reset the cursor to the beginning and try again. 969 */ 970 if (*cursor == 0) 971 return (-1ULL); 972 973 *cursor = 0; 974 return (metaslab_block_picker(t, cursor, size, align)); 975 } 976 977 /* 978 * ========================================================================== 979 * The first-fit block allocator 980 * ========================================================================== 981 */ 982 static uint64_t 983 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 984 { 985 /* 986 * Find the largest power of 2 block size that evenly divides the 987 * requested size. This is used to try to allocate blocks with similar 988 * alignment from the same area of the metaslab (i.e. same cursor 989 * bucket) but it does not guarantee that other allocations sizes 990 * may exist in the same region. 
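	 * For example, a 24K request has align = 24K & -24K = 8K, so it
	 * shares the 8K cursor bucket with other requests whose sizes are
	 * multiples of 8K but not of 16K.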
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_tree->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_size_tree;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
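 * For example, if the largest free segment is [1M, 1M+256K), the cursor is
 * set to 1M and cursor_end to 1M+256K; allocations are carved from the front
 * of that segment until it can no longer satisfy a request, at which point
 * the then-largest segment is selected.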
1058 * ========================================================================== 1059 */ 1060 static uint64_t 1061 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1062 { 1063 range_tree_t *rt = msp->ms_tree; 1064 avl_tree_t *t = &msp->ms_size_tree; 1065 uint64_t *cursor = &msp->ms_lbas[0]; 1066 uint64_t *cursor_end = &msp->ms_lbas[1]; 1067 uint64_t offset = 0; 1068 1069 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1070 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1071 1072 ASSERT3U(*cursor_end, >=, *cursor); 1073 1074 if ((*cursor + size) > *cursor_end) { 1075 range_seg_t *rs; 1076 1077 rs = avl_last(&msp->ms_size_tree); 1078 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1079 return (-1ULL); 1080 1081 *cursor = rs->rs_start; 1082 *cursor_end = rs->rs_end; 1083 } 1084 1085 offset = *cursor; 1086 *cursor += size; 1087 1088 return (offset); 1089 } 1090 1091 static metaslab_ops_t metaslab_cf_ops = { 1092 metaslab_cf_alloc 1093 }; 1094 1095 /* 1096 * ========================================================================== 1097 * New dynamic fit allocator - 1098 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1099 * contiguous blocks. If no region is found then just use the largest segment 1100 * that remains. 1101 * ========================================================================== 1102 */ 1103 1104 /* 1105 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1106 * to request from the allocator. 1107 */ 1108 uint64_t metaslab_ndf_clump_shift = 4; 1109 1110 static uint64_t 1111 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1112 { 1113 avl_tree_t *t = &msp->ms_tree->rt_root; 1114 avl_index_t where; 1115 range_seg_t *rs, rsearch; 1116 uint64_t hbit = highbit64(size); 1117 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1118 uint64_t max_size = metaslab_block_maxsize(msp); 1119 1120 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1121 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1122 1123 if (max_size < size) 1124 return (-1ULL); 1125 1126 rsearch.rs_start = *cursor; 1127 rsearch.rs_end = *cursor + size; 1128 1129 rs = avl_find(t, &rsearch, &where); 1130 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1131 t = &msp->ms_size_tree; 1132 1133 rsearch.rs_start = 0; 1134 rsearch.rs_end = MIN(max_size, 1135 1ULL << (hbit + metaslab_ndf_clump_shift)); 1136 rs = avl_find(t, &rsearch, &where); 1137 if (rs == NULL) 1138 rs = avl_nearest(t, where, AVL_AFTER); 1139 ASSERT(rs != NULL); 1140 } 1141 1142 if ((rs->rs_end - rs->rs_start) >= size) { 1143 *cursor = rs->rs_start + size; 1144 return (rs->rs_start); 1145 } 1146 return (-1ULL); 1147 } 1148 1149 static metaslab_ops_t metaslab_ndf_ops = { 1150 metaslab_ndf_alloc 1151 }; 1152 1153 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1154 1155 /* 1156 * ========================================================================== 1157 * Metaslabs 1158 * ========================================================================== 1159 */ 1160 1161 /* 1162 * Wait for any in-progress metaslab loads to complete. 
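 * Callers must hold ms_lock; cv_wait() drops the lock while sleeping on
 * ms_load_cv and reacquires it before returning.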
1163 */ 1164 void 1165 metaslab_load_wait(metaslab_t *msp) 1166 { 1167 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1168 1169 while (msp->ms_loading) { 1170 ASSERT(!msp->ms_loaded); 1171 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1172 } 1173 } 1174 1175 int 1176 metaslab_load(metaslab_t *msp) 1177 { 1178 int error = 0; 1179 1180 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1181 ASSERT(!msp->ms_loaded); 1182 ASSERT(!msp->ms_loading); 1183 1184 msp->ms_loading = B_TRUE; 1185 1186 /* 1187 * If the space map has not been allocated yet, then treat 1188 * all the space in the metaslab as free and add it to the 1189 * ms_tree. 1190 */ 1191 if (msp->ms_sm != NULL) 1192 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1193 else 1194 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1195 1196 msp->ms_loaded = (error == 0); 1197 msp->ms_loading = B_FALSE; 1198 1199 if (msp->ms_loaded) { 1200 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1201 range_tree_walk(msp->ms_defertree[t], 1202 range_tree_remove, msp->ms_tree); 1203 } 1204 } 1205 cv_broadcast(&msp->ms_load_cv); 1206 return (error); 1207 } 1208 1209 void 1210 metaslab_unload(metaslab_t *msp) 1211 { 1212 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1213 range_tree_vacate(msp->ms_tree, NULL, NULL); 1214 msp->ms_loaded = B_FALSE; 1215 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1216 } 1217 1218 int 1219 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1220 metaslab_t **msp) 1221 { 1222 vdev_t *vd = mg->mg_vd; 1223 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1224 metaslab_t *ms; 1225 int error; 1226 1227 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1228 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1229 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1230 ms->ms_id = id; 1231 ms->ms_start = id << vd->vdev_ms_shift; 1232 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1233 1234 /* 1235 * We only open space map objects that already exist. All others 1236 * will be opened when we finally allocate an object for it. 1237 */ 1238 if (object != 0) { 1239 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1240 ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1241 1242 if (error != 0) { 1243 kmem_free(ms, sizeof (metaslab_t)); 1244 return (error); 1245 } 1246 1247 ASSERT(ms->ms_sm != NULL); 1248 } 1249 1250 /* 1251 * We create the main range tree here, but we don't create the 1252 * alloctree and freetree until metaslab_sync_done(). This serves 1253 * two purposes: it allows metaslab_sync_done() to detect the 1254 * addition of new space; and for debugging, it ensures that we'd 1255 * data fault on any attempt to use this metaslab before it's ready. 1256 */ 1257 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1258 metaslab_group_add(mg, ms); 1259 1260 ms->ms_fragmentation = metaslab_fragmentation(ms); 1261 ms->ms_ops = mg->mg_class->mc_ops; 1262 1263 /* 1264 * If we're opening an existing pool (txg == 0) or creating 1265 * a new one (txg == TXG_INITIAL), all space is available now. 1266 * If we're adding space to an existing pool, the new space 1267 * does not become available until after this txg has synced. 1268 */ 1269 if (txg <= TXG_INITIAL) 1270 metaslab_sync_done(ms, 0); 1271 1272 /* 1273 * If metaslab_debug_load is set and we're initializing a metaslab 1274 * that has an allocated space_map object then load the its space 1275 * map so that can verify frees. 
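	 * Normally the space map is only loaded lazily, when the metaslab is
	 * first activated or preloaded; this debug tunable trades memory and
	 * pool-open time for the extra verification.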
1276 */ 1277 if (metaslab_debug_load && ms->ms_sm != NULL) { 1278 mutex_enter(&ms->ms_lock); 1279 VERIFY0(metaslab_load(ms)); 1280 mutex_exit(&ms->ms_lock); 1281 } 1282 1283 if (txg != 0) { 1284 vdev_dirty(vd, 0, NULL, txg); 1285 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1286 } 1287 1288 *msp = ms; 1289 1290 return (0); 1291 } 1292 1293 void 1294 metaslab_fini(metaslab_t *msp) 1295 { 1296 metaslab_group_t *mg = msp->ms_group; 1297 1298 metaslab_group_remove(mg, msp); 1299 1300 mutex_enter(&msp->ms_lock); 1301 1302 VERIFY(msp->ms_group == NULL); 1303 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1304 0, -msp->ms_size); 1305 space_map_close(msp->ms_sm); 1306 1307 metaslab_unload(msp); 1308 range_tree_destroy(msp->ms_tree); 1309 1310 for (int t = 0; t < TXG_SIZE; t++) { 1311 range_tree_destroy(msp->ms_alloctree[t]); 1312 range_tree_destroy(msp->ms_freetree[t]); 1313 } 1314 1315 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1316 range_tree_destroy(msp->ms_defertree[t]); 1317 } 1318 1319 ASSERT0(msp->ms_deferspace); 1320 1321 mutex_exit(&msp->ms_lock); 1322 cv_destroy(&msp->ms_load_cv); 1323 mutex_destroy(&msp->ms_lock); 1324 1325 kmem_free(msp, sizeof (metaslab_t)); 1326 } 1327 1328 #define FRAGMENTATION_TABLE_SIZE 17 1329 1330 /* 1331 * This table defines a segment size based fragmentation metric that will 1332 * allow each metaslab to derive its own fragmentation value. This is done 1333 * by calculating the space in each bucket of the spacemap histogram and 1334 * multiplying that by the fragmetation metric in this table. Doing 1335 * this for all buckets and dividing it by the total amount of free 1336 * space in this metaslab (i.e. the total free space in all buckets) gives 1337 * us the fragmentation metric. This means that a high fragmentation metric 1338 * equates to most of the free space being comprised of small segments. 1339 * Conversely, if the metric is low, then most of the free space is in 1340 * large segments. A 10% change in fragmentation equates to approximately 1341 * double the number of segments. 1342 * 1343 * This table defines 0% fragmented space using 16MB segments. Testing has 1344 * shown that segments that are greater than or equal to 16MB do not suffer 1345 * from drastic performance problems. Using this value, we derive the rest 1346 * of the table. Since the fragmentation value is never stored on disk, it 1347 * is possible to change these calculations in the future. 1348 */ 1349 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1350 100, /* 512B */ 1351 100, /* 1K */ 1352 98, /* 2K */ 1353 95, /* 4K */ 1354 90, /* 8K */ 1355 80, /* 16K */ 1356 70, /* 32K */ 1357 60, /* 64K */ 1358 50, /* 128K */ 1359 40, /* 256K */ 1360 30, /* 512K */ 1361 20, /* 1M */ 1362 15, /* 2M */ 1363 10, /* 4M */ 1364 5, /* 8M */ 1365 0 /* 16M */ 1366 }; 1367 1368 /* 1369 * Calclate the metaslab's fragmentation metric. A return value 1370 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1371 * not support this metric. Otherwise, the return value should be in the 1372 * range [0, 100]. 1373 */ 1374 static uint64_t 1375 metaslab_fragmentation(metaslab_t *msp) 1376 { 1377 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1378 uint64_t fragmentation = 0; 1379 uint64_t total = 0; 1380 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1381 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1382 1383 if (!feature_enabled) 1384 return (ZFS_FRAG_INVALID); 1385 1386 /* 1387 * A null space map means that the entire metaslab is free 1388 * and thus is not fragmented. 
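	 * Otherwise the metric is the table-weighted average of the space
	 * map histogram described above; e.g. 1GB of free space in 8K
	 * segments (factor 90) plus 1GB in 1M segments (factor 20) yields
	 * (90 + 20) / 2 = 55.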
1389 */ 1390 if (msp->ms_sm == NULL) 1391 return (0); 1392 1393 /* 1394 * If this metaslab's space_map has not been upgraded, flag it 1395 * so that we upgrade next time we encounter it. 1396 */ 1397 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1398 uint64_t txg = spa_syncing_txg(spa); 1399 vdev_t *vd = msp->ms_group->mg_vd; 1400 1401 if (spa_writeable(spa)) { 1402 msp->ms_condense_wanted = B_TRUE; 1403 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1404 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1405 "msp %p, vd %p", txg, msp, vd); 1406 } 1407 return (ZFS_FRAG_INVALID); 1408 } 1409 1410 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1411 uint64_t space = 0; 1412 uint8_t shift = msp->ms_sm->sm_shift; 1413 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1414 FRAGMENTATION_TABLE_SIZE - 1); 1415 1416 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1417 continue; 1418 1419 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1420 total += space; 1421 1422 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1423 fragmentation += space * zfs_frag_table[idx]; 1424 } 1425 1426 if (total > 0) 1427 fragmentation /= total; 1428 ASSERT3U(fragmentation, <=, 100); 1429 return (fragmentation); 1430 } 1431 1432 /* 1433 * Compute a weight -- a selection preference value -- for the given metaslab. 1434 * This is based on the amount of free space, the level of fragmentation, 1435 * the LBA range, and whether the metaslab is loaded. 1436 */ 1437 static uint64_t 1438 metaslab_weight(metaslab_t *msp) 1439 { 1440 metaslab_group_t *mg = msp->ms_group; 1441 vdev_t *vd = mg->mg_vd; 1442 uint64_t weight, space; 1443 1444 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1445 1446 /* 1447 * This vdev is in the process of being removed so there is nothing 1448 * for us to do here. 1449 */ 1450 if (vd->vdev_removing) { 1451 ASSERT0(space_map_allocated(msp->ms_sm)); 1452 ASSERT0(vd->vdev_ms_shift); 1453 return (0); 1454 } 1455 1456 /* 1457 * The baseline weight is the metaslab's free space. 1458 */ 1459 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1460 1461 msp->ms_fragmentation = metaslab_fragmentation(msp); 1462 if (metaslab_fragmentation_factor_enabled && 1463 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1464 /* 1465 * Use the fragmentation information to inversely scale 1466 * down the baseline weight. We need to ensure that we 1467 * don't exclude this metaslab completely when it's 100% 1468 * fragmented. To avoid this we reduce the fragmented value 1469 * by 1. 1470 */ 1471 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1472 1473 /* 1474 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1475 * this metaslab again. The fragmentation metric may have 1476 * decreased the space to something smaller than 1477 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1478 * so that we can consume any remaining space. 1479 */ 1480 if (space > 0 && space < SPA_MINBLOCKSIZE) 1481 space = SPA_MINBLOCKSIZE; 1482 } 1483 weight = space; 1484 1485 /* 1486 * Modern disks have uniform bit density and constant angular velocity. 1487 * Therefore, the outer recording zones are faster (higher bandwidth) 1488 * than the inner zones by the ratio of outer to inner track diameter, 1489 * which is typically around 2:1. We account for this by assigning 1490 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1491 * In effect, this means that we'll select the metaslab with the most 1492 * free bandwidth rather than simply the one with the most free space. 
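	 * For example, on a vdev with 200 metaslabs, metaslab 0 receives a
	 * 2.0x multiplier, metaslab 100 roughly 1.5x, and the last metaslab
	 * just over 1.0x.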
1493 */ 1494 if (metaslab_lba_weighting_enabled) { 1495 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1496 ASSERT(weight >= space && weight <= 2 * space); 1497 } 1498 1499 /* 1500 * If this metaslab is one we're actively using, adjust its 1501 * weight to make it preferable to any inactive metaslab so 1502 * we'll polish it off. If the fragmentation on this metaslab 1503 * has exceed our threshold, then don't mark it active. 1504 */ 1505 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1506 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1507 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1508 } 1509 1510 return (weight); 1511 } 1512 1513 static int 1514 metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1515 { 1516 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1517 1518 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1519 metaslab_load_wait(msp); 1520 if (!msp->ms_loaded) { 1521 int error = metaslab_load(msp); 1522 if (error) { 1523 metaslab_group_sort(msp->ms_group, msp, 0); 1524 return (error); 1525 } 1526 } 1527 1528 metaslab_group_sort(msp->ms_group, msp, 1529 msp->ms_weight | activation_weight); 1530 } 1531 ASSERT(msp->ms_loaded); 1532 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1533 1534 return (0); 1535 } 1536 1537 static void 1538 metaslab_passivate(metaslab_t *msp, uint64_t size) 1539 { 1540 /* 1541 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1542 * this metaslab again. In that case, it had better be empty, 1543 * or we would be leaving space on the table. 1544 */ 1545 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1546 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1547 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1548 } 1549 1550 static void 1551 metaslab_preload(void *arg) 1552 { 1553 metaslab_t *msp = arg; 1554 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1555 1556 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1557 1558 mutex_enter(&msp->ms_lock); 1559 metaslab_load_wait(msp); 1560 if (!msp->ms_loaded) 1561 (void) metaslab_load(msp); 1562 1563 /* 1564 * Set the ms_access_txg value so that we don't unload it right away. 1565 */ 1566 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1567 mutex_exit(&msp->ms_lock); 1568 } 1569 1570 static void 1571 metaslab_group_preload(metaslab_group_t *mg) 1572 { 1573 spa_t *spa = mg->mg_vd->vdev_spa; 1574 metaslab_t *msp; 1575 avl_tree_t *t = &mg->mg_metaslab_tree; 1576 int m = 0; 1577 1578 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1579 taskq_wait(mg->mg_taskq); 1580 return; 1581 } 1582 1583 mutex_enter(&mg->mg_lock); 1584 /* 1585 * Load the next potential metaslabs 1586 */ 1587 msp = avl_first(t); 1588 while (msp != NULL) { 1589 metaslab_t *msp_next = AVL_NEXT(t, msp); 1590 1591 /* 1592 * We preload only the maximum number of metaslabs specified 1593 * by metaslab_preload_limit. If a metaslab is being forced 1594 * to condense then we preload it too. This will ensure 1595 * that force condensing happens in the next txg. 1596 */ 1597 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1598 msp = msp_next; 1599 continue; 1600 } 1601 1602 /* 1603 * We must drop the metaslab group lock here to preserve 1604 * lock ordering with the ms_lock (when grabbing both 1605 * the mg_lock and the ms_lock, the ms_lock must be taken 1606 * first). 
		 * As a result, it is possible that the ordering
		 * of the metaslabs within the avl tree may change before
		 * we reacquire the lock. The metaslab cannot be removed from
		 * the tree while we're in syncing context so it is safe to
		 * drop the mg_lock here. If the metaslabs are reordered
		 * nothing will break -- we just may end up loading a
		 * less than optimal one.
		 */
		mutex_exit(&mg->mg_lock);
		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
		    msp, TQ_SLEEP) != NULL);
		mutex_enter(&mg->mg_lock);
		msp = msp_next;
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Determine if the space map's on-disk footprint is past our tolerance
 * for inefficiency. We would like to use the following criteria to make
 * our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 *    result of writing out the free space range tree.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 *    times the size of the free space range tree representation
 *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
 *
 * 3. The on-disk size of the space map should actually decrease.
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered range tree in the metaslab and calculate the
 * size required to write out the largest segment in our free tree. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
 */
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	range_seg_t *rs;
	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
	dmu_object_info_t doi;
	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);

	/*
	 * Use the ms_size_tree range tree, which is ordered by size, to
	 * obtain the largest segment in the free tree. We always condense
	 * metaslabs that are empty and metaslabs for which a condense
	 * request has been made.
	 */
	rs = avl_last(&msp->ms_size_tree);
	if (rs == NULL || msp->ms_condense_wanted)
		return (B_TRUE);

	/*
	 * Calculate the number of 64-bit entries this segment would
	 * require when written to disk. If this single segment would be
	 * larger on-disk than the entire current on-disk structure, then
	 * clearly condensing will increase the on-disk structure size.
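	 * Note that a single segment may require more than one entry on
	 * disk, since each 64-bit space map entry can describe a run of at
	 * most SM_RUN_MAX blocks; the calculation below accounts for this.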
	 */
	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	entries = size / (MIN(size, SM_RUN_MAX));
	segsz = entries * sizeof (uint64_t);

	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
	object_size = space_map_length(msp->ms_sm);

	dmu_object_info_from_db(sm->sm_dbuf, &doi);
	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);

	return (segsz <= object_size &&
	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
	    object_size > zfs_metaslab_condense_block_threshold * record_size);
}

/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed by
 * the entries of the free range tree.
 */
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
	range_tree_t *condense_tree;
	space_map_t *sm = msp->ms_sm;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(msp->ms_loaded);

	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
	    msp->ms_group->mg_vd->vdev_spa->spa_name,
	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
	    msp->ms_condense_wanted ? "TRUE" : "FALSE");

	msp->ms_condense_wanted = B_FALSE;

	/*
	 * Create a range tree that is 100% allocated. We remove segments
	 * that have been freed in this txg, any deferred frees that exist,
	 * and any allocation in the future. Removing segments should be
	 * a relatively inexpensive operation since we expect these trees to
	 * have a small number of nodes.
	 */
	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);

	/*
	 * Remove what's been freed in this txg from the condense_tree.
	 * Since we're in sync_pass 1, we know that all the frees from
	 * this txg are in the freetree.
	 */
	range_tree_walk(freetree, range_tree_remove, condense_tree);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defertree[t],
		    range_tree_remove, condense_tree);
	}

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
		    range_tree_remove, condense_tree);
	}

	/*
	 * We're about to drop the metaslab's lock thus allowing
	 * other consumers to change its contents. Set the
	 * metaslab's ms_condensing flag to ensure that
	 * allocations on this metaslab do not occur while we're
	 * in the middle of committing it to disk. This is only critical
	 * for the ms_tree as all other range trees use per txg
	 * views of their content.
	 */
	msp->ms_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	space_map_truncate(sm, tx);
	mutex_enter(&msp->ms_lock);

	/*
	 * While we would ideally like to create a space_map representation
	 * that consists only of allocation records, doing so can be
	 * prohibitively expensive because the in-core free tree can be
	 * large, and therefore computationally expensive to subtract
	 * from the condense_tree.
Instead we sync out two trees, a cheap 1772 * allocation-only tree followed by the in-core free tree. While not 1773 * optimal, this is typically close to optimal, and much cheaper to 1774 * compute. 1775 */ 1776 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1777 range_tree_vacate(condense_tree, NULL, NULL); 1778 range_tree_destroy(condense_tree); 1779 1780 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1781 msp->ms_condensing = B_FALSE; 1782 } 1783 1784 /* 1785 * Write a metaslab to disk in the context of the specified transaction group. 1786 */ 1787 void 1788 metaslab_sync(metaslab_t *msp, uint64_t txg) 1789 { 1790 metaslab_group_t *mg = msp->ms_group; 1791 vdev_t *vd = mg->mg_vd; 1792 spa_t *spa = vd->vdev_spa; 1793 objset_t *mos = spa_meta_objset(spa); 1794 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1795 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1796 range_tree_t **freed_tree = 1797 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1798 dmu_tx_t *tx; 1799 uint64_t object = space_map_object(msp->ms_sm); 1800 1801 ASSERT(!vd->vdev_ishole); 1802 1803 /* 1804 * This metaslab has just been added so there's no work to do now. 1805 */ 1806 if (*freetree == NULL) { 1807 ASSERT3P(alloctree, ==, NULL); 1808 return; 1809 } 1810 1811 ASSERT3P(alloctree, !=, NULL); 1812 ASSERT3P(*freetree, !=, NULL); 1813 ASSERT3P(*freed_tree, !=, NULL); 1814 1815 /* 1816 * Normally, we don't want to process a metaslab if there 1817 * are no allocations or frees to perform. However, if the metaslab 1818 * is being forced to condense we need to let it through. 1819 */ 1820 if (range_tree_space(alloctree) == 0 && 1821 range_tree_space(*freetree) == 0 && 1822 !msp->ms_condense_wanted) 1823 return; 1824 1825 /* 1826 * The only state that can actually be changing concurrently with 1827 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1828 * be modifying this txg's alloctree, freetree, freed_tree, or 1829 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy 1830 * space_map ASSERTs. We drop it whenever we call into the DMU, 1831 * because the DMU can call down to us (e.g. via zio_free()) at 1832 * any time. 1833 */ 1834 1835 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1836 1837 if (msp->ms_sm == NULL) { 1838 uint64_t new_object; 1839 1840 new_object = space_map_alloc(mos, tx); 1841 VERIFY3U(new_object, !=, 0); 1842 1843 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1844 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1845 &msp->ms_lock)); 1846 ASSERT(msp->ms_sm != NULL); 1847 } 1848 1849 mutex_enter(&msp->ms_lock); 1850 1851 /* 1852 * Note: metaslab_condense() clears the space_map's histogram. 1853 * Therefore we must verify and remove this histogram before 1854 * condensing. 1855 */ 1856 metaslab_group_histogram_verify(mg); 1857 metaslab_class_histogram_verify(mg->mg_class); 1858 metaslab_group_histogram_remove(mg, msp); 1859 1860 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1861 metaslab_should_condense(msp)) { 1862 metaslab_condense(msp, txg, tx); 1863 } else { 1864 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1865 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1866 } 1867 1868 if (msp->ms_loaded) { 1869 /* 1870 * When the space map is loaded, we have an accurate 1871 * histogram in the range tree. This gives us an opportunity 1872 * to bring the space map's histogram up-to-date so we clear 1873 * it first before updating it.
1874 */ 1875 space_map_histogram_clear(msp->ms_sm); 1876 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1877 } else { 1878 /* 1879 * Since the space map is not loaded we simply update the 1880 * existing histogram with what was freed in this txg. This 1881 * means that the on-disk histogram may not have an accurate 1882 * view of the free space but it's close enough to allow 1883 * us to make allocation decisions. 1884 */ 1885 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1886 } 1887 metaslab_group_histogram_add(mg, msp); 1888 metaslab_group_histogram_verify(mg); 1889 metaslab_class_histogram_verify(mg->mg_class); 1890 1891 /* 1892 * For sync pass 1, we avoid traversing this txg's free range tree 1893 * and instead will just swap the pointers for freetree and 1894 * freed_tree. We can safely do this since the freed_tree is 1895 * guaranteed to be empty on the initial pass. 1896 */ 1897 if (spa_sync_pass(spa) == 1) { 1898 range_tree_swap(freetree, freed_tree); 1899 } else { 1900 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1901 } 1902 range_tree_vacate(alloctree, NULL, NULL); 1903 1904 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1905 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1906 1907 mutex_exit(&msp->ms_lock); 1908 1909 if (object != space_map_object(msp->ms_sm)) { 1910 object = space_map_object(msp->ms_sm); 1911 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 1912 msp->ms_id, sizeof (uint64_t), &object, tx); 1913 } 1914 dmu_tx_commit(tx); 1915 } 1916 1917 /* 1918 * Called after a transaction group has completely synced to mark 1919 * all of the metaslab's free space as usable. 1920 */ 1921 void 1922 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 1923 { 1924 metaslab_group_t *mg = msp->ms_group; 1925 vdev_t *vd = mg->mg_vd; 1926 range_tree_t **freed_tree; 1927 range_tree_t **defer_tree; 1928 int64_t alloc_delta, defer_delta; 1929 1930 ASSERT(!vd->vdev_ishole); 1931 1932 mutex_enter(&msp->ms_lock); 1933 1934 /* 1935 * If this metaslab is just becoming available, initialize its 1936 * alloctrees, freetrees, and defertree and add its capacity to 1937 * the vdev. 1938 */ 1939 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 1940 for (int t = 0; t < TXG_SIZE; t++) { 1941 ASSERT(msp->ms_alloctree[t] == NULL); 1942 ASSERT(msp->ms_freetree[t] == NULL); 1943 1944 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 1945 &msp->ms_lock); 1946 msp->ms_freetree[t] = range_tree_create(NULL, msp, 1947 &msp->ms_lock); 1948 } 1949 1950 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1951 ASSERT(msp->ms_defertree[t] == NULL); 1952 1953 msp->ms_defertree[t] = range_tree_create(NULL, msp, 1954 &msp->ms_lock); 1955 } 1956 1957 vdev_space_update(vd, 0, 0, msp->ms_size); 1958 } 1959 1960 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1961 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 1962 1963 alloc_delta = space_map_alloc_delta(msp->ms_sm); 1964 defer_delta = range_tree_space(*freed_tree) - 1965 range_tree_space(*defer_tree); 1966 1967 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 1968 1969 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1970 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1971 1972 /* 1973 * If there's a metaslab_load() in progress, wait for it to complete 1974 * so that we have a consistent view of the in-core space map.
1975 */ 1976 metaslab_load_wait(msp); 1977 1978 /* 1979 * Move the frees from the defer_tree back to the free 1980 * range tree (if it's loaded). Swap the freed_tree and the 1981 * defer_tree -- this is safe to do because we've just emptied out 1982 * the defer_tree. 1983 */ 1984 range_tree_vacate(*defer_tree, 1985 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 1986 range_tree_swap(freed_tree, defer_tree); 1987 1988 space_map_update(msp->ms_sm); 1989 1990 msp->ms_deferspace += defer_delta; 1991 ASSERT3S(msp->ms_deferspace, >=, 0); 1992 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 1993 if (msp->ms_deferspace != 0) { 1994 /* 1995 * Keep syncing this metaslab until all deferred frees 1996 * are back in circulation. 1997 */ 1998 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1999 } 2000 2001 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2002 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2003 VERIFY0(range_tree_space( 2004 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2005 } 2006 2007 if (!metaslab_debug_unload) 2008 metaslab_unload(msp); 2009 } 2010 2011 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2012 mutex_exit(&msp->ms_lock); 2013 } 2014 2015 void 2016 metaslab_sync_reassess(metaslab_group_t *mg) 2017 { 2018 metaslab_group_alloc_update(mg); 2019 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2020 2021 /* 2022 * Preload the next potential metaslabs 2023 */ 2024 metaslab_group_preload(mg); 2025 } 2026 2027 static uint64_t 2028 metaslab_distance(metaslab_t *msp, dva_t *dva) 2029 { 2030 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2031 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2032 uint64_t start = msp->ms_id; 2033 2034 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2035 return (1ULL << 63); 2036 2037 if (offset < start) 2038 return ((start - offset) << ms_shift); 2039 if (offset > start) 2040 return ((offset - start) << ms_shift); 2041 return (0); 2042 } 2043 2044 static uint64_t 2045 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2046 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2047 { 2048 spa_t *spa = mg->mg_vd->vdev_spa; 2049 metaslab_t *msp = NULL; 2050 uint64_t offset = -1ULL; 2051 avl_tree_t *t = &mg->mg_metaslab_tree; 2052 uint64_t activation_weight; 2053 uint64_t target_distance; 2054 int i; 2055 2056 activation_weight = METASLAB_WEIGHT_PRIMARY; 2057 for (i = 0; i < d; i++) { 2058 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2059 activation_weight = METASLAB_WEIGHT_SECONDARY; 2060 break; 2061 } 2062 } 2063 2064 for (;;) { 2065 boolean_t was_active; 2066 2067 mutex_enter(&mg->mg_lock); 2068 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2069 if (msp->ms_weight < asize) { 2070 spa_dbgmsg(spa, "%s: failed to meet weight " 2071 "requirement: vdev %llu, txg %llu, mg %p, " 2072 "msp %p, psize %llu, asize %llu, " 2073 "weight %llu", spa_name(spa), 2074 mg->mg_vd->vdev_id, txg, 2075 mg, msp, psize, asize, msp->ms_weight); 2076 mutex_exit(&mg->mg_lock); 2077 return (-1ULL); 2078 } 2079 2080 /* 2081 * If the selected metaslab is condensing, skip it. 2082 */ 2083 if (msp->ms_condensing) 2084 continue; 2085 2086 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2087 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2088 break; 2089 2090 target_distance = min_distance + 2091 (space_map_allocated(msp->ms_sm) != 0 ? 
0 : 2092 min_distance >> 1); 2093 2094 for (i = 0; i < d; i++) 2095 if (metaslab_distance(msp, &dva[i]) < 2096 target_distance) 2097 break; 2098 if (i == d) 2099 break; 2100 } 2101 mutex_exit(&mg->mg_lock); 2102 if (msp == NULL) 2103 return (-1ULL); 2104 2105 mutex_enter(&msp->ms_lock); 2106 2107 /* 2108 * Ensure that the metaslab we have selected is still 2109 * capable of handling our request. It's possible that 2110 * another thread may have changed the weight while we 2111 * were blocked on the metaslab lock. 2112 */ 2113 if (msp->ms_weight < asize || (was_active && 2114 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2115 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2116 mutex_exit(&msp->ms_lock); 2117 continue; 2118 } 2119 2120 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2121 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2122 metaslab_passivate(msp, 2123 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2124 mutex_exit(&msp->ms_lock); 2125 continue; 2126 } 2127 2128 if (metaslab_activate(msp, activation_weight) != 0) { 2129 mutex_exit(&msp->ms_lock); 2130 continue; 2131 } 2132 2133 /* 2134 * If this metaslab is currently condensing then pick again as 2135 * we can't manipulate this metaslab until it's committed 2136 * to disk. 2137 */ 2138 if (msp->ms_condensing) { 2139 mutex_exit(&msp->ms_lock); 2140 continue; 2141 } 2142 2143 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2144 break; 2145 2146 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2147 mutex_exit(&msp->ms_lock); 2148 } 2149 2150 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2151 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2152 2153 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2154 msp->ms_access_txg = txg + metaslab_unload_delay; 2155 2156 mutex_exit(&msp->ms_lock); 2157 2158 return (offset); 2159 } 2160 2161 /* 2162 * Allocate a block for the specified i/o. 2163 */ 2164 static int 2165 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2166 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2167 { 2168 metaslab_group_t *mg, *rotor; 2169 vdev_t *vd; 2170 int dshift = 3; 2171 int all_zero; 2172 int zio_lock = B_FALSE; 2173 boolean_t allocatable; 2174 uint64_t offset = -1ULL; 2175 uint64_t asize; 2176 uint64_t distance; 2177 2178 ASSERT(!DVA_IS_VALID(&dva[d])); 2179 2180 /* 2181 * For testing, make some blocks above a certain size be gang blocks. 2182 */ 2183 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2184 return (SET_ERROR(ENOSPC)); 2185 2186 /* 2187 * Start at the rotor and loop through all mgs until we find something. 2188 * Note that there's no locking on mc_rotor or mc_aliquot because 2189 * nothing actually breaks if we miss a few updates -- we just won't 2190 * allocate quite as evenly. It all balances out over time. 2191 * 2192 * If we are doing ditto or log blocks, try to spread them across 2193 * consecutive vdevs. If we're forced to reuse a vdev before we've 2194 * allocated all of our ditto blocks, then try and spread them out on 2195 * that vdev as much as possible. If it turns out to not be possible, 2196 * gradually lower our standards until anything becomes acceptable. 2197 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2198 * gives us hope of containing our fault domains to something we're 2199 * able to reason about. Otherwise, any two top-level vdev failures 2200 * will guarantee the loss of data. 
With consecutive allocation, 2201 * only two adjacent top-level vdev failures will result in data loss. 2202 * 2203 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2204 * ourselves on the same vdev as our gang block header. That 2205 * way, we can hope for locality in vdev_cache, plus it makes our 2206 * fault domains something tractable. 2207 */ 2208 if (hintdva) { 2209 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2210 2211 /* 2212 * It's possible the vdev we're using as the hint no 2213 * longer exists (i.e. removed). Consult the rotor when 2214 * all else fails. 2215 */ 2216 if (vd != NULL) { 2217 mg = vd->vdev_mg; 2218 2219 if (flags & METASLAB_HINTBP_AVOID && 2220 mg->mg_next != NULL) 2221 mg = mg->mg_next; 2222 } else { 2223 mg = mc->mc_rotor; 2224 } 2225 } else if (d != 0) { 2226 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2227 mg = vd->vdev_mg->mg_next; 2228 } else { 2229 mg = mc->mc_rotor; 2230 } 2231 2232 /* 2233 * If the hint put us into the wrong metaslab class, or into a 2234 * metaslab group that has been passivated, just follow the rotor. 2235 */ 2236 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2237 mg = mc->mc_rotor; 2238 2239 rotor = mg; 2240 top: 2241 all_zero = B_TRUE; 2242 do { 2243 ASSERT(mg->mg_activation_count == 1); 2244 2245 vd = mg->mg_vd; 2246 2247 /* 2248 * Don't allocate from faulted devices. 2249 */ 2250 if (zio_lock) { 2251 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2252 allocatable = vdev_allocatable(vd); 2253 spa_config_exit(spa, SCL_ZIO, FTAG); 2254 } else { 2255 allocatable = vdev_allocatable(vd); 2256 } 2257 2258 /* 2259 * Determine if the selected metaslab group is eligible 2260 * for allocations. If we're ganging or have requested 2261 * an allocation for the smallest gang block size 2262 * then we don't want to avoid allocating to this 2263 * metaslab group. If we're in this condition we should 2264 * try to allocate from any device possible so that we 2265 * don't inadvertently return ENOSPC and suspend the pool 2266 * even though space is still available. 2267 */ 2268 if (allocatable && CAN_FASTGANG(flags) && 2269 psize > SPA_GANGBLOCKSIZE) 2270 allocatable = metaslab_group_allocatable(mg); 2271 2272 if (!allocatable) 2273 goto next; 2274 2275 /* 2276 * Avoid writing single-copy data to a failing vdev 2277 * unless the user instructs us that it is okay. 2278 */ 2279 if ((vd->vdev_stat.vs_write_errors > 0 || 2280 vd->vdev_state < VDEV_STATE_HEALTHY) && 2281 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2282 all_zero = B_FALSE; 2283 goto next; 2284 } 2285 2286 ASSERT(mg->mg_class == mc); 2287 2288 distance = vd->vdev_asize >> dshift; 2289 if (distance <= (1ULL << vd->vdev_ms_shift)) 2290 distance = 0; 2291 else 2292 all_zero = B_FALSE; 2293 2294 asize = vdev_psize_to_asize(vd, psize); 2295 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2296 2297 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2298 dva, d); 2299 if (offset != -1ULL) { 2300 /* 2301 * If we've just selected this metaslab group, 2302 * figure out whether the corresponding vdev is 2303 * over- or under-used relative to the pool, 2304 * and set an allocation bias to even it out.
2305 */ 2306 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2307 vdev_stat_t *vs = &vd->vdev_stat; 2308 int64_t vu, cu; 2309 2310 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2311 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2312 2313 /* 2314 * Calculate how much more or less we should 2315 * try to allocate from this device during 2316 * this iteration around the rotor. 2317 * For example, if a device is 80% full 2318 * and the pool is 20% full then we should 2319 * reduce allocations by 60% on this device. 2320 * 2321 * mg_bias = (20 - 80) * 512K / 100 = -307K 2322 * 2323 * This reduces allocations by 307K for this 2324 * iteration. 2325 */ 2326 mg->mg_bias = ((cu - vu) * 2327 (int64_t)mg->mg_aliquot) / 100; 2328 } else if (!metaslab_bias_enabled) { 2329 mg->mg_bias = 0; 2330 } 2331 2332 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2333 mg->mg_aliquot + mg->mg_bias) { 2334 mc->mc_rotor = mg->mg_next; 2335 mc->mc_aliquot = 0; 2336 } 2337 2338 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2339 DVA_SET_OFFSET(&dva[d], offset); 2340 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2341 DVA_SET_ASIZE(&dva[d], asize); 2342 2343 return (0); 2344 } 2345 next: 2346 mc->mc_rotor = mg->mg_next; 2347 mc->mc_aliquot = 0; 2348 } while ((mg = mg->mg_next) != rotor); 2349 2350 if (!all_zero) { 2351 dshift++; 2352 ASSERT(dshift < 64); 2353 goto top; 2354 } 2355 2356 if (!allocatable && !zio_lock) { 2357 dshift = 3; 2358 zio_lock = B_TRUE; 2359 goto top; 2360 } 2361 2362 bzero(&dva[d], sizeof (dva_t)); 2363 2364 return (SET_ERROR(ENOSPC)); 2365 } 2366 2367 /* 2368 * Free the block represented by DVA in the context of the specified 2369 * transaction group. 2370 */ 2371 static void 2372 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2373 { 2374 uint64_t vdev = DVA_GET_VDEV(dva); 2375 uint64_t offset = DVA_GET_OFFSET(dva); 2376 uint64_t size = DVA_GET_ASIZE(dva); 2377 vdev_t *vd; 2378 metaslab_t *msp; 2379 2380 ASSERT(DVA_IS_VALID(dva)); 2381 2382 if (txg > spa_freeze_txg(spa)) 2383 return; 2384 2385 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2386 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2387 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2388 (u_longlong_t)vdev, (u_longlong_t)offset); 2389 ASSERT(0); 2390 return; 2391 } 2392 2393 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2394 2395 if (DVA_GET_GANG(dva)) 2396 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2397 2398 mutex_enter(&msp->ms_lock); 2399 2400 if (now) { 2401 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2402 offset, size); 2403 2404 VERIFY(!msp->ms_condensing); 2405 VERIFY3U(offset, >=, msp->ms_start); 2406 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2407 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2408 msp->ms_size); 2409 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2410 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2411 range_tree_add(msp->ms_tree, offset, size); 2412 } else { 2413 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2414 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2415 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2416 offset, size); 2417 } 2418 2419 mutex_exit(&msp->ms_lock); 2420 } 2421 2422 /* 2423 * Intent log support: upon opening the pool after a crash, notify the SPA 2424 * of blocks that the intent log has allocated for immediate write, but 2425 * which are still considered free by the SPA because the last transaction 2426 * group didn't commit yet. 
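 * For example (an editor-added, illustrative scenario): if the ZIL allocated a log block and wrote it shortly before a crash, that block's range is still present in the metaslab's free tree when the pool is reopened; claiming it below removes the range from ms_tree so the allocator cannot hand it out again before the log is replayed.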
2427 */ 2428 static int 2429 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2430 { 2431 uint64_t vdev = DVA_GET_VDEV(dva); 2432 uint64_t offset = DVA_GET_OFFSET(dva); 2433 uint64_t size = DVA_GET_ASIZE(dva); 2434 vdev_t *vd; 2435 metaslab_t *msp; 2436 int error = 0; 2437 2438 ASSERT(DVA_IS_VALID(dva)); 2439 2440 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2441 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2442 return (SET_ERROR(ENXIO)); 2443 2444 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2445 2446 if (DVA_GET_GANG(dva)) 2447 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2448 2449 mutex_enter(&msp->ms_lock); 2450 2451 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2452 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2453 2454 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2455 error = SET_ERROR(ENOENT); 2456 2457 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2458 mutex_exit(&msp->ms_lock); 2459 return (error); 2460 } 2461 2462 VERIFY(!msp->ms_condensing); 2463 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2464 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2465 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2466 range_tree_remove(msp->ms_tree, offset, size); 2467 2468 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2469 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2470 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2471 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2472 } 2473 2474 mutex_exit(&msp->ms_lock); 2475 2476 return (0); 2477 } 2478 2479 int 2480 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2481 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2482 { 2483 dva_t *dva = bp->blk_dva; 2484 dva_t *hintdva = hintbp->blk_dva; 2485 int error = 0; 2486 2487 ASSERT(bp->blk_birth == 0); 2488 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2489 2490 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2491 2492 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2493 spa_config_exit(spa, SCL_ALLOC, FTAG); 2494 return (SET_ERROR(ENOSPC)); 2495 } 2496 2497 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2498 ASSERT(BP_GET_NDVAS(bp) == 0); 2499 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2500 2501 for (int d = 0; d < ndvas; d++) { 2502 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2503 txg, flags); 2504 if (error != 0) { 2505 for (d--; d >= 0; d--) { 2506 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2507 bzero(&dva[d], sizeof (dva_t)); 2508 } 2509 spa_config_exit(spa, SCL_ALLOC, FTAG); 2510 return (error); 2511 } 2512 } 2513 ASSERT(error == 0); 2514 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2515 2516 spa_config_exit(spa, SCL_ALLOC, FTAG); 2517 2518 BP_SET_BIRTH(bp, txg, txg); 2519 2520 return (0); 2521 } 2522 2523 void 2524 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2525 { 2526 const dva_t *dva = bp->blk_dva; 2527 int ndvas = BP_GET_NDVAS(bp); 2528 2529 ASSERT(!BP_IS_HOLE(bp)); 2530 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2531 2532 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2533 2534 for (int d = 0; d < ndvas; d++) 2535 metaslab_free_dva(spa, &dva[d], txg, now); 2536 2537 spa_config_exit(spa, SCL_FREE, FTAG); 2538 } 2539 2540 int 2541 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2542 { 2543 const dva_t *dva = bp->blk_dva; 2544 int ndvas = BP_GET_NDVAS(bp); 2545 int error = 0; 2546 2547 ASSERT(!BP_IS_HOLE(bp)); 
2548 2549 if (txg != 0) { 2550 /* 2551 * First do a dry run to make sure all DVAs are claimable, 2552 * so we don't have to unwind from partial failures below. 2553 */ 2554 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2555 return (error); 2556 } 2557 2558 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2559 2560 for (int d = 0; d < ndvas; d++) 2561 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2562 break; 2563 2564 spa_config_exit(spa, SCL_ALLOC, FTAG); 2565 2566 ASSERT(error == 0 || txg == 0); 2567 2568 return (error); 2569 } 2570 2571 void 2572 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2573 { 2574 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2575 return; 2576 2577 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2578 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2579 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2580 vdev_t *vd = vdev_lookup_top(spa, vdev); 2581 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2582 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2583 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2584 2585 if (msp->ms_loaded) 2586 range_tree_verify(msp->ms_tree, offset, size); 2587 2588 for (int j = 0; j < TXG_SIZE; j++) 2589 range_tree_verify(msp->ms_freetree[j], offset, size); 2590 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2591 range_tree_verify(msp->ms_defertree[j], offset, size); 2592 } 2593 spa_config_exit(spa, SCL_VDEV, FTAG); 2594 } 2595
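/*
 * Editor-added illustration, not part of the original metaslab.c: a
 * standalone user-space sketch of the rotor-bias arithmetic described in
 * the comment inside metaslab_alloc_dva().  It only mirrors the mg_bias
 * computation; the variable names and the 80%/20%/512K figures below are
 * assumptions chosen to match the worked example in that comment.  Compile
 * it on its own, e.g. "cc -o mg_bias_demo mg_bias_demo.c && ./mg_bias_demo";
 * it should print roughly -307K, matching the in-line example.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
	int64_t vs_alloc_pct = 80;		/* hypothetical: vdev is 80% full */
	int64_t mc_alloc_pct = 20;		/* hypothetical: pool is 20% full */
	int64_t mg_aliquot = 512LL << 10;	/* 512K, the default aliquot */

	/*
	 * Same arithmetic as metaslab_alloc_dva():
	 *	mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / 100;
	 * A negative bias shrinks this group's share of the rotor for the
	 * current pass; a positive bias grows it.
	 */
	int64_t mg_bias = ((mc_alloc_pct - vs_alloc_pct) * mg_aliquot) / 100;

	printf("mg_bias = %" PRId64 " bytes (~%" PRId64 "K)\n",
	    mg_bias, mg_bias / 1024);
	return (0);
}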