/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size, it switches to using a more
 * aggressive strategy (i.e., search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 30;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);

	ASSERT((int64_t)mc->mc_alloc >= 0 &&
	    (int64_t)mc->mc_deferred >= 0 &&
	    (int64_t)mc->mc_space >= 0 &&
	    (int64_t)mc->mc_dspace >= 0);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ?
	    mc->mc_dspace : mc->mc_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate.  This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}
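
/*
 * For example, a request with size = 8K and align = 8K starts at *cursor,
 * rounds each candidate segment's ss_start up to the next 8K boundary, and
 * returns the first segment with at least 8K of room past that point,
 * leaving *cursor just beyond the allocation so the next search resumes
 * there.  Only after wrapping back to offset 0 and walking the entire tree
 * does the picker give up and return -1ULL.  The allocators below keep one
 * such cursor per power-of-two alignment (the 64-entry array in sm_ppd).
 */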

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static void
metaslab_ff_load(space_map_t *sm)
{
	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
	sm->sm_pp_root = NULL;
}

static void
metaslab_ff_unload(space_map_t *sm)
{
	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;
}

static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}

/* ARGSUSED */
static void
metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_ff_load,
	metaslab_ff_unload,
	metaslab_ff_alloc,
	metaslab_ff_claim,
	metaslab_ff_free,
	NULL	/* maxsize */
};

/*
 * Dynamic block allocator -
 * Uses the first-fit allocation scheme until space gets low and then
 * adjusts to a best-fit allocation method.  Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 */

uint64_t
metaslab_df_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}

static int
metaslab_df_seg_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

static void
metaslab_df_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_df_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}
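
/*
 * metaslab_df_alloc() below stays on the offset-sorted tree (first-fit) as
 * long as the metaslab looks healthy.  With the default tunables it falls
 * back to the size-sorted tree (effectively best-fit) once the largest
 * free segment drops below metaslab_df_alloc_threshold (SPA_MAXBLOCKSIZE,
 * 128K) or less than metaslab_df_free_pct (30%) of the space map is free.
 * For example, a 1GB metaslab with only 200MB free (about 19%) will serve
 * all requests from the size-sorted tree even if large segments remain.
 */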
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	uint64_t max_size = metaslab_df_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

/* ARGSUSED */
static void
metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_df_load,
	metaslab_df_unload,
	metaslab_df_alloc,
	metaslab_df_claim,
	metaslab_df_free,
	metaslab_df_maxsize
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	space_map_create(&msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	if (metaslab_debug && smo->smo_object != 0) {
		mutex_enter(&msp->ms_lock);
		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
		mutex_exit(&msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}
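
/*
 * A metaslab created above moves through a simple lifecycle: the allocator
 * activates it (loading its space map) via metaslab_activate(), allocations
 * and frees accumulate in the per-txg ms_allocmap[] and ms_freemap[] maps,
 * metaslab_sync() writes those deltas to the on-disk space map object, and
 * metaslab_sync_done() returns freed space to circulation and recomputes
 * the weight.  metaslab_fini() below tears all of this down.
 */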

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd,
	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(&msp->ms_map);
	space_map_destroy(&msp->ms_map);

	for (int t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(&msp->ms_allocmap[t]);
		space_map_destroy(&msp->ms_freemap[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		space_map_destroy(&msp->ms_defermap[t]);

	ASSERT3S(msp->ms_deferspace, ==, 0);

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
#define	METASLAB_SMO_BONUS_MULTIPLIER	2

static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1.  We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);

	/*
	 * For locality, assign higher weight to metaslabs we've used before.
	 */
	if (smo->smo_object != 0)
		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
	ASSERT(weight >= space &&
	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);

	/*
	 * If this metaslab is one we're actively using, adjust its weight to
	 * make it preferable to any inactive metaslab so we'll polish it off.
	 */
	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (weight);
}
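
/*
 * For example, a metaslab with 10GB free that sits halfway across its vdev
 * gets a weight of about 1.5 * 10GB from the 2x-to-1x bandwidth ramp above,
 * doubled again to about 30GB if it already has a space map object on disk.
 * The activation bits (bits 62 and 63) are then ORed in, so an active
 * metaslab always sorts ahead of every inactive one regardless of free space.
 */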

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
	space_map_t *sm = &msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		space_map_load_wait(sm);
		if (!sm->sm_loaded) {
			int error = space_map_load(sm, sm_ops, SM_FREE,
			    &msp->ms_smo,
			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				space_map_walk(&msp->ms_defermap[t],
				    space_map_claim, sm);
		}

		/*
		 * If we were able to load the map then make sure
		 * that this map is still able to satisfy our request.
		 */
		if (msp->ms_weight < size)
			return (ENOSPC);

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again.  In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
#if 0
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
#endif
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
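
/*
 * The allocator (metaslab_group_alloc() below) drives these two routines:
 * it activates a metaslab with the PRIMARY weight bit (or SECONDARY, when
 * another DVA of the same block already lives on this vdev) before
 * allocating from it, and after a failed allocation it passivates the
 * metaslab with a weight of size - 1 so that it sorts below any future
 * request of at least that size.
 */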

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus deferred frees (ms_defermap[]),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_walk(&msp->ms_defermap[t],
			    space_map_remove, allocmap);

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}
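
/*
 * Frees take a detour before becoming allocatable again: a block freed in
 * txg N sits in ms_freemap[N & TXG_MASK] while N syncs, is folded into the
 * freed map by metaslab_sync(), is moved into ms_defermap[N % TXG_DEFER_SIZE]
 * by metaslab_sync_done(), and only re-enters the in-core space map when
 * that defer slot comes around again, TXG_DEFER_SIZE txgs later --
 * presumably so that recently freed blocks are not reused while the last
 * few synced txgs may still be needed.
 */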

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (int t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_create(&msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = freed_map->sm_space - defer_map->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add defer_map (oldest deferred frees) to this map and
	 * transfer freed_map (this txg's frees) to defer_map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);

	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map.sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
    uint64_t min_distance, dva_t *dva, int d)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < size) {
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}

			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request.  It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < size || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight, size) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
			break;

		metaslab_passivate(msp, size - 1);

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);

	return (offset);
}
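
/*
 * The min_distance passed to metaslab_group_alloc() comes from
 * metaslab_alloc_dva() below: it starts at 1/8 of the vdev's size
 * (dshift = 3) and is halved on each retry pass, so ditto copies placed on
 * the same vdev start out widely separated and the requirement is only
 * relaxed when no metaslab can honor it.  Metaslabs with nothing allocated
 * yet must be 50% farther away than the target before they are chosen.
 */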

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed).  Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL && vd->vdev_mg != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong class, just follow the rotor.
	 */
	if (mg->mg_class != mc)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}
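
/*
 * A worked example of the bias above: if the chosen vdev is 50% full
 * (vu = 512) while the pool as a whole is 25% full (cu = 256), then
 * mg_bias = (256 - 512) * mg_aliquot / 4096 = -mg_aliquot / 16, so the
 * over-full vdev's turn at the rotor is cut by about 6%.  The largest
 * possible adjustment is +/- 25% of the aliquot, as noted above.
 */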

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error = 0;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);

	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
		error = ENOENT;

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(&msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}