/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 */
#define CAN_FASTGANG(flags) \
    (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
    METASLAB_GANG_AVOID)))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e., search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
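 * (Applied as a percentage in metaslab_weight().)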
 */
int metaslab_smo_bonus_pct = 150;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
    metaslab_class_t *mc;

    mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

    mc->mc_spa = spa;
    mc->mc_rotor = NULL;
    mc->mc_ops = ops;

    return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
    ASSERT(mc->mc_rotor == NULL);
    ASSERT(mc->mc_alloc == 0);
    ASSERT(mc->mc_deferred == 0);
    ASSERT(mc->mc_space == 0);
    ASSERT(mc->mc_dspace == 0);

    kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
    metaslab_group_t *mg;
    vdev_t *vd;

    /*
     * Must hold one of the spa_config locks.
     */
    ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
        spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

    if ((mg = mc->mc_rotor) == NULL)
        return (0);

    do {
        vd = mg->mg_vd;
        ASSERT(vd->vdev_mg != NULL);
        ASSERT3P(vd->vdev_top, ==, vd);
        ASSERT3P(mg->mg_class, ==, mc);
        ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
    } while ((mg = mg->mg_next) != mc->mc_rotor);

    return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
    atomic_add_64(&mc->mc_alloc, alloc_delta);
    atomic_add_64(&mc->mc_deferred, defer_delta);
    atomic_add_64(&mc->mc_space, space_delta);
    atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
    return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
    return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
    return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
    return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
    const metaslab_t *m1 = x1;
    const metaslab_t *m2 = x2;

    if (m1->ms_weight < m2->ms_weight)
        return (1);
    if (m1->ms_weight > m2->ms_weight)
        return (-1);

    /*
     * If the weights are identical, use the offset to force uniqueness.
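     * (avl_add() requires that no two nodes compare as equal.)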
     */
    if (m1->ms_map.sm_start < m2->ms_map.sm_start)
        return (-1);
    if (m1->ms_map.sm_start > m2->ms_map.sm_start)
        return (1);

    ASSERT3P(m1, ==, m2);

    return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
    metaslab_group_t *mg;

    mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
    mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
    avl_create(&mg->mg_metaslab_tree, metaslab_compare,
        sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
    mg->mg_vd = vd;
    mg->mg_class = mc;
    mg->mg_activation_count = 0;

    return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
    ASSERT(mg->mg_prev == NULL);
    ASSERT(mg->mg_next == NULL);
    /*
     * We may have gone below zero with the activation count
     * either because we never activated in the first place or
     * because we're done, and possibly removing the vdev.
     */
    ASSERT(mg->mg_activation_count <= 0);

    avl_destroy(&mg->mg_metaslab_tree);
    mutex_destroy(&mg->mg_lock);
    kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
    metaslab_class_t *mc = mg->mg_class;
    metaslab_group_t *mgprev, *mgnext;

    ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

    ASSERT(mc->mc_rotor != mg);
    ASSERT(mg->mg_prev == NULL);
    ASSERT(mg->mg_next == NULL);
    ASSERT(mg->mg_activation_count <= 0);

    if (++mg->mg_activation_count <= 0)
        return;

    mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);

    if ((mgprev = mc->mc_rotor) == NULL) {
        mg->mg_prev = mg;
        mg->mg_next = mg;
    } else {
        mgnext = mgprev->mg_next;
        mg->mg_prev = mgprev;
        mg->mg_next = mgnext;
        mgprev->mg_next = mg;
        mgnext->mg_prev = mg;
    }
    mc->mc_rotor = mg;
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
    metaslab_class_t *mc = mg->mg_class;
    metaslab_group_t *mgprev, *mgnext;

    ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

    if (--mg->mg_activation_count != 0) {
        ASSERT(mc->mc_rotor != mg);
        ASSERT(mg->mg_prev == NULL);
        ASSERT(mg->mg_next == NULL);
        ASSERT(mg->mg_activation_count < 0);
        return;
    }

    mgprev = mg->mg_prev;
    mgnext = mg->mg_next;

    if (mg == mgnext) {
        mc->mc_rotor = NULL;
    } else {
        mc->mc_rotor = mgnext;
        mgprev->mg_next = mgnext;
        mgnext->mg_prev = mgprev;
    }

    mg->mg_prev = NULL;
    mg->mg_next = NULL;
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
    mutex_enter(&mg->mg_lock);
    ASSERT(msp->ms_group == NULL);
    msp->ms_group = mg;
    msp->ms_weight = 0;
    avl_add(&mg->mg_metaslab_tree, msp);
    mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
    mutex_enter(&mg->mg_lock);
    ASSERT(msp->ms_group == mg);
    avl_remove(&mg->mg_metaslab_tree, msp);
    msp->ms_group = NULL;
    mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
    /*
     * Although in principle the weight can be any value, in
     * practice we do not use values in the range [1, 510].
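     * (SPA_MINBLOCKSIZE is 512, hence the assertion below.)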
     */
    ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
    ASSERT(MUTEX_HELD(&msp->ms_lock));

    mutex_enter(&mg->mg_lock);
    ASSERT(msp->ms_group == mg);
    avl_remove(&mg->mg_metaslab_tree, msp);
    msp->ms_weight = weight;
    avl_add(&mg->mg_metaslab_tree, msp);
    mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */
static int
metaslab_segsize_compare(const void *x1, const void *x2)
{
    const space_seg_t *s1 = x1;
    const space_seg_t *s2 = x2;
    uint64_t ss_size1 = s1->ss_end - s1->ss_start;
    uint64_t ss_size2 = s2->ss_end - s2->ss_start;

    if (ss_size1 < ss_size2)
        return (-1);
    if (ss_size1 > ss_size2)
        return (1);

    if (s1->ss_start < s2->ss_start)
        return (-1);
    if (s1->ss_start > s2->ss_start)
        return (1);

    return (0);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
    space_seg_t *ss, ssearch;
    avl_index_t where;

    ssearch.ss_start = *cursor;
    ssearch.ss_end = *cursor + size;

    ss = avl_find(t, &ssearch, &where);
    if (ss == NULL)
        ss = avl_nearest(t, where, AVL_AFTER);

    while (ss != NULL) {
        uint64_t offset = P2ROUNDUP(ss->ss_start, align);

        if (offset + size <= ss->ss_end) {
            *cursor = offset + size;
            return (offset);
        }
        ss = AVL_NEXT(t, ss);
    }

    /*
     * If we know we've searched the whole map (*cursor == 0), give up.
     * Otherwise, reset the cursor to the beginning and try again.
     */
    if (*cursor == 0)
        return (-1ULL);

    *cursor = 0;
    return (metaslab_block_picker(t, cursor, size, align));
}

static void
metaslab_pp_load(space_map_t *sm)
{
    space_seg_t *ss;

    ASSERT(sm->sm_ppd == NULL);
    sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

    sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
    avl_create(sm->sm_pp_root, metaslab_segsize_compare,
        sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

    for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
        avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_pp_unload(space_map_t *sm)
{
    void *cookie = NULL;

    kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
    sm->sm_ppd = NULL;

    while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
        /* tear down the tree */
    }

    avl_destroy(sm->sm_pp_root);
    kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
    sm->sm_pp_root = NULL;
}

/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
    /* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
    /* No need to update cursor */
}

/*
 * Return the maximum contiguous segment within the metaslab.
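 * Returns 0 if the map is unloaded or has no free segments.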
 */
uint64_t
metaslab_pp_maxsize(space_map_t *sm)
{
    avl_tree_t *t = sm->sm_pp_root;
    space_seg_t *ss;

    if (t == NULL || (ss = avl_last(t)) == NULL)
        return (0ULL);

    return (ss->ss_end - ss->ss_start);
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
    avl_tree_t *t = &sm->sm_root;
    uint64_t align = size & -size;
    uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

    return (metaslab_block_picker(t, cursor, size, align));
}

/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
    return (B_TRUE);
}

static space_map_ops_t metaslab_ff_ops = {
    metaslab_pp_load,
    metaslab_pp_unload,
    metaslab_ff_alloc,
    metaslab_pp_claim,
    metaslab_pp_free,
    metaslab_pp_maxsize,
    metaslab_ff_fragmented
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first-fit allocation scheme until space gets low and then
 * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
    avl_tree_t *t = &sm->sm_root;
    uint64_t align = size & -size;
    uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
    uint64_t max_size = metaslab_pp_maxsize(sm);
    int free_pct = sm->sm_space * 100 / sm->sm_size;

    ASSERT(MUTEX_HELD(sm->sm_lock));
    ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

    if (max_size < size)
        return (-1ULL);

    /*
     * If we're running low on space switch to using the size
     * sorted AVL tree (best-fit).
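     * (The cursor is reset to zero before searching the size-sorted tree.)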
     */
    if (max_size < metaslab_df_alloc_threshold ||
        free_pct < metaslab_df_free_pct) {
        t = sm->sm_pp_root;
        *cursor = 0;
    }

    return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static boolean_t
metaslab_df_fragmented(space_map_t *sm)
{
    uint64_t max_size = metaslab_pp_maxsize(sm);
    int free_pct = sm->sm_space * 100 / sm->sm_size;

    if (max_size >= metaslab_df_alloc_threshold &&
        free_pct >= metaslab_df_free_pct)
        return (B_FALSE);

    return (B_TRUE);
}

static space_map_ops_t metaslab_df_ops = {
    metaslab_pp_load,
    metaslab_pp_unload,
    metaslab_df_alloc,
    metaslab_pp_claim,
    metaslab_pp_free,
    metaslab_pp_maxsize,
    metaslab_df_fragmented
};

/*
 * ==========================================================================
 * Other experimental allocators
 * ==========================================================================
 */
static uint64_t
metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
{
    avl_tree_t *t = &sm->sm_root;
    uint64_t *cursor = (uint64_t *)sm->sm_ppd;
    uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
    uint64_t max_size = metaslab_pp_maxsize(sm);
    uint64_t rsize = size;
    uint64_t offset = 0;

    ASSERT(MUTEX_HELD(sm->sm_lock));
    ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

    if (max_size < size)
        return (-1ULL);

    ASSERT3U(*extent_end, >=, *cursor);

    /*
     * If we're running low on space switch to using the size
     * sorted AVL tree (best-fit).
     */
    if ((*cursor + size) > *extent_end) {

        t = sm->sm_pp_root;
        *cursor = *extent_end = 0;

        if (max_size > 2 * SPA_MAXBLOCKSIZE)
            rsize = MIN(metaslab_min_alloc_size, max_size);
        offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
        if (offset != -1)
            *cursor = offset + size;
    } else {
        offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
    }
    ASSERT3U(*cursor, <=, *extent_end);
    return (offset);
}

static boolean_t
metaslab_cdf_fragmented(space_map_t *sm)
{
    uint64_t max_size = metaslab_pp_maxsize(sm);

    if (max_size > (metaslab_min_alloc_size * 10))
        return (B_FALSE);
    return (B_TRUE);
}

static space_map_ops_t metaslab_cdf_ops = {
    metaslab_pp_load,
    metaslab_pp_unload,
    metaslab_cdf_alloc,
    metaslab_pp_claim,
    metaslab_pp_free,
    metaslab_pp_maxsize,
    metaslab_cdf_fragmented
};

uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
{
    avl_tree_t *t = &sm->sm_root;
    avl_index_t where;
    space_seg_t *ss, ssearch;
    uint64_t hbit = highbit(size);
    uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
    uint64_t max_size = metaslab_pp_maxsize(sm);

    ASSERT(MUTEX_HELD(sm->sm_lock));
    ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

    if (max_size < size)
        return (-1ULL);

    ssearch.ss_start = *cursor;
    ssearch.ss_end = *cursor + size;

    ss = avl_find(t, &ssearch, &where);
    if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
        t = sm->sm_pp_root;

        ssearch.ss_start = 0;
        ssearch.ss_end = MIN(max_size,
            1ULL << (hbit + metaslab_ndf_clump_shift));
        ss = avl_find(t, &ssearch, &where);
        if (ss == NULL)
            ss = avl_nearest(t, where, AVL_AFTER);
        ASSERT(ss != NULL);
    }

    if (ss != NULL) {
        if (ss->ss_start + size <= ss->ss_end) {
            *cursor = ss->ss_start + size;
            return (ss->ss_start);
        }
    }
    return (-1ULL);
}

static boolean_t
metaslab_ndf_fragmented(space_map_t *sm)
{
    uint64_t max_size = metaslab_pp_maxsize(sm);

    if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
        return (B_FALSE);
    return (B_TRUE);
}

static space_map_ops_t metaslab_ndf_ops = {
    metaslab_pp_load,
    metaslab_pp_unload,
    metaslab_ndf_alloc,
    metaslab_pp_claim,
    metaslab_pp_free,
    metaslab_pp_maxsize,
    metaslab_ndf_fragmented
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
    vdev_t *vd = mg->mg_vd;
    metaslab_t *msp;

    msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
    mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

    msp->ms_smo_syncing = *smo;

    /*
     * We create the main space map here, but we don't create the
     * allocmaps and freemaps until metaslab_sync_done().  This serves
     * two purposes: it allows metaslab_sync_done() to detect the
     * addition of new space; and for debugging, it ensures that we'd
     * data fault on any attempt to use this metaslab before it's ready.
     */
    space_map_create(&msp->ms_map, start, size,
        vd->vdev_ashift, &msp->ms_lock);

    metaslab_group_add(mg, msp);

    if (metaslab_debug && smo->smo_object != 0) {
        mutex_enter(&msp->ms_lock);
        VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
            SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
        mutex_exit(&msp->ms_lock);
    }

    /*
     * If we're opening an existing pool (txg == 0) or creating
     * a new one (txg == TXG_INITIAL), all space is available now.
     * If we're adding space to an existing pool, the new space
     * does not become available until after this txg has synced.
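     * (Hence the vdev_dirty() calls below when txg != 0.)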
     */
    if (txg <= TXG_INITIAL)
        metaslab_sync_done(msp, 0);

    if (txg != 0) {
        vdev_dirty(vd, 0, NULL, txg);
        vdev_dirty(vd, VDD_METASLAB, msp, txg);
    }

    return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
    metaslab_group_t *mg = msp->ms_group;

    vdev_space_update(mg->mg_vd,
        -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);

    metaslab_group_remove(mg, msp);

    mutex_enter(&msp->ms_lock);

    space_map_unload(&msp->ms_map);
    space_map_destroy(&msp->ms_map);

    for (int t = 0; t < TXG_SIZE; t++) {
        space_map_destroy(&msp->ms_allocmap[t]);
        space_map_destroy(&msp->ms_freemap[t]);
    }

    for (int t = 0; t < TXG_DEFER_SIZE; t++)
        space_map_destroy(&msp->ms_defermap[t]);

    ASSERT0(msp->ms_deferspace);

    mutex_exit(&msp->ms_lock);
    mutex_destroy(&msp->ms_lock);

    kmem_free(msp, sizeof (metaslab_t));
}

#define METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define METASLAB_ACTIVE_MASK \
    (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

static uint64_t
metaslab_weight(metaslab_t *msp)
{
    metaslab_group_t *mg = msp->ms_group;
    space_map_t *sm = &msp->ms_map;
    space_map_obj_t *smo = &msp->ms_smo;
    vdev_t *vd = mg->mg_vd;
    uint64_t weight, space;

    ASSERT(MUTEX_HELD(&msp->ms_lock));

    /*
     * The baseline weight is the metaslab's free space.
     */
    space = sm->sm_size - smo->smo_alloc;
    weight = space;

    /*
     * Modern disks have uniform bit density and constant angular velocity.
     * Therefore, the outer recording zones are faster (higher bandwidth)
     * than the inner zones by the ratio of outer to inner track diameter,
     * which is typically around 2:1.  We account for this by assigning
     * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
     * In effect, this means that we'll select the metaslab with the most
     * free bandwidth rather than simply the one with the most free space.
     */
    weight = 2 * weight -
        ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
    ASSERT(weight >= space && weight <= 2 * space);

    /*
     * For locality, assign higher weight to metaslabs which have
     * a lower offset than what we've already activated.
     */
    if (sm->sm_start <= mg->mg_bonus_area)
        weight *= (metaslab_smo_bonus_pct / 100);
    ASSERT(weight >= space &&
        weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);

    if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
        /*
         * If this metaslab is one we're actively using, adjust its
         * weight to make it preferable to any inactive metaslab so
         * we'll polish it off.
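         * (The activation bits from the current weight are preserved below.)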
         */
        weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
    }
    return (weight);
}

static void
metaslab_prefetch(metaslab_group_t *mg)
{
    spa_t *spa = mg->mg_vd->vdev_spa;
    metaslab_t *msp;
    avl_tree_t *t = &mg->mg_metaslab_tree;
    int m;

    mutex_enter(&mg->mg_lock);

    /*
     * Prefetch the next potential metaslabs
     */
    for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
        space_map_t *sm = &msp->ms_map;
        space_map_obj_t *smo = &msp->ms_smo;

        /* If we have reached our prefetch limit then we're done */
        if (m >= metaslab_prefetch_limit)
            break;

        if (!sm->sm_loaded && smo->smo_object != 0) {
            mutex_exit(&mg->mg_lock);
            dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
                0ULL, smo->smo_objsize);
            mutex_enter(&mg->mg_lock);
        }
    }
    mutex_exit(&mg->mg_lock);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
    metaslab_group_t *mg = msp->ms_group;
    space_map_t *sm = &msp->ms_map;
    space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

    ASSERT(MUTEX_HELD(&msp->ms_lock));

    if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
        space_map_load_wait(sm);
        if (!sm->sm_loaded) {
            space_map_obj_t *smo = &msp->ms_smo;

            int error = space_map_load(sm, sm_ops, SM_FREE, smo,
                spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
            if (error) {
                metaslab_group_sort(msp->ms_group, msp, 0);
                return (error);
            }
            for (int t = 0; t < TXG_DEFER_SIZE; t++)
                space_map_walk(&msp->ms_defermap[t],
                    space_map_claim, sm);

        }

        /*
         * Track the bonus area as we activate new metaslabs.
         */
        if (sm->sm_start > mg->mg_bonus_area) {
            mutex_enter(&mg->mg_lock);
            mg->mg_bonus_area = sm->sm_start;
            mutex_exit(&mg->mg_lock);
        }

        metaslab_group_sort(msp->ms_group, msp,
            msp->ms_weight | activation_weight);
    }
    ASSERT(sm->sm_loaded);
    ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

    return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
    /*
     * If size < SPA_MINBLOCKSIZE, then we will not allocate from
     * this metaslab again.  In that case, it had better be empty,
     * or we would be leaving space on the table.
     */
    ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
    metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
    ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
    vdev_t *vd = msp->ms_group->mg_vd;
    spa_t *spa = vd->vdev_spa;
    objset_t *mos = spa_meta_objset(spa);
    space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
    space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
    space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
    space_map_t *sm = &msp->ms_map;
    space_map_obj_t *smo = &msp->ms_smo_syncing;
    dmu_buf_t *db;
    dmu_tx_t *tx;

    ASSERT(!vd->vdev_ishole);

    if (allocmap->sm_space == 0 && freemap->sm_space == 0)
        return;

    /*
     * The only state that can actually be changing concurrently with
     * metaslab_sync() is the metaslab's ms_map.  No other thread can
     * be modifying this txg's allocmap, freemap, freed_map, or smo.
     * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
     * We drop it whenever we call into the DMU, because the DMU
     * can call down to us (e.g. via zio_free()) at any time.
     */

    tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

    if (smo->smo_object == 0) {
        ASSERT(smo->smo_objsize == 0);
        ASSERT(smo->smo_alloc == 0);
        smo->smo_object = dmu_object_alloc(mos,
            DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
            DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
        ASSERT(smo->smo_object != 0);
        dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
            (sm->sm_start >> vd->vdev_ms_shift),
            sizeof (uint64_t), &smo->smo_object, tx);
    }

    mutex_enter(&msp->ms_lock);

    space_map_walk(freemap, space_map_add, freed_map);

    if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
        2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
        /*
         * The in-core space map representation is twice as compact
         * as the on-disk one, so it's time to condense the latter
         * by generating a pure allocmap from first principles.
         *
         * This metaslab is 100% allocated,
         * minus the content of the in-core map (sm),
         * minus what's been freed this txg (freed_map),
         * minus deferred frees (ms_defermap[]),
         * minus allocations from txgs in the future
         * (because they haven't been committed yet).
         */
        space_map_vacate(allocmap, NULL, NULL);
        space_map_vacate(freemap, NULL, NULL);

        space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

        space_map_walk(sm, space_map_remove, allocmap);
        space_map_walk(freed_map, space_map_remove, allocmap);

        for (int t = 0; t < TXG_DEFER_SIZE; t++)
            space_map_walk(&msp->ms_defermap[t],
                space_map_remove, allocmap);

        for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
            space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
                space_map_remove, allocmap);

        mutex_exit(&msp->ms_lock);
        space_map_truncate(smo, mos, tx);
        mutex_enter(&msp->ms_lock);
    }

    space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
    space_map_sync(freemap, SM_FREE, smo, mos, tx);

    mutex_exit(&msp->ms_lock);

    VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
    dmu_buf_will_dirty(db, tx);
    ASSERT3U(db->db_size, >=, sizeof (*smo));
    bcopy(smo, db->db_data, sizeof (*smo));
    dmu_buf_rele(db, FTAG);

    dmu_tx_commit(tx);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
    space_map_obj_t *smo = &msp->ms_smo;
    space_map_obj_t *smosync = &msp->ms_smo_syncing;
    space_map_t *sm = &msp->ms_map;
    space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
    space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
    metaslab_group_t *mg = msp->ms_group;
    vdev_t *vd = mg->mg_vd;
    int64_t alloc_delta, defer_delta;

    ASSERT(!vd->vdev_ishole);

    mutex_enter(&msp->ms_lock);

    /*
     * If this metaslab is just becoming available, initialize its
     * allocmaps and freemaps and add its capacity to the vdev.
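     * (freed_map->sm_size == 0 means the maps have not been created yet.)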
     */
    if (freed_map->sm_size == 0) {
        for (int t = 0; t < TXG_SIZE; t++) {
            space_map_create(&msp->ms_allocmap[t], sm->sm_start,
                sm->sm_size, sm->sm_shift, sm->sm_lock);
            space_map_create(&msp->ms_freemap[t], sm->sm_start,
                sm->sm_size, sm->sm_shift, sm->sm_lock);
        }

        for (int t = 0; t < TXG_DEFER_SIZE; t++)
            space_map_create(&msp->ms_defermap[t], sm->sm_start,
                sm->sm_size, sm->sm_shift, sm->sm_lock);

        vdev_space_update(vd, 0, 0, sm->sm_size);
    }

    alloc_delta = smosync->smo_alloc - smo->smo_alloc;
    defer_delta = freed_map->sm_space - defer_map->sm_space;

    vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

    ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
    ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

    /*
     * If there's a space_map_load() in progress, wait for it to complete
     * so that we have a consistent view of the in-core space map.
     * Then, add defer_map (oldest deferred frees) to this map and
     * transfer freed_map (this txg's frees) to defer_map.
     */
    space_map_load_wait(sm);
    space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
    space_map_vacate(freed_map, space_map_add, defer_map);

    *smo = *smosync;

    msp->ms_deferspace += defer_delta;
    ASSERT3S(msp->ms_deferspace, >=, 0);
    ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
    if (msp->ms_deferspace != 0) {
        /*
         * Keep syncing this metaslab until all deferred frees
         * are back in circulation.
         */
        vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
    }

    /*
     * If the map is loaded but no longer active, evict it as soon as all
     * future allocations have synced.  (If we unloaded it now and then
     * loaded a moment later, the map wouldn't reflect those allocations.)
     */
    if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
        int evictable = 1;

        for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
            if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
                evictable = 0;

        if (evictable && !metaslab_debug)
            space_map_unload(sm);
    }

    metaslab_group_sort(mg, msp, metaslab_weight(msp));

    mutex_exit(&msp->ms_lock);
}

void
metaslab_sync_reassess(metaslab_group_t *mg)
{
    vdev_t *vd = mg->mg_vd;
    int64_t failures = mg->mg_alloc_failures;

    /*
     * Re-evaluate all metaslabs which have lower offsets than the
     * bonus area.
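     * (The bonus area may have moved during this txg, changing their weights.)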
     */
    for (int m = 0; m < vd->vdev_ms_count; m++) {
        metaslab_t *msp = vd->vdev_ms[m];

        if (msp->ms_map.sm_start > mg->mg_bonus_area)
            break;

        mutex_enter(&msp->ms_lock);
        metaslab_group_sort(mg, msp, metaslab_weight(msp));
        mutex_exit(&msp->ms_lock);
    }

    atomic_add_64(&mg->mg_alloc_failures, -failures);

    /*
     * Prefetch the next potential metaslabs
     */
    metaslab_prefetch(mg);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
    uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
    uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
    uint64_t start = msp->ms_map.sm_start >> ms_shift;

    if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
        return (1ULL << 63);

    if (offset < start)
        return ((start - offset) << ms_shift);
    if (offset > start)
        return ((offset - start) << ms_shift);
    return (0);
}

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
    spa_t *spa = mg->mg_vd->vdev_spa;
    metaslab_t *msp = NULL;
    uint64_t offset = -1ULL;
    avl_tree_t *t = &mg->mg_metaslab_tree;
    uint64_t activation_weight;
    uint64_t target_distance;
    int i;

    activation_weight = METASLAB_WEIGHT_PRIMARY;
    for (i = 0; i < d; i++) {
        if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
            activation_weight = METASLAB_WEIGHT_SECONDARY;
            break;
        }
    }

    for (;;) {
        boolean_t was_active;

        mutex_enter(&mg->mg_lock);
        for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
            if (msp->ms_weight < asize) {
                spa_dbgmsg(spa, "%s: failed to meet weight "
                    "requirement: vdev %llu, txg %llu, mg %p, "
                    "msp %p, psize %llu, asize %llu, "
                    "failures %llu, weight %llu",
                    spa_name(spa), mg->mg_vd->vdev_id, txg,
                    mg, msp, psize, asize,
                    mg->mg_alloc_failures, msp->ms_weight);
                mutex_exit(&mg->mg_lock);
                return (-1ULL);
            }
            was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
            if (activation_weight == METASLAB_WEIGHT_PRIMARY)
                break;

            target_distance = min_distance +
                (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

            for (i = 0; i < d; i++)
                if (metaslab_distance(msp, &dva[i]) <
                    target_distance)
                    break;
            if (i == d)
                break;
        }
        mutex_exit(&mg->mg_lock);
        if (msp == NULL)
            return (-1ULL);

        /*
         * If we've already reached the allowable number of failed
         * allocation attempts on this metaslab group then we
         * consider skipping it.  We skip it only if we're allowed
         * to "fast" gang, the physical size is larger than
         * a gang block, and we're attempting to allocate from
         * the primary metaslab.
         */
        if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
            CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
            activation_weight == METASLAB_WEIGHT_PRIMARY) {
            spa_dbgmsg(spa, "%s: skipping metaslab group: "
                "vdev %llu, txg %llu, mg %p, psize %llu, "
                "asize %llu, failures %llu", spa_name(spa),
                mg->mg_vd->vdev_id, txg, mg, psize, asize,
                mg->mg_alloc_failures);
            return (-1ULL);
        }

        mutex_enter(&msp->ms_lock);

        /*
         * Ensure that the metaslab we have selected is still
         * capable of handling our request.  It's possible that
It's possible that 1236 * another thread may have changed the weight while we 1237 * were blocked on the metaslab lock. 1238 */ 1239 if (msp->ms_weight < asize || (was_active && 1240 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1241 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1242 mutex_exit(&msp->ms_lock); 1243 continue; 1244 } 1245 1246 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1247 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1248 metaslab_passivate(msp, 1249 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1250 mutex_exit(&msp->ms_lock); 1251 continue; 1252 } 1253 1254 if (metaslab_activate(msp, activation_weight) != 0) { 1255 mutex_exit(&msp->ms_lock); 1256 continue; 1257 } 1258 1259 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) 1260 break; 1261 1262 atomic_inc_64(&mg->mg_alloc_failures); 1263 1264 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); 1265 1266 mutex_exit(&msp->ms_lock); 1267 } 1268 1269 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1270 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1271 1272 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); 1273 1274 mutex_exit(&msp->ms_lock); 1275 1276 return (offset); 1277 } 1278 1279 /* 1280 * Allocate a block for the specified i/o. 1281 */ 1282 static int 1283 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 1284 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 1285 { 1286 metaslab_group_t *mg, *rotor; 1287 vdev_t *vd; 1288 int dshift = 3; 1289 int all_zero; 1290 int zio_lock = B_FALSE; 1291 boolean_t allocatable; 1292 uint64_t offset = -1ULL; 1293 uint64_t asize; 1294 uint64_t distance; 1295 1296 ASSERT(!DVA_IS_VALID(&dva[d])); 1297 1298 /* 1299 * For testing, make some blocks above a certain size be gang blocks. 1300 */ 1301 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 1302 return (ENOSPC); 1303 1304 /* 1305 * Start at the rotor and loop through all mgs until we find something. 1306 * Note that there's no locking on mc_rotor or mc_aliquot because 1307 * nothing actually breaks if we miss a few updates -- we just won't 1308 * allocate quite as evenly. It all balances out over time. 1309 * 1310 * If we are doing ditto or log blocks, try to spread them across 1311 * consecutive vdevs. If we're forced to reuse a vdev before we've 1312 * allocated all of our ditto blocks, then try and spread them out on 1313 * that vdev as much as possible. If it turns out to not be possible, 1314 * gradually lower our standards until anything becomes acceptable. 1315 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 1316 * gives us hope of containing our fault domains to something we're 1317 * able to reason about. Otherwise, any two top-level vdev failures 1318 * will guarantee the loss of data. With consecutive allocation, 1319 * only two adjacent top-level vdev failures will result in data loss. 1320 * 1321 * If we are doing gang blocks (hintdva is non-NULL), try to keep 1322 * ourselves on the same vdev as our gang block header. That 1323 * way, we can hope for locality in vdev_cache, plus it makes our 1324 * fault domains something tractable. 1325 */ 1326 if (hintdva) { 1327 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 1328 1329 /* 1330 * It's possible the vdev we're using as the hint no 1331 * longer exists (i.e. removed). Consult the rotor when 1332 * all else fails. 
         */
        if (vd != NULL) {
            mg = vd->vdev_mg;

            if (flags & METASLAB_HINTBP_AVOID &&
                mg->mg_next != NULL)
                mg = mg->mg_next;
        } else {
            mg = mc->mc_rotor;
        }
    } else if (d != 0) {
        vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
        mg = vd->vdev_mg->mg_next;
    } else {
        mg = mc->mc_rotor;
    }

    /*
     * If the hint put us into the wrong metaslab class, or into a
     * metaslab group that has been passivated, just follow the rotor.
     */
    if (mg->mg_class != mc || mg->mg_activation_count <= 0)
        mg = mc->mc_rotor;

    rotor = mg;
top:
    all_zero = B_TRUE;
    do {
        ASSERT(mg->mg_activation_count == 1);

        vd = mg->mg_vd;

        /*
         * Don't allocate from faulted devices.
         */
        if (zio_lock) {
            spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
            allocatable = vdev_allocatable(vd);
            spa_config_exit(spa, SCL_ZIO, FTAG);
        } else {
            allocatable = vdev_allocatable(vd);
        }
        if (!allocatable)
            goto next;

        /*
         * Avoid writing single-copy data to a failing vdev
         */
        if ((vd->vdev_stat.vs_write_errors > 0 ||
            vd->vdev_state < VDEV_STATE_HEALTHY) &&
            d == 0 && dshift == 3) {
            all_zero = B_FALSE;
            goto next;
        }

        ASSERT(mg->mg_class == mc);

        distance = vd->vdev_asize >> dshift;
        if (distance <= (1ULL << vd->vdev_ms_shift))
            distance = 0;
        else
            all_zero = B_FALSE;

        asize = vdev_psize_to_asize(vd, psize);
        ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

        offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
            dva, d, flags);
        if (offset != -1ULL) {
            /*
             * If we've just selected this metaslab group,
             * figure out whether the corresponding vdev is
             * over- or under-used relative to the pool,
             * and set an allocation bias to even it out.
             */
            if (mc->mc_aliquot == 0) {
                vdev_stat_t *vs = &vd->vdev_stat;
                int64_t vu, cu;

                vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
                cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

                /*
                 * Calculate how much more or less we should
                 * try to allocate from this device during
                 * this iteration around the rotor.
                 * For example, if a device is 80% full
                 * and the pool is 20% full then we should
                 * reduce allocations by 60% on this device.
                 *
                 * mg_bias = (20 - 80) * 512K / 100 = -307K
                 *
                 * This reduces allocations by 307K for this
                 * iteration.
                 */
                mg->mg_bias = ((cu - vu) *
                    (int64_t)mg->mg_aliquot) / 100;
            }

            if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
                mg->mg_aliquot + mg->mg_bias) {
                mc->mc_rotor = mg->mg_next;
                mc->mc_aliquot = 0;
            }

            DVA_SET_VDEV(&dva[d], vd->vdev_id);
            DVA_SET_OFFSET(&dva[d], offset);
            DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
            DVA_SET_ASIZE(&dva[d], asize);

            return (0);
        }
next:
        mc->mc_rotor = mg->mg_next;
        mc->mc_aliquot = 0;
    } while ((mg = mg->mg_next) != rotor);

    if (!all_zero) {
        dshift++;
        ASSERT(dshift < 64);
        goto top;
    }

    if (!allocatable && !zio_lock) {
        dshift = 3;
        zio_lock = B_TRUE;
        goto top;
    }

    bzero(&dva[d], sizeof (dva_t));

    return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
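 * If 'now' is true the space is returned to the metaslab immediately;
 * otherwise it is recorded in this txg's freemap and handled at sync time.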
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
    uint64_t vdev = DVA_GET_VDEV(dva);
    uint64_t offset = DVA_GET_OFFSET(dva);
    uint64_t size = DVA_GET_ASIZE(dva);
    vdev_t *vd;
    metaslab_t *msp;

    ASSERT(DVA_IS_VALID(dva));

    if (txg > spa_freeze_txg(spa))
        return;

    if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
        (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
        cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
            (u_longlong_t)vdev, (u_longlong_t)offset);
        ASSERT(0);
        return;
    }

    msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

    if (DVA_GET_GANG(dva))
        size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

    mutex_enter(&msp->ms_lock);

    if (now) {
        space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
            offset, size);
        space_map_free(&msp->ms_map, offset, size);
    } else {
        if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
            vdev_dirty(vd, VDD_METASLAB, msp, txg);
        space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
    }

    mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
    uint64_t vdev = DVA_GET_VDEV(dva);
    uint64_t offset = DVA_GET_OFFSET(dva);
    uint64_t size = DVA_GET_ASIZE(dva);
    vdev_t *vd;
    metaslab_t *msp;
    int error = 0;

    ASSERT(DVA_IS_VALID(dva));

    if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
        (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
        return (ENXIO);

    msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

    if (DVA_GET_GANG(dva))
        size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

    mutex_enter(&msp->ms_lock);

    if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
        error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

    if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
        error = ENOENT;

    if (error || txg == 0) {	/* txg == 0 indicates dry run */
        mutex_exit(&msp->ms_lock);
        return (error);
    }

    space_map_claim(&msp->ms_map, offset, size);

    if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
        if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
            vdev_dirty(vd, VDD_METASLAB, msp, txg);
        space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
    }

    mutex_exit(&msp->ms_lock);

    return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
    dva_t *dva = bp->blk_dva;
    dva_t *hintdva = hintbp->blk_dva;
    int error = 0;

    ASSERT(bp->blk_birth == 0);
    ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

    spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

    if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
        spa_config_exit(spa, SCL_ALLOC, FTAG);
        return (ENOSPC);
    }

    ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
    ASSERT(BP_GET_NDVAS(bp) == 0);
    ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
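
    /*
     * Allocate one DVA per requested copy.  If any allocation fails,
     * unwind the DVAs allocated so far so that the bp is left untouched.
     */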
    for (int d = 0; d < ndvas; d++) {
        error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
            txg, flags);
        if (error) {
            for (d--; d >= 0; d--) {
                metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
                bzero(&dva[d], sizeof (dva_t));
            }
            spa_config_exit(spa, SCL_ALLOC, FTAG);
            return (error);
        }
    }
    ASSERT(error == 0);
    ASSERT(BP_GET_NDVAS(bp) == ndvas);

    spa_config_exit(spa, SCL_ALLOC, FTAG);

    BP_SET_BIRTH(bp, txg, txg);

    return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
    const dva_t *dva = bp->blk_dva;
    int ndvas = BP_GET_NDVAS(bp);

    ASSERT(!BP_IS_HOLE(bp));
    ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

    spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

    for (int d = 0; d < ndvas; d++)
        metaslab_free_dva(spa, &dva[d], txg, now);

    spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
    const dva_t *dva = bp->blk_dva;
    int ndvas = BP_GET_NDVAS(bp);
    int error = 0;

    ASSERT(!BP_IS_HOLE(bp));

    if (txg != 0) {
        /*
         * First do a dry run to make sure all DVAs are claimable,
         * so we don't have to unwind from partial failures below.
         */
        if ((error = metaslab_claim(spa, bp, 0)) != 0)
            return (error);
    }

    spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

    for (int d = 0; d < ndvas; d++)
        if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
            break;

    spa_config_exit(spa, SCL_ALLOC, FTAG);

    ASSERT(error == 0 || txg == 0);

    return (error);
}