/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang or dump device related allocations
 * to "fast" gang.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

/*
 * Base number of bytes allocated from a metaslab group before the
 * allocator rotor advances to the next group; scaled by the number of
 * top-level vdev children in metaslab_group_activate().
 */
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
90 */ 91 int metaslab_smo_bonus_pct = 150; 92 93 /* 94 * ========================================================================== 95 * Metaslab classes 96 * ========================================================================== 97 */ 98 metaslab_class_t * 99 metaslab_class_create(spa_t *spa, space_map_ops_t *ops) 100 { 101 metaslab_class_t *mc; 102 103 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 104 105 mc->mc_spa = spa; 106 mc->mc_rotor = NULL; 107 mc->mc_ops = ops; 108 109 return (mc); 110 } 111 112 void 113 metaslab_class_destroy(metaslab_class_t *mc) 114 { 115 ASSERT(mc->mc_rotor == NULL); 116 ASSERT(mc->mc_alloc == 0); 117 ASSERT(mc->mc_deferred == 0); 118 ASSERT(mc->mc_space == 0); 119 ASSERT(mc->mc_dspace == 0); 120 121 kmem_free(mc, sizeof (metaslab_class_t)); 122 } 123 124 int 125 metaslab_class_validate(metaslab_class_t *mc) 126 { 127 metaslab_group_t *mg; 128 vdev_t *vd; 129 130 /* 131 * Must hold one of the spa_config locks. 132 */ 133 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 134 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 135 136 if ((mg = mc->mc_rotor) == NULL) 137 return (0); 138 139 do { 140 vd = mg->mg_vd; 141 ASSERT(vd->vdev_mg != NULL); 142 ASSERT3P(vd->vdev_top, ==, vd); 143 ASSERT3P(mg->mg_class, ==, mc); 144 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 145 } while ((mg = mg->mg_next) != mc->mc_rotor); 146 147 return (0); 148 } 149 150 void 151 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 152 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 153 { 154 atomic_add_64(&mc->mc_alloc, alloc_delta); 155 atomic_add_64(&mc->mc_deferred, defer_delta); 156 atomic_add_64(&mc->mc_space, space_delta); 157 atomic_add_64(&mc->mc_dspace, dspace_delta); 158 } 159 160 uint64_t 161 metaslab_class_get_alloc(metaslab_class_t *mc) 162 { 163 return (mc->mc_alloc); 164 } 165 166 uint64_t 167 metaslab_class_get_deferred(metaslab_class_t *mc) 168 { 169 return (mc->mc_deferred); 170 } 171 172 
uint64_t 173 metaslab_class_get_space(metaslab_class_t *mc) 174 { 175 return (mc->mc_space); 176 } 177 178 uint64_t 179 metaslab_class_get_dspace(metaslab_class_t *mc) 180 { 181 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 182 } 183 184 /* 185 * ========================================================================== 186 * Metaslab groups 187 * ========================================================================== 188 */ 189 static int 190 metaslab_compare(const void *x1, const void *x2) 191 { 192 const metaslab_t *m1 = x1; 193 const metaslab_t *m2 = x2; 194 195 if (m1->ms_weight < m2->ms_weight) 196 return (1); 197 if (m1->ms_weight > m2->ms_weight) 198 return (-1); 199 200 /* 201 * If the weights are identical, use the offset to force uniqueness. 202 */ 203 if (m1->ms_map.sm_start < m2->ms_map.sm_start) 204 return (-1); 205 if (m1->ms_map.sm_start > m2->ms_map.sm_start) 206 return (1); 207 208 ASSERT3P(m1, ==, m2); 209 210 return (0); 211 } 212 213 metaslab_group_t * 214 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 215 { 216 metaslab_group_t *mg; 217 218 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 219 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 220 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 221 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 222 mg->mg_vd = vd; 223 mg->mg_class = mc; 224 mg->mg_activation_count = 0; 225 226 return (mg); 227 } 228 229 void 230 metaslab_group_destroy(metaslab_group_t *mg) 231 { 232 ASSERT(mg->mg_prev == NULL); 233 ASSERT(mg->mg_next == NULL); 234 /* 235 * We may have gone below zero with the activation count 236 * either because we never activated in the first place or 237 * because we're done, and possibly removing the vdev. 
238 */ 239 ASSERT(mg->mg_activation_count <= 0); 240 241 avl_destroy(&mg->mg_metaslab_tree); 242 mutex_destroy(&mg->mg_lock); 243 kmem_free(mg, sizeof (metaslab_group_t)); 244 } 245 246 void 247 metaslab_group_activate(metaslab_group_t *mg) 248 { 249 metaslab_class_t *mc = mg->mg_class; 250 metaslab_group_t *mgprev, *mgnext; 251 252 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 253 254 ASSERT(mc->mc_rotor != mg); 255 ASSERT(mg->mg_prev == NULL); 256 ASSERT(mg->mg_next == NULL); 257 ASSERT(mg->mg_activation_count <= 0); 258 259 if (++mg->mg_activation_count <= 0) 260 return; 261 262 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 263 264 if ((mgprev = mc->mc_rotor) == NULL) { 265 mg->mg_prev = mg; 266 mg->mg_next = mg; 267 } else { 268 mgnext = mgprev->mg_next; 269 mg->mg_prev = mgprev; 270 mg->mg_next = mgnext; 271 mgprev->mg_next = mg; 272 mgnext->mg_prev = mg; 273 } 274 mc->mc_rotor = mg; 275 } 276 277 void 278 metaslab_group_passivate(metaslab_group_t *mg) 279 { 280 metaslab_class_t *mc = mg->mg_class; 281 metaslab_group_t *mgprev, *mgnext; 282 283 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 284 285 if (--mg->mg_activation_count != 0) { 286 ASSERT(mc->mc_rotor != mg); 287 ASSERT(mg->mg_prev == NULL); 288 ASSERT(mg->mg_next == NULL); 289 ASSERT(mg->mg_activation_count < 0); 290 return; 291 } 292 293 mgprev = mg->mg_prev; 294 mgnext = mg->mg_next; 295 296 if (mg == mgnext) { 297 mc->mc_rotor = NULL; 298 } else { 299 mc->mc_rotor = mgnext; 300 mgprev->mg_next = mgnext; 301 mgnext->mg_prev = mgprev; 302 } 303 304 mg->mg_prev = NULL; 305 mg->mg_next = NULL; 306 } 307 308 static void 309 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 310 { 311 mutex_enter(&mg->mg_lock); 312 ASSERT(msp->ms_group == NULL); 313 msp->ms_group = mg; 314 msp->ms_weight = 0; 315 avl_add(&mg->mg_metaslab_tree, msp); 316 mutex_exit(&mg->mg_lock); 317 } 318 319 static void 320 metaslab_group_remove(metaslab_group_t *mg, metaslab_t 
*msp) 321 { 322 mutex_enter(&mg->mg_lock); 323 ASSERT(msp->ms_group == mg); 324 avl_remove(&mg->mg_metaslab_tree, msp); 325 msp->ms_group = NULL; 326 mutex_exit(&mg->mg_lock); 327 } 328 329 static void 330 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 331 { 332 /* 333 * Although in principle the weight can be any value, in 334 * practice we do not use values in the range [1, 510]. 335 */ 336 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); 337 ASSERT(MUTEX_HELD(&msp->ms_lock)); 338 339 mutex_enter(&mg->mg_lock); 340 ASSERT(msp->ms_group == mg); 341 avl_remove(&mg->mg_metaslab_tree, msp); 342 msp->ms_weight = weight; 343 avl_add(&mg->mg_metaslab_tree, msp); 344 mutex_exit(&mg->mg_lock); 345 } 346 347 /* 348 * ========================================================================== 349 * Common allocator routines 350 * ========================================================================== 351 */ 352 static int 353 metaslab_segsize_compare(const void *x1, const void *x2) 354 { 355 const space_seg_t *s1 = x1; 356 const space_seg_t *s2 = x2; 357 uint64_t ss_size1 = s1->ss_end - s1->ss_start; 358 uint64_t ss_size2 = s2->ss_end - s2->ss_start; 359 360 if (ss_size1 < ss_size2) 361 return (-1); 362 if (ss_size1 > ss_size2) 363 return (1); 364 365 if (s1->ss_start < s2->ss_start) 366 return (-1); 367 if (s1->ss_start > s2->ss_start) 368 return (1); 369 370 return (0); 371 } 372 373 /* 374 * This is a helper function that can be used by the allocator to find 375 * a suitable block to allocate. This will search the specified AVL 376 * tree looking for a block that matches the specified criteria. 
377 */ 378 static uint64_t 379 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 380 uint64_t align) 381 { 382 space_seg_t *ss, ssearch; 383 avl_index_t where; 384 385 ssearch.ss_start = *cursor; 386 ssearch.ss_end = *cursor + size; 387 388 ss = avl_find(t, &ssearch, &where); 389 if (ss == NULL) 390 ss = avl_nearest(t, where, AVL_AFTER); 391 392 while (ss != NULL) { 393 uint64_t offset = P2ROUNDUP(ss->ss_start, align); 394 395 if (offset + size <= ss->ss_end) { 396 *cursor = offset + size; 397 return (offset); 398 } 399 ss = AVL_NEXT(t, ss); 400 } 401 402 /* 403 * If we know we've searched the whole map (*cursor == 0), give up. 404 * Otherwise, reset the cursor to the beginning and try again. 405 */ 406 if (*cursor == 0) 407 return (-1ULL); 408 409 *cursor = 0; 410 return (metaslab_block_picker(t, cursor, size, align)); 411 } 412 413 static void 414 metaslab_pp_load(space_map_t *sm) 415 { 416 space_seg_t *ss; 417 418 ASSERT(sm->sm_ppd == NULL); 419 sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); 420 421 sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 422 avl_create(sm->sm_pp_root, metaslab_segsize_compare, 423 sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); 424 425 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) 426 avl_add(sm->sm_pp_root, ss); 427 } 428 429 static void 430 metaslab_pp_unload(space_map_t *sm) 431 { 432 void *cookie = NULL; 433 434 kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); 435 sm->sm_ppd = NULL; 436 437 while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { 438 /* tear down the tree */ 439 } 440 441 avl_destroy(sm->sm_pp_root); 442 kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); 443 sm->sm_pp_root = NULL; 444 } 445 446 /* ARGSUSED */ 447 static void 448 metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) 449 { 450 /* No need to update cursor */ 451 } 452 453 /* ARGSUSED */ 454 static void 455 metaslab_pp_free(space_map_t *sm, uint64_t start, 
uint64_t size) 456 { 457 /* No need to update cursor */ 458 } 459 460 /* 461 * Return the maximum contiguous segment within the metaslab. 462 */ 463 uint64_t 464 metaslab_pp_maxsize(space_map_t *sm) 465 { 466 avl_tree_t *t = sm->sm_pp_root; 467 space_seg_t *ss; 468 469 if (t == NULL || (ss = avl_last(t)) == NULL) 470 return (0ULL); 471 472 return (ss->ss_end - ss->ss_start); 473 } 474 475 /* 476 * ========================================================================== 477 * The first-fit block allocator 478 * ========================================================================== 479 */ 480 static uint64_t 481 metaslab_ff_alloc(space_map_t *sm, uint64_t size) 482 { 483 avl_tree_t *t = &sm->sm_root; 484 uint64_t align = size & -size; 485 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 486 487 return (metaslab_block_picker(t, cursor, size, align)); 488 } 489 490 /* ARGSUSED */ 491 boolean_t 492 metaslab_ff_fragmented(space_map_t *sm) 493 { 494 return (B_TRUE); 495 } 496 497 static space_map_ops_t metaslab_ff_ops = { 498 metaslab_pp_load, 499 metaslab_pp_unload, 500 metaslab_ff_alloc, 501 metaslab_pp_claim, 502 metaslab_pp_free, 503 metaslab_pp_maxsize, 504 metaslab_ff_fragmented 505 }; 506 507 /* 508 * ========================================================================== 509 * Dynamic block allocator - 510 * Uses the first fit allocation scheme until space get low and then 511 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 512 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 
513 * ========================================================================== 514 */ 515 static uint64_t 516 metaslab_df_alloc(space_map_t *sm, uint64_t size) 517 { 518 avl_tree_t *t = &sm->sm_root; 519 uint64_t align = size & -size; 520 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 521 uint64_t max_size = metaslab_pp_maxsize(sm); 522 int free_pct = sm->sm_space * 100 / sm->sm_size; 523 524 ASSERT(MUTEX_HELD(sm->sm_lock)); 525 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 526 527 if (max_size < size) 528 return (-1ULL); 529 530 /* 531 * If we're running low on space switch to using the size 532 * sorted AVL tree (best-fit). 533 */ 534 if (max_size < metaslab_df_alloc_threshold || 535 free_pct < metaslab_df_free_pct) { 536 t = sm->sm_pp_root; 537 *cursor = 0; 538 } 539 540 return (metaslab_block_picker(t, cursor, size, 1ULL)); 541 } 542 543 static boolean_t 544 metaslab_df_fragmented(space_map_t *sm) 545 { 546 uint64_t max_size = metaslab_pp_maxsize(sm); 547 int free_pct = sm->sm_space * 100 / sm->sm_size; 548 549 if (max_size >= metaslab_df_alloc_threshold && 550 free_pct >= metaslab_df_free_pct) 551 return (B_FALSE); 552 553 return (B_TRUE); 554 } 555 556 static space_map_ops_t metaslab_df_ops = { 557 metaslab_pp_load, 558 metaslab_pp_unload, 559 metaslab_df_alloc, 560 metaslab_pp_claim, 561 metaslab_pp_free, 562 metaslab_pp_maxsize, 563 metaslab_df_fragmented 564 }; 565 566 /* 567 * ========================================================================== 568 * Other experimental allocators 569 * ========================================================================== 570 */ 571 static uint64_t 572 metaslab_cdf_alloc(space_map_t *sm, uint64_t size) 573 { 574 avl_tree_t *t = &sm->sm_root; 575 uint64_t *cursor = (uint64_t *)sm->sm_ppd; 576 uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; 577 uint64_t max_size = metaslab_pp_maxsize(sm); 578 uint64_t rsize = size; 579 uint64_t offset = 0; 580 581 
ASSERT(MUTEX_HELD(sm->sm_lock)); 582 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 583 584 if (max_size < size) 585 return (-1ULL); 586 587 ASSERT3U(*extent_end, >=, *cursor); 588 589 /* 590 * If we're running low on space switch to using the size 591 * sorted AVL tree (best-fit). 592 */ 593 if ((*cursor + size) > *extent_end) { 594 595 t = sm->sm_pp_root; 596 *cursor = *extent_end = 0; 597 598 if (max_size > 2 * SPA_MAXBLOCKSIZE) 599 rsize = MIN(metaslab_min_alloc_size, max_size); 600 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); 601 if (offset != -1) 602 *cursor = offset + size; 603 } else { 604 offset = metaslab_block_picker(t, cursor, rsize, 1ULL); 605 } 606 ASSERT3U(*cursor, <=, *extent_end); 607 return (offset); 608 } 609 610 static boolean_t 611 metaslab_cdf_fragmented(space_map_t *sm) 612 { 613 uint64_t max_size = metaslab_pp_maxsize(sm); 614 615 if (max_size > (metaslab_min_alloc_size * 10)) 616 return (B_FALSE); 617 return (B_TRUE); 618 } 619 620 static space_map_ops_t metaslab_cdf_ops = { 621 metaslab_pp_load, 622 metaslab_pp_unload, 623 metaslab_cdf_alloc, 624 metaslab_pp_claim, 625 metaslab_pp_free, 626 metaslab_pp_maxsize, 627 metaslab_cdf_fragmented 628 }; 629 630 uint64_t metaslab_ndf_clump_shift = 4; 631 632 static uint64_t 633 metaslab_ndf_alloc(space_map_t *sm, uint64_t size) 634 { 635 avl_tree_t *t = &sm->sm_root; 636 avl_index_t where; 637 space_seg_t *ss, ssearch; 638 uint64_t hbit = highbit(size); 639 uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; 640 uint64_t max_size = metaslab_pp_maxsize(sm); 641 642 ASSERT(MUTEX_HELD(sm->sm_lock)); 643 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 644 645 if (max_size < size) 646 return (-1ULL); 647 648 ssearch.ss_start = *cursor; 649 ssearch.ss_end = *cursor + size; 650 651 ss = avl_find(t, &ssearch, &where); 652 if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { 653 t = sm->sm_pp_root; 654 655 ssearch.ss_start = 0; 656 
ssearch.ss_end = MIN(max_size, 657 1ULL << (hbit + metaslab_ndf_clump_shift)); 658 ss = avl_find(t, &ssearch, &where); 659 if (ss == NULL) 660 ss = avl_nearest(t, where, AVL_AFTER); 661 ASSERT(ss != NULL); 662 } 663 664 if (ss != NULL) { 665 if (ss->ss_start + size <= ss->ss_end) { 666 *cursor = ss->ss_start + size; 667 return (ss->ss_start); 668 } 669 } 670 return (-1ULL); 671 } 672 673 static boolean_t 674 metaslab_ndf_fragmented(space_map_t *sm) 675 { 676 uint64_t max_size = metaslab_pp_maxsize(sm); 677 678 if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) 679 return (B_FALSE); 680 return (B_TRUE); 681 } 682 683 684 static space_map_ops_t metaslab_ndf_ops = { 685 metaslab_pp_load, 686 metaslab_pp_unload, 687 metaslab_ndf_alloc, 688 metaslab_pp_claim, 689 metaslab_pp_free, 690 metaslab_pp_maxsize, 691 metaslab_ndf_fragmented 692 }; 693 694 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 695 696 /* 697 * ========================================================================== 698 * Metaslabs 699 * ========================================================================== 700 */ 701 metaslab_t * 702 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, 703 uint64_t start, uint64_t size, uint64_t txg) 704 { 705 vdev_t *vd = mg->mg_vd; 706 metaslab_t *msp; 707 708 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 709 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); 710 711 msp->ms_smo_syncing = *smo; 712 713 /* 714 * We create the main space map here, but we don't create the 715 * allocmaps and freemaps until metaslab_sync_done(). This serves 716 * two purposes: it allows metaslab_sync_done() to detect the 717 * addition of new space; and for debugging, it ensures that we'd 718 * data fault on any attempt to use this metaslab before it's ready. 
719 */ 720 space_map_create(&msp->ms_map, start, size, 721 vd->vdev_ashift, &msp->ms_lock); 722 723 metaslab_group_add(mg, msp); 724 725 if (metaslab_debug && smo->smo_object != 0) { 726 mutex_enter(&msp->ms_lock); 727 VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, 728 SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); 729 mutex_exit(&msp->ms_lock); 730 } 731 732 /* 733 * If we're opening an existing pool (txg == 0) or creating 734 * a new one (txg == TXG_INITIAL), all space is available now. 735 * If we're adding space to an existing pool, the new space 736 * does not become available until after this txg has synced. 737 */ 738 if (txg <= TXG_INITIAL) 739 metaslab_sync_done(msp, 0); 740 741 if (txg != 0) { 742 vdev_dirty(vd, 0, NULL, txg); 743 vdev_dirty(vd, VDD_METASLAB, msp, txg); 744 } 745 746 return (msp); 747 } 748 749 void 750 metaslab_fini(metaslab_t *msp) 751 { 752 metaslab_group_t *mg = msp->ms_group; 753 754 vdev_space_update(mg->mg_vd, 755 -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); 756 757 metaslab_group_remove(mg, msp); 758 759 mutex_enter(&msp->ms_lock); 760 761 space_map_unload(&msp->ms_map); 762 space_map_destroy(&msp->ms_map); 763 764 for (int t = 0; t < TXG_SIZE; t++) { 765 space_map_destroy(&msp->ms_allocmap[t]); 766 space_map_destroy(&msp->ms_freemap[t]); 767 } 768 769 for (int t = 0; t < TXG_DEFER_SIZE; t++) 770 space_map_destroy(&msp->ms_defermap[t]); 771 772 ASSERT3S(msp->ms_deferspace, ==, 0); 773 774 mutex_exit(&msp->ms_lock); 775 mutex_destroy(&msp->ms_lock); 776 777 kmem_free(msp, sizeof (metaslab_t)); 778 } 779 780 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 781 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 782 #define METASLAB_ACTIVE_MASK \ 783 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 784 785 static uint64_t 786 metaslab_weight(metaslab_t *msp) 787 { 788 metaslab_group_t *mg = msp->ms_group; 789 space_map_t *sm = &msp->ms_map; 790 space_map_obj_t *smo = &msp->ms_smo; 791 vdev_t *vd = mg->mg_vd; 792 
uint64_t weight, space; 793 794 ASSERT(MUTEX_HELD(&msp->ms_lock)); 795 796 /* 797 * The baseline weight is the metaslab's free space. 798 */ 799 space = sm->sm_size - smo->smo_alloc; 800 weight = space; 801 802 /* 803 * Modern disks have uniform bit density and constant angular velocity. 804 * Therefore, the outer recording zones are faster (higher bandwidth) 805 * than the inner zones by the ratio of outer to inner track diameter, 806 * which is typically around 2:1. We account for this by assigning 807 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 808 * In effect, this means that we'll select the metaslab with the most 809 * free bandwidth rather than simply the one with the most free space. 810 */ 811 weight = 2 * weight - 812 ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; 813 ASSERT(weight >= space && weight <= 2 * space); 814 815 /* 816 * For locality, assign higher weight to metaslabs which have 817 * a lower offset than what we've already activated. 818 */ 819 if (sm->sm_start <= mg->mg_bonus_area) 820 weight *= (metaslab_smo_bonus_pct / 100); 821 ASSERT(weight >= space && 822 weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); 823 824 if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { 825 /* 826 * If this metaslab is one we're actively using, adjust its 827 * weight to make it preferable to any inactive metaslab so 828 * we'll polish it off. 
829 */ 830 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 831 } 832 return (weight); 833 } 834 835 static void 836 metaslab_prefetch(metaslab_group_t *mg) 837 { 838 spa_t *spa = mg->mg_vd->vdev_spa; 839 metaslab_t *msp; 840 avl_tree_t *t = &mg->mg_metaslab_tree; 841 int m; 842 843 mutex_enter(&mg->mg_lock); 844 845 /* 846 * Prefetch the next potential metaslabs 847 */ 848 for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { 849 space_map_t *sm = &msp->ms_map; 850 space_map_obj_t *smo = &msp->ms_smo; 851 852 /* If we have reached our prefetch limit then we're done */ 853 if (m >= metaslab_prefetch_limit) 854 break; 855 856 if (!sm->sm_loaded && smo->smo_object != 0) { 857 mutex_exit(&mg->mg_lock); 858 dmu_prefetch(spa_meta_objset(spa), smo->smo_object, 859 0ULL, smo->smo_objsize); 860 mutex_enter(&mg->mg_lock); 861 } 862 } 863 mutex_exit(&mg->mg_lock); 864 } 865 866 static int 867 metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 868 { 869 metaslab_group_t *mg = msp->ms_group; 870 space_map_t *sm = &msp->ms_map; 871 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; 872 873 ASSERT(MUTEX_HELD(&msp->ms_lock)); 874 875 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 876 space_map_load_wait(sm); 877 if (!sm->sm_loaded) { 878 int error = space_map_load(sm, sm_ops, SM_FREE, 879 &msp->ms_smo, 880 spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); 881 if (error) { 882 metaslab_group_sort(msp->ms_group, msp, 0); 883 return (error); 884 } 885 for (int t = 0; t < TXG_DEFER_SIZE; t++) 886 space_map_walk(&msp->ms_defermap[t], 887 space_map_claim, sm); 888 889 } 890 891 /* 892 * Track the bonus area as we activate new metaslabs. 
893 */ 894 if (sm->sm_start > mg->mg_bonus_area) { 895 mutex_enter(&mg->mg_lock); 896 mg->mg_bonus_area = sm->sm_start; 897 mutex_exit(&mg->mg_lock); 898 } 899 900 metaslab_group_sort(msp->ms_group, msp, 901 msp->ms_weight | activation_weight); 902 } 903 ASSERT(sm->sm_loaded); 904 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 905 906 return (0); 907 } 908 909 static void 910 metaslab_passivate(metaslab_t *msp, uint64_t size) 911 { 912 /* 913 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 914 * this metaslab again. In that case, it had better be empty, 915 * or we would be leaving space on the table. 916 */ 917 ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); 918 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 919 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 920 } 921 922 /* 923 * Write a metaslab to disk in the context of the specified transaction group. 924 */ 925 void 926 metaslab_sync(metaslab_t *msp, uint64_t txg) 927 { 928 vdev_t *vd = msp->ms_group->mg_vd; 929 spa_t *spa = vd->vdev_spa; 930 objset_t *mos = spa_meta_objset(spa); 931 space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; 932 space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; 933 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 934 space_map_t *sm = &msp->ms_map; 935 space_map_obj_t *smo = &msp->ms_smo_syncing; 936 dmu_buf_t *db; 937 dmu_tx_t *tx; 938 939 ASSERT(!vd->vdev_ishole); 940 941 if (allocmap->sm_space == 0 && freemap->sm_space == 0) 942 return; 943 944 /* 945 * The only state that can actually be changing concurrently with 946 * metaslab_sync() is the metaslab's ms_map. No other thread can 947 * be modifying this txg's allocmap, freemap, freed_map, or smo. 948 * Therefore, we only hold ms_lock to satify space_map ASSERTs. 949 * We drop it whenever we call into the DMU, because the DMU 950 * can call down to us (e.g. via zio_free()) at any time. 
951 */ 952 953 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 954 955 if (smo->smo_object == 0) { 956 ASSERT(smo->smo_objsize == 0); 957 ASSERT(smo->smo_alloc == 0); 958 smo->smo_object = dmu_object_alloc(mos, 959 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 960 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 961 ASSERT(smo->smo_object != 0); 962 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 963 (sm->sm_start >> vd->vdev_ms_shift), 964 sizeof (uint64_t), &smo->smo_object, tx); 965 } 966 967 mutex_enter(&msp->ms_lock); 968 969 space_map_walk(freemap, space_map_add, freed_map); 970 971 if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= 972 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { 973 /* 974 * The in-core space map representation is twice as compact 975 * as the on-disk one, so it's time to condense the latter 976 * by generating a pure allocmap from first principles. 977 * 978 * This metaslab is 100% allocated, 979 * minus the content of the in-core map (sm), 980 * minus what's been freed this txg (freed_map), 981 * minus deferred frees (ms_defermap[]), 982 * minus allocations from txgs in the future 983 * (because they haven't been committed yet). 
984 */ 985 space_map_vacate(allocmap, NULL, NULL); 986 space_map_vacate(freemap, NULL, NULL); 987 988 space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); 989 990 space_map_walk(sm, space_map_remove, allocmap); 991 space_map_walk(freed_map, space_map_remove, allocmap); 992 993 for (int t = 0; t < TXG_DEFER_SIZE; t++) 994 space_map_walk(&msp->ms_defermap[t], 995 space_map_remove, allocmap); 996 997 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 998 space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], 999 space_map_remove, allocmap); 1000 1001 mutex_exit(&msp->ms_lock); 1002 space_map_truncate(smo, mos, tx); 1003 mutex_enter(&msp->ms_lock); 1004 } 1005 1006 space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); 1007 space_map_sync(freemap, SM_FREE, smo, mos, tx); 1008 1009 mutex_exit(&msp->ms_lock); 1010 1011 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1012 dmu_buf_will_dirty(db, tx); 1013 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1014 bcopy(smo, db->db_data, sizeof (*smo)); 1015 dmu_buf_rele(db, FTAG); 1016 1017 dmu_tx_commit(tx); 1018 } 1019 1020 /* 1021 * Called after a transaction group has completely synced to mark 1022 * all of the metaslab's free space as usable. 1023 */ 1024 void 1025 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 1026 { 1027 space_map_obj_t *smo = &msp->ms_smo; 1028 space_map_obj_t *smosync = &msp->ms_smo_syncing; 1029 space_map_t *sm = &msp->ms_map; 1030 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 1031 space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; 1032 metaslab_group_t *mg = msp->ms_group; 1033 vdev_t *vd = mg->mg_vd; 1034 int64_t alloc_delta, defer_delta; 1035 1036 ASSERT(!vd->vdev_ishole); 1037 1038 mutex_enter(&msp->ms_lock); 1039 1040 /* 1041 * If this metaslab is just becoming available, initialize its 1042 * allocmaps and freemaps and add its capacity to the vdev. 
1043 */ 1044 if (freed_map->sm_size == 0) { 1045 for (int t = 0; t < TXG_SIZE; t++) { 1046 space_map_create(&msp->ms_allocmap[t], sm->sm_start, 1047 sm->sm_size, sm->sm_shift, sm->sm_lock); 1048 space_map_create(&msp->ms_freemap[t], sm->sm_start, 1049 sm->sm_size, sm->sm_shift, sm->sm_lock); 1050 } 1051 1052 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1053 space_map_create(&msp->ms_defermap[t], sm->sm_start, 1054 sm->sm_size, sm->sm_shift, sm->sm_lock); 1055 1056 vdev_space_update(vd, 0, 0, sm->sm_size); 1057 } 1058 1059 alloc_delta = smosync->smo_alloc - smo->smo_alloc; 1060 defer_delta = freed_map->sm_space - defer_map->sm_space; 1061 1062 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 1063 1064 ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); 1065 ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); 1066 1067 /* 1068 * If there's a space_map_load() in progress, wait for it to complete 1069 * so that we have a consistent view of the in-core space map. 1070 * Then, add defer_map (oldest deferred frees) to this map and 1071 * transfer freed_map (this txg's frees) to defer_map. 1072 */ 1073 space_map_load_wait(sm); 1074 space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); 1075 space_map_vacate(freed_map, space_map_add, defer_map); 1076 1077 *smo = *smosync; 1078 1079 msp->ms_deferspace += defer_delta; 1080 ASSERT3S(msp->ms_deferspace, >=, 0); 1081 ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); 1082 if (msp->ms_deferspace != 0) { 1083 /* 1084 * Keep syncing this metaslab until all deferred frees 1085 * are back in circulation. 1086 */ 1087 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1088 } 1089 1090 /* 1091 * If the map is loaded but no longer active, evict it as soon as all 1092 * future allocations have synced. (If we unloaded it now and then 1093 * loaded a moment later, the map wouldn't reflect those allocations.) 
1094 */ 1095 if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1096 int evictable = 1; 1097 1098 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 1099 if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) 1100 evictable = 0; 1101 1102 if (evictable && !metaslab_debug) 1103 space_map_unload(sm); 1104 } 1105 1106 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1107 1108 mutex_exit(&msp->ms_lock); 1109 } 1110 1111 void 1112 metaslab_sync_reassess(metaslab_group_t *mg) 1113 { 1114 vdev_t *vd = mg->mg_vd; 1115 int64_t failures = mg->mg_alloc_failures; 1116 1117 /* 1118 * Re-evaluate all metaslabs which have lower offsets than the 1119 * bonus area. 1120 */ 1121 for (int m = 0; m < vd->vdev_ms_count; m++) { 1122 metaslab_t *msp = vd->vdev_ms[m]; 1123 1124 if (msp->ms_map.sm_start > mg->mg_bonus_area) 1125 break; 1126 1127 mutex_enter(&msp->ms_lock); 1128 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1129 mutex_exit(&msp->ms_lock); 1130 } 1131 1132 atomic_add_64(&mg->mg_alloc_failures, -failures); 1133 1134 /* 1135 * Prefetch the next potential metaslabs 1136 */ 1137 metaslab_prefetch(mg); 1138 } 1139 1140 static uint64_t 1141 metaslab_distance(metaslab_t *msp, dva_t *dva) 1142 { 1143 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 1144 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 1145 uint64_t start = msp->ms_map.sm_start >> ms_shift; 1146 1147 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 1148 return (1ULL << 63); 1149 1150 if (offset < start) 1151 return ((start - offset) << ms_shift); 1152 if (offset > start) 1153 return ((offset - start) << ms_shift); 1154 return (0); 1155 } 1156 1157 static uint64_t 1158 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 1159 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) 1160 { 1161 spa_t *spa = mg->mg_vd->vdev_spa; 1162 metaslab_t *msp = NULL; 1163 uint64_t offset = -1ULL; 1164 avl_tree_t *t = &mg->mg_metaslab_tree; 1165 uint64_t 
activation_weight; 1166 uint64_t target_distance; 1167 int i; 1168 1169 activation_weight = METASLAB_WEIGHT_PRIMARY; 1170 for (i = 0; i < d; i++) { 1171 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 1172 activation_weight = METASLAB_WEIGHT_SECONDARY; 1173 break; 1174 } 1175 } 1176 1177 for (;;) { 1178 boolean_t was_active; 1179 1180 mutex_enter(&mg->mg_lock); 1181 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 1182 if (msp->ms_weight < asize) { 1183 spa_dbgmsg(spa, "%s: failed to meet weight " 1184 "requirement: vdev %llu, txg %llu, mg %p, " 1185 "msp %p, psize %llu, asize %llu, " 1186 "failures %llu, weight %llu", 1187 spa_name(spa), mg->mg_vd->vdev_id, txg, 1188 mg, msp, psize, asize, 1189 mg->mg_alloc_failures, msp->ms_weight); 1190 mutex_exit(&mg->mg_lock); 1191 return (-1ULL); 1192 } 1193 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1194 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 1195 break; 1196 1197 target_distance = min_distance + 1198 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); 1199 1200 for (i = 0; i < d; i++) 1201 if (metaslab_distance(msp, &dva[i]) < 1202 target_distance) 1203 break; 1204 if (i == d) 1205 break; 1206 } 1207 mutex_exit(&mg->mg_lock); 1208 if (msp == NULL) 1209 return (-1ULL); 1210 1211 /* 1212 * If we've already reached the allowable number of failed 1213 * allocation attempts on this metaslab group then we 1214 * consider skipping it. We skip it only if we're allowed 1215 * to "fast" gang, the physical size is larger than 1216 * a gang block, and we're attempting to allocate from 1217 * the primary metaslab. 
1218 */ 1219 if (mg->mg_alloc_failures > zfs_mg_alloc_failures && 1220 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && 1221 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1222 spa_dbgmsg(spa, "%s: skipping metaslab group: " 1223 "vdev %llu, txg %llu, mg %p, psize %llu, " 1224 "asize %llu, failures %llu", spa_name(spa), 1225 mg->mg_vd->vdev_id, txg, mg, psize, asize, 1226 mg->mg_alloc_failures); 1227 return (-1ULL); 1228 } 1229 1230 mutex_enter(&msp->ms_lock); 1231 1232 /* 1233 * Ensure that the metaslab we have selected is still 1234 * capable of handling our request. It's possible that 1235 * another thread may have changed the weight while we 1236 * were blocked on the metaslab lock. 1237 */ 1238 if (msp->ms_weight < asize || (was_active && 1239 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1240 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1241 mutex_exit(&msp->ms_lock); 1242 continue; 1243 } 1244 1245 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1246 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1247 metaslab_passivate(msp, 1248 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1249 mutex_exit(&msp->ms_lock); 1250 continue; 1251 } 1252 1253 if (metaslab_activate(msp, activation_weight) != 0) { 1254 mutex_exit(&msp->ms_lock); 1255 continue; 1256 } 1257 1258 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) 1259 break; 1260 1261 atomic_inc_64(&mg->mg_alloc_failures); 1262 1263 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); 1264 1265 mutex_exit(&msp->ms_lock); 1266 } 1267 1268 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1269 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1270 1271 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); 1272 1273 mutex_exit(&msp->ms_lock); 1274 1275 return (offset); 1276 } 1277 1278 /* 1279 * Allocate a block for the specified i/o. 
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		/* Ditto block: start on the vdev after the previous DVA's. */
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			/*
			 * On the retry pass, hold SCL_ZIO so the vdev state
			 * can't change while we test allocatability.
			 */
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 * (only on the first, strictest pass: d == 0 && dshift == 3).
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		/*
		 * Required spread between copies on this vdev; relaxed
		 * (dshift grows) each time we fail a full rotation.
		 */
		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				/* +1 avoids divide-by-zero on empty space */
				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/* Relax the inter-copy distance requirement and retry. */
	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	/* Last resort: retry with the config lock held (see zio_lock above). */
	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	/* A frozen pool (ziltest) never frees anything after the freeze txg. */
	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	/* A gang header DVA's true size is a single gang block. */
	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		/* Undo an allocation from this same (uncommitted) txg. */
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error = 0;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	/* Load the space map so we can verify the range really is free. */
	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
		error = ENOENT;

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(&msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

/*
 * Allocate ndvas DVAs for bp in txg; on any failure, already-allocated
 * DVAs are unwound and freed.  Returns 0 or ENOSPC.
 */
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	/*
	 * NOTE(review): hintbp is taken before the hintbp == NULL assert
	 * below; blk_dva is an array member so this is address arithmetic,
	 * not a load — confirm callers always pass a non-NULL hintbp.
	 */
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			/* Unwind: free every DVA allocated so far. */
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}

/*
 * Free every DVA of bp in txg.  'now' means the allocation never
 * committed, so the space goes straight back to the free map instead
 * of through the per-txg freemap.
 */
void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

/*
 * Claim every DVA of bp in txg (intent-log replay).  A dry run (txg 0)
 * is performed first so a partial claim never needs unwinding.
 * Returns 0 or the first per-DVA error (ENXIO/ENOENT/...).
 */
int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}