/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e., search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
 */
int metaslab_smo_bonus_pct = 150;

/*
 * Should we be willing to write data to degraded vdevs?
 */
boolean_t zfs_write_to_degraded = B_FALSE;
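/*
 * For example, with the defaults above a 2GB metaslab keeps using
 * first-fit (offset-ordered) allocation until either its largest
 * contiguous free segment drops below metaslab_df_alloc_threshold
 * (SPA_MAXBLOCKSIZE) or less than 4% of it (~82MB) remains free,
 * at which point metaslab_df_alloc() below switches to best-fit
 * (size-ordered) allocation.
 */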

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}
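/*
 * Note that a heavier metaslab compares as "less than" a lighter one,
 * so avl_first() on a group's metaslab tree always returns the metaslab
 * with the highest weight; metaslab_group_alloc() relies on this
 * ordering.
 */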
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */
static int
metaslab_segsize_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}
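/*
 * For example, with align = 0x2000 and a free segment [0x3000, 0x8000),
 * ss_start rounds up to 0x4000; a 0x2000-byte request fits
 * (0x4000 + 0x2000 <= 0x8000), so 0x4000 is returned and the cursor
 * advances to 0x6000, where the next search will begin.
 */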
static void
metaslab_pp_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_pp_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}

/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_pp_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}

/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
	return (B_TRUE);
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ff_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ff_fragmented
};
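/*
 * For example, a 24K (0x6000) request has align = size & -size = 0x2000,
 * so it shares the per-alignment cursor at sm_ppd[highbit(0x2000) - 1],
 * i.e. sm_ppd[13], with every other request whose natural alignment is
 * 8K.  Keeping one cursor per power-of-two alignment class lets first-fit
 * resume where the previous allocation of similar size left off.
 */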
/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first-fit allocation scheme until space gets low and then
 * switches to a best-fit allocation method.  Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space, switch to using the size-sorted
	 * AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static boolean_t
metaslab_df_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	if (max_size >= metaslab_df_alloc_threshold &&
	    free_pct >= metaslab_df_free_pct)
		return (B_FALSE);

	return (B_TRUE);
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_df_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_df_fragmented
};

/*
 * ==========================================================================
 * Other experimental allocators
 * ==========================================================================
 */
static uint64_t
metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	uint64_t rsize = size;
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ASSERT3U(*extent_end, >=, *cursor);

	/*
	 * If we've exhausted the current extent, pick a new one from the
	 * size-sorted AVL tree (best-fit).
	 */
	if ((*cursor + size) > *extent_end) {

		t = sm->sm_pp_root;
		*cursor = *extent_end = 0;

		if (max_size > 2 * SPA_MAXBLOCKSIZE)
			rsize = MIN(metaslab_min_alloc_size, max_size);
		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
		if (offset != -1)
			*cursor = offset + size;
	} else {
		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
	}
	ASSERT3U(*cursor, <=, *extent_end);
	return (offset);
}

static boolean_t
metaslab_cdf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size * 10))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_cdf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_cdf_fragmented
};
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	avl_index_t where;
	space_seg_t *ss, ssearch;
	uint64_t hbit = highbit(size);
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
		t = sm->sm_pp_root;

		ssearch.ss_start = 0;
		ssearch.ss_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		ss = avl_find(t, &ssearch, &where);
		if (ss == NULL)
			ss = avl_nearest(t, where, AVL_AFTER);
		ASSERT(ss != NULL);
	}

	if (ss != NULL) {
		if (ss->ss_start + size <= ss->ss_end) {
			*cursor = ss->ss_start + size;
			return (ss->ss_start);
		}
	}
	return (-1ULL);
}

static boolean_t
metaslab_ndf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};
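/*
 * For example, with metaslab_ndf_clump_shift = 4, a 16K allocation that
 * misses its cursor looks in the size-sorted tree for a free segment of
 * roughly 32 times the request (512K for a 16K request, capped at the
 * largest available segment), so that small allocations tend to clump
 * together inside large free regions.
 */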
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * take a data fault on any attempt to use this metaslab before
	 * it's ready.
	 */
	space_map_create(&msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	if (metaslab_debug && smo->smo_object != 0) {
		mutex_enter(&msp->ms_lock);
		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
		mutex_exit(&msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd,
	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(&msp->ms_map);
	space_map_destroy(&msp->ms_map);

	for (int t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(&msp->ms_allocmap[t]);
		space_map_destroy(&msp->ms_freemap[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		space_map_destroy(&msp->ms_defermap[t]);

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);

	/*
	 * For locality, assign higher weight to metaslabs which have
	 * a lower offset than what we've already activated.
	 */
	if (sm->sm_start <= mg->mg_bonus_area)
		weight *= (metaslab_smo_bonus_pct / 100);
	ASSERT(weight >= space &&
	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);

	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
		/*
		 * If this metaslab is one we're actively using, adjust its
		 * weight to make it preferable to any inactive metaslab so
		 * we'll polish it off.
		 */
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}
	return (weight);
}
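/*
 * For example, a metaslab with 1GB free that sits halfway across the
 * vdev gets weight = 2 * 1GB - (1GB / 2) = 1.5GB from the bandwidth
 * adjustment in metaslab_weight(), while a metaslab at the very start
 * of the vdev keeps the full 2x multiplier.
 */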
static void
metaslab_prefetch(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m;

	mutex_enter(&mg->mg_lock);

	/*
	 * Prefetch the next potential metaslabs
	 */
	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
		space_map_t *sm = &msp->ms_map;
		space_map_obj_t *smo = &msp->ms_smo;

		/* If we have reached our prefetch limit then we're done */
		if (m >= metaslab_prefetch_limit)
			break;

		if (!sm->sm_loaded && smo->smo_object != 0) {
			mutex_exit(&mg->mg_lock);
			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
			    0ULL, smo->smo_objsize);
			mutex_enter(&mg->mg_lock);
		}
	}
	mutex_exit(&mg->mg_lock);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		space_map_load_wait(sm);
		if (!sm->sm_loaded) {
			space_map_obj_t *smo = &msp->ms_smo;

			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				space_map_walk(&msp->ms_defermap[t],
				    space_map_claim, sm);

		}

		/*
		 * Track the bonus area as we activate new metaslabs.
		 */
		if (sm->sm_start > mg->mg_bonus_area) {
			mutex_enter(&mg->mg_lock);
			mg->mg_bonus_area = sm->sm_start;
			mutex_exit(&mg->mg_lock);
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again.  In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
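/*
 * Note that the size passed in becomes the metaslab's new sort key
 * (clamped by its previous weight): on allocation failure,
 * metaslab_group_alloc() passivates with space_map_maxsize(), so the
 * metaslab's weight drops to no more than its largest free segment.
 */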
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus deferred frees (ms_defermap[]),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_walk(&msp->ms_defermap[t],
			    space_map_remove, allocmap);

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}
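	/*
	 * For example, each on-disk space map entry is a single 64-bit
	 * word, so the condensing test above fires once the on-disk
	 * object has grown to at least 16 bytes per in-core segment,
	 * i.e. at least twice the size of a freshly rewritten map.
	 */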
	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (int t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_create(&msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = freed_map->sm_space - defer_map->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add defer_map (oldest deferred frees) to this map and
	 * transfer freed_map (this txg's frees) to defer_map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);
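	/*
	 * For example, with TXG_DEFER_SIZE of 2, blocks freed in txg N
	 * land in ms_defermap[N % 2] here and are only handed back to
	 * the allocatable map when txg N + 2 syncs, so recently freed
	 * blocks are not reallocated for a couple of txgs.
	 */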
	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}

void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	int64_t failures = mg->mg_alloc_failures;

	/*
	 * Re-evaluate all metaslabs which have lower offsets than the
	 * bonus area.
	 */
	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_map.sm_start > mg->mg_bonus_area)
			break;

		mutex_enter(&msp->ms_lock);
		metaslab_group_sort(mg, msp, metaslab_weight(msp));
		mutex_exit(&msp->ms_lock);
	}

	atomic_add_64(&mg->mg_alloc_failures, -failures);

	/*
	 * Prefetch the next potential metaslabs
	 */
	metaslab_prefetch(mg);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map.sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}
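/*
 * For example, a metaslab three slots away from an existing DVA on the
 * same vdev yields a distance of 3 << vdev_ms_shift bytes, while a DVA
 * on a different vdev yields 1ULL << 63, so DVAs on other vdevs never
 * disqualify a metaslab in the distance check below.
 */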
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "failures %llu, weight %llu",
				    spa_name(spa), mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize,
				    mg->mg_alloc_failures, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}
			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		/*
		 * If we've already reached the allowable number of failed
		 * allocation attempts on this metaslab group then we
		 * consider skipping it. We skip it only if we're allowed
		 * to "fast" gang, the physical size is larger than
		 * a gang block, and we're attempting to allocate from
		 * the primary metaslab.
		 */
		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			spa_dbgmsg(spa, "%s: skipping metaslab group: "
			    "vdev %llu, txg %llu, mg %p, psize %llu, "
			    "asize %llu, failures %llu", spa_name(spa),
			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
			    mg->mg_alloc_failures);
			return (-1ULL);
		}

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request.  It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
			break;

		atomic_inc_64(&mg->mg_alloc_failures);

		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);

	mutex_exit(&msp->ms_lock);

	return (offset);
}
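/*
 * Note on activation weights above: the first DVA of a block written to
 * this group activates a metaslab as PRIMARY; if an earlier DVA of the
 * same block already lives on this vdev, the metaslab is activated as
 * SECONDARY instead, and the min_distance check tries to keep the two
 * copies far apart within the vdev.
 */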
/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 * unless the user instructs us that it is okay.
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3 &&
		    !(zfs_write_to_degraded && vd->vdev_state ==
		    VDEV_STATE_DEGRADED)) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}
/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}
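/*
 * Note that when DVA_GET_GANG() is set, only the gang header itself
 * lives at this DVA's offset, so both metaslab_free_dva() above and
 * metaslab_claim_dva() below ignore the asize recorded in the DVA and
 * operate on vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE) instead.
 */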
/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error = 0;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
		error = ENOENT;

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(&msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}
void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}