/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space_map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
 */
int metaslab_smo_bonus_pct = 150;

/*
 * Should we be willing to write data to degraded vdevs?
 */
boolean_t zfs_write_to_degraded = B_FALSE;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map->sm_start < m2->ms_map->sm_start)
		return (-1);
	if (m1->ms_map->sm_start > m2->ms_map->sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */
static int
metaslab_segsize_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}
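
/*
 * Illustration of metaslab_block_picker() above (the numbers are made up
 * and not taken from any particular pool): with align = 0x1000 and a
 * candidate segment [0x12345, 0x20000), P2ROUNDUP() yields offset 0x13000.
 * An 8K (0x2000) request fits because 0x13000 + 0x2000 <= 0x20000, so the
 * picker returns 0x13000 and advances *cursor to 0x15000. Only when the
 * cursor has wrapped back to 0 and still nothing fits does the search
 * give up.
 */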

static void
metaslab_pp_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_pp_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}

/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_pp_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}

/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
	return (B_TRUE);
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ff_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ff_fragmented
};
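
/*
 * A note on the cursor arithmetic used by metaslab_ff_alloc() above (and by
 * metaslab_df_alloc() below): sm_ppd is the 64-entry cursor array set up in
 * metaslab_pp_load(), one cursor per power-of-two alignment. "size & -size"
 * isolates the lowest set bit of the request. For example (illustrative
 * numbers), a 24K (0x6000) request yields align = 0x2000, and
 * highbit(0x2000) - 1 = 13 selects cursor slot 13, so requests with the
 * same natural alignment share a cursor.
 */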

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static boolean_t
metaslab_df_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	if (max_size >= metaslab_df_alloc_threshold &&
	    free_pct >= metaslab_df_free_pct)
		return (B_FALSE);

	return (B_TRUE);
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_df_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_df_fragmented
};

/*
 * ==========================================================================
 * Other experimental allocators
 * ==========================================================================
 */
static uint64_t
metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	uint64_t rsize = size;
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ASSERT3U(*extent_end, >=, *cursor);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if ((*cursor + size) > *extent_end) {

		t = sm->sm_pp_root;
		*cursor = *extent_end = 0;

		if (max_size > 2 * SPA_MAXBLOCKSIZE)
			rsize = MIN(metaslab_min_alloc_size, max_size);
		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
		if (offset != -1)
			*cursor = offset + size;
	} else {
		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
	}
	ASSERT3U(*cursor, <=, *extent_end);
	return (offset);
}

static boolean_t
metaslab_cdf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size * 10))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_cdf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_cdf_fragmented
};

uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	avl_index_t where;
	space_seg_t *ss, ssearch;
	uint64_t hbit = highbit(size);
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
		t = sm->sm_pp_root;

		ssearch.ss_start = 0;
		ssearch.ss_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		ss = avl_find(t, &ssearch, &where);
		if (ss == NULL)
			ss = avl_nearest(t, where, AVL_AFTER);
		ASSERT(ss != NULL);
	}

	if (ss != NULL) {
		if (ss->ss_start + size <= ss->ss_end) {
			*cursor = ss->ss_start + size;
			return (ss->ss_start);
		}
	}
	return (-1ULL);
}

static boolean_t
metaslab_ndf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
		return (B_FALSE);
	return (B_TRUE);
}


static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};
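
/*
 * Illustration of the ndf clumping above (made-up numbers): a 4K request
 * has hbit = highbit(4096) = 13, so on a cursor miss the size-sorted tree
 * is searched for a segment of roughly MIN(max_size, 1ULL << (13 + 4)),
 * i.e. 128K with the default metaslab_ndf_clump_shift of 4. Small
 * allocations are therefore carved out of a much larger free "clump", and
 * the per-size cursor keeps subsequent requests of that size within it.
 */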

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
	uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
	space_map_create(msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	if (metaslab_debug && smo->smo_object != 0) {
		mutex_enter(&msp->ms_lock);
		VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
		mutex_exit(&msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd,
	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(msp->ms_map);
	space_map_destroy(msp->ms_map);
	kmem_free(msp->ms_map, sizeof (*msp->ms_map));

	for (int t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(msp->ms_allocmap[t]);
		space_map_destroy(msp->ms_freemap[t]);
		kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
		kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		space_map_destroy(msp->ms_defermap[t]);
		kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
	}

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);

	/*
	 * For locality, assign higher weight to metaslabs which have
	 * a lower offset than what we've already activated.
	 */
	if (sm->sm_start <= mg->mg_bonus_area)
		weight *= (metaslab_smo_bonus_pct / 100);
	ASSERT(weight >= space &&
	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);

	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
		/*
		 * If this metaslab is one we're actively using, adjust its
		 * weight to make it preferable to any inactive metaslab so
		 * we'll polish it off.
		 */
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}
	return (weight);
}
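
/*
 * A worked example of the weight calculation above, using made-up numbers:
 * on a vdev with vdev_ms_count = 200, the metaslab at index 50
 * (sm_start >> vdev_ms_shift) with 100MB free gets
 * weight = 2 * 100MB - (50 * 100MB) / 200 = 175MB; index 0 would get the
 * full 2x and the last index roughly 1x. If the metaslab also falls at or
 * below mg_bonus_area it is further scaled by metaslab_smo_bonus_pct / 100
 * (an integer division, so the default of 150 scales by 1x and a value of
 * 200 would scale by 2x). Finally, the PRIMARY/SECONDARY activation bits
 * are OR-ed back in so an active, non-fragmented metaslab always sorts
 * ahead of any inactive one.
 */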

static void
metaslab_prefetch(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m;

	mutex_enter(&mg->mg_lock);

	/*
	 * Prefetch the next potential metaslabs
	 */
	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
		space_map_t *sm = msp->ms_map;
		space_map_obj_t *smo = &msp->ms_smo;

		/* If we have reached our prefetch limit then we're done */
		if (m >= metaslab_prefetch_limit)
			break;

		if (!sm->sm_loaded && smo->smo_object != 0) {
			mutex_exit(&mg->mg_lock);
			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
			    0ULL, smo->smo_objsize);
			mutex_enter(&mg->mg_lock);
		}
	}
	mutex_exit(&mg->mg_lock);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		space_map_load_wait(sm);
		if (!sm->sm_loaded) {
			space_map_obj_t *smo = &msp->ms_smo;

			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				space_map_walk(msp->ms_defermap[t],
				    space_map_claim, sm);

		}

		/*
		 * Track the bonus area as we activate new metaslabs.
		 */
		if (sm->sm_start > mg->mg_bonus_area) {
			mutex_enter(&mg->mg_lock);
			mg->mg_bonus_area = sm->sm_start;
			mutex_exit(&mg->mg_lock);
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again.  In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}

/*
 * Determine if the in-core space map representation can be condensed on-disk.
 * We would like to use the following criteria to make our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 * result of writing out our in-core free map.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 * times the size of the in-core representation (i.e. zfs_condense_pct = 110
 * and in-core = 1MB, minimal = 1.1MB).
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered AVL tree in the space map and calculate the
 * size required for the largest segment in our in-core free map. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 */
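
/*
 * Worked example of the two criteria above (made-up numbers): an in-core
 * free map with 1,000 segments has a minimal form of 1,000 * 8 bytes =
 * 8,000 bytes. With the default zfs_condense_pct of 200 we condense only
 * once the on-disk object (smo_objsize) has grown to at least
 * 2 * 8,000 = 16,000 bytes, and only if the largest free segment would not
 * by itself require more on-disk space than the current object already
 * occupies.
 */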

static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	space_seg_t *ss;
	uint64_t size, entries, segsz;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(sm->sm_loaded);

	/*
	 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
	 * the largest segment in the in-core free map. If the tree is
	 * empty then we should condense the map.
	 */
	ss = avl_last(sm->sm_pp_root);
	if (ss == NULL)
		return (B_TRUE);

	/*
	 * Calculate the number of 64-bit entries this segment would
	 * require when written to disk. If this single segment would be
	 * larger on-disk than the entire current on-disk structure, then
	 * clearly condensing will increase the on-disk structure size.
	 */
	size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
	entries = size / (MIN(size, SM_RUN_MAX));
	segsz = entries * sizeof (uint64_t);

	return (segsz <= smo->smo_objsize &&
	    smo->smo_objsize >= (zfs_condense_pct *
	    sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
}

/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed by
 * the in-core free map.
 */
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
	space_map_t condense_map;
	space_map_t *sm = msp->ms_map;
	objset_t *mos = spa_meta_objset(spa);
	space_map_obj_t *smo = &msp->ms_smo_syncing;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(sm->sm_loaded);

	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
	    "smo size %llu, segments %lu", txg,
	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
	    smo->smo_objsize, avl_numnodes(&sm->sm_root));

	/*
	 * Create a map that is a 100% allocated map. We remove segments
	 * that have been freed in this txg, any deferred frees that exist,
	 * and any allocation in the future. Removing segments should be
	 * a relatively inexpensive operation since we expect these maps to
	 * have a small number of nodes.
	 */
	space_map_create(&condense_map, sm->sm_start, sm->sm_size,
	    sm->sm_shift, sm->sm_lock);
	space_map_add(&condense_map, condense_map.sm_start,
	    condense_map.sm_size);

	/*
	 * Remove what's been freed in this txg from the condense_map.
	 * Since we're in sync_pass 1, we know that all the frees from
	 * this txg are in the freemap.
	 */
	space_map_walk(freemap, space_map_remove, &condense_map);

	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		space_map_walk(msp->ms_defermap[t],
		    space_map_remove, &condense_map);

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
		space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
		    space_map_remove, &condense_map);

	/*
	 * We're about to drop the metaslab's lock, thus allowing
	 * other consumers to change its content. Set the
	 * space_map's sm_condensing flag to ensure that
	 * allocations on this metaslab do not occur while we're
	 * in the middle of committing it to disk. This is only critical
	 * for the ms_map as all other space_maps use per txg
	 * views of their content.
	 */
	sm->sm_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	space_map_truncate(smo, mos, tx);
	mutex_enter(&msp->ms_lock);

	/*
	 * While we would ideally like to create a space_map representation
	 * that consists only of allocation records, doing so can be
	 * prohibitively expensive because the in-core free map can be
	 * large, and therefore computationally expensive to subtract
	 * from the condense_map. Instead we sync out two maps, a cheap
	 * allocation only map followed by the in-core free map. While not
	 * optimal, this is typically close to optimal, and much cheaper to
	 * compute.
	 */
	space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
	space_map_vacate(&condense_map, NULL, NULL);
	space_map_destroy(&condense_map);

	space_map_sync(sm, SM_FREE, smo, mos, tx);
	sm->sm_condensing = B_FALSE;

	spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
	    "smo size %llu", txg,
	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
	    smo->smo_objsize);
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
	space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	/*
	 * This metaslab has just been added so there's no work to do now.
	 */
	if (*freemap == NULL) {
		ASSERT3P(allocmap, ==, NULL);
		return;
	}

	ASSERT3P(allocmap, !=, NULL);
	ASSERT3P(*freemap, !=, NULL);
	ASSERT3P(*freed_map, !=, NULL);

	if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
	    metaslab_should_condense(msp)) {
		metaslab_condense(msp, txg, tx);
	} else {
		space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
		space_map_sync(*freemap, SM_FREE, smo, mos, tx);
	}

	space_map_vacate(allocmap, NULL, NULL);

	/*
	 * For sync pass 1, we avoid walking the entire space map and
	 * instead will just swap the pointers for freemap and
	 * freed_map. We can safely do this since the freed_map is
	 * guaranteed to be empty on the initial pass.
	 */
	if (spa_sync_pass(spa) == 1) {
		ASSERT0((*freed_map)->sm_space);
		ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
		space_map_swap(freemap, freed_map);
	} else {
		space_map_vacate(*freemap, space_map_add, *freed_map);
	}

	ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
	ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);

	mutex_exit(&msp->ms_lock);

	VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = msp->ms_map;
	space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps, freemaps, and defermap and add its capacity to the vdev.
	 */
	if (freed_map == NULL) {
		ASSERT(defer_map == NULL);
		for (int t = 0; t < TXG_SIZE; t++) {
			msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
			    KM_SLEEP);
			space_map_create(msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
			    KM_SLEEP);
			space_map_create(msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
			    KM_SLEEP);
			space_map_create(msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
		defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE];

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = freed_map->sm_space - defer_map->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add defer_map (oldest deferred frees) to this map and
	 * transfer freed_map (this txg's frees) to defer_map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);

	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
				evictable = 0;

		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}
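
/*
 * To make the defer cycle in metaslab_sync_done() concrete (assuming
 * TXG_DEFER_SIZE is 2, its usual value): a block freed in txg N sits in
 * that txg's freed_map, moves into ms_defermap[N % 2] when txg N finishes
 * syncing, and is only handed back to the allocatable ms_map when txg N + 2
 * completes, i.e. two fully synced txgs later. defer_delta is positive
 * while more space is entering the defer pipeline than leaving it, which is
 * why it is passed to vdev_space_update() as its own argument.
 */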

void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	int64_t failures = mg->mg_alloc_failures;

	/*
	 * Re-evaluate all metaslabs which have lower offsets than the
	 * bonus area.
	 */
	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_map->sm_start > mg->mg_bonus_area)
			break;

		mutex_enter(&msp->ms_lock);
		metaslab_group_sort(mg, msp, metaslab_weight(msp));
		mutex_exit(&msp->ms_lock);
	}

	atomic_add_64(&mg->mg_alloc_failures, -failures);

	/*
	 * Prefetch the next potential metaslabs
	 */
	metaslab_prefetch(mg);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map->sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}
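
/*
 * Illustration of metaslab_distance() with made-up numbers: with 1GB
 * metaslabs (vdev_ms_shift = 30), a previously written DVA at offset 5GB
 * on the same top-level vdev and a candidate metaslab starting at 9GB are
 * four metaslab-widths apart, so the result is 4GB; a DVA on a different
 * top-level vdev counts as maximally distant (1ULL << 63).
 * metaslab_group_alloc() below compares this against min_distance (or
 * 1.5x min_distance for metaslabs that have never had anything allocated)
 * when spreading ditto copies within a vdev.
 */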

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "failures %llu, weight %llu",
				    spa_name(spa), mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize,
				    mg->mg_alloc_failures, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}
			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		/*
		 * If we've already reached the allowable number of failed
		 * allocation attempts on this metaslab group then we
		 * consider skipping it. We skip it only if we're allowed
		 * to "fast" gang, the physical size is larger than
		 * a gang block, and we're attempting to allocate from
		 * the primary metaslab.
		 */
		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			spa_dbgmsg(spa, "%s: skipping metaslab group: "
			    "vdev %llu, txg %llu, mg %p, psize %llu, "
			    "asize %llu, failures %llu", spa_name(spa),
			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
			    mg->mg_alloc_failures);
			return (-1ULL);
		}

		mutex_enter(&msp->ms_lock);

		/*
		 * If this metaslab is currently condensing then pick again as
		 * we can't manipulate this metaslab until it's committed
		 * to disk.
		 */
		if (msp->ms_map->sm_condensing) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
			break;

		atomic_inc_64(&mg->mg_alloc_failures);

		metaslab_passivate(msp, space_map_maxsize(msp->ms_map));

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);

	mutex_exit(&msp->ms_lock);

	return (offset);
}

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 * unless the user instructs us that it is okay.
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3 &&
		    !(zfs_write_to_degraded && vd->vdev_state ==
		    VDEV_STATE_DEGRADED)) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}
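
/*
 * The DVA-to-metaslab lookup used above (and again in metaslab_claim_dva())
 * is a simple shift. With 1GB metaslabs (vdev_ms_shift = 30, an
 * illustrative value), a DVA offset of 10GB selects vd->vdev_ms[10]; any
 * offset whose index is >= vdev_ms_count is rejected as a bad DVA.
 */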

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error = 0;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
		error = ENOENT;

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}