/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size, it switches to a more aggressive
 * strategy (i.e., search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
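 *
 * For example, with the default metaslab_df_free_pct of 30, a metaslab
 * whose space map is more than 70% allocated is searched by segment size
 * (best-fit) rather than by offset (first-fit).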
 */
int metaslab_df_free_pct = 30;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
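	 * The AVL tree requires a total ordering with no ties, and no two
	 * metaslabs share a starting offset, so sm_start makes a suitable
	 * tie-breaker.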
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate.  This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
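	 * (The retry recurses at most once, because the second pass starts
	 * with *cursor == 0.)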
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static void
metaslab_ff_load(space_map_t *sm)
{
	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
	sm->sm_pp_root = NULL;
}

static void
metaslab_ff_unload(space_map_t *sm)
{
	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;
}

static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}

/* ARGSUSED */
static void
metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_ff_load,
	metaslab_ff_unload,
	metaslab_ff_alloc,
	metaslab_ff_claim,
	metaslab_ff_free,
	NULL	/* maxsize */
};

/*
 * Dynamic block allocator -
 * Uses the first-fit allocation scheme until space gets low, then
 * adjusts to a best-fit allocation method.  Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 */

uint64_t
metaslab_df_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}

static int
metaslab_df_seg_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

static void
metaslab_df_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_df_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}

static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
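	/*
	 * sm_ppd holds 64 cursors, one per power-of-two alignment class;
	 * highbit(align) - 1 picks the cursor for this allocation size, so
	 * requests in different alignment classes scan the map independently.
	 */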
	uint64_t max_size = metaslab_df_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space, switch to using the size-sorted
	 * AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

/* ARGSUSED */
static void
metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_df_load,
	metaslab_df_unload,
	metaslab_df_alloc,
	metaslab_df_claim,
	metaslab_df_free,
	metaslab_df_maxsize
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	space_map_create(&msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		/*
		 * The vdev is dirty, but the metaslab isn't -- it just needs
		 * to have metaslab_sync_done() invoked from vdev_sync_done().
		 * [We could just dirty the metaslab, but that would cause us
		 * to allocate a space map object for it, which is wasteful
		 * and would mess up the locality logic in metaslab_weight().]
		 */
		ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
	}

	return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	int t;

	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo.smo_alloc, B_TRUE);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(&msp->ms_map);
	space_map_destroy(&msp->ms_map);

	for (t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(&msp->ms_allocmap[t]);
		space_map_destroy(&msp->ms_freemap[t]);
	}

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
#define	METASLAB_SMO_BONUS_MULTIPLIER	2

static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1.  We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);

	/*
	 * For locality, assign higher weight to metaslabs we've used before.
	 */
	if (smo->smo_object != 0)
		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
	ASSERT(weight >= space &&
	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);

	/*
	 * If this metaslab is one we're actively using, adjust its weight to
	 * make it preferable to any inactive metaslab so we'll polish it off.
	 */
	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (weight);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
	space_map_t *sm = &msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
		if (error) {
			metaslab_group_sort(msp->ms_group, msp, 0);
			return (error);
		}

		/*
		 * If we were able to load the map then make sure
		 * that this map is still able to satisfy our request.
		 */
		if (msp->ms_weight < size)
			return (ENOSPC);

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again.  In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	int t;

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */
	mutex_enter(&msp->ms_lock);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		mutex_exit(&msp->ms_lock);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
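		 *
		 * (The on-disk map only ever grows between condenses, since
		 * space_map_sync() appends each txg's allocs and frees to it;
		 * rewriting it from the in-core map keeps it bounded.)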
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int t;

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}
		vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
	}

	vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add everything we freed in this txg to the map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);

	*smo = *smosync;

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		if (evictable)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map.sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
    uint64_t min_distance, dva_t *dva, int d)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < size) {
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}

			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request.  It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < size || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight, size) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
			break;

		metaslab_passivate(msp, size - 1);

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);

	return (offset);
}

/*
 * Allocate a block for the specified i/o.
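 *
 * The caller passes the I/O's psize, the DVA array and index d to fill in,
 * an optional hint DVA array (non-NULL when allocating gang block
 * constituents), the txg, and flags.  On success, the chosen vdev, offset,
 * asize, and gang bit are recorded in dva[d]; on failure, ENOSPC is
 * returned and dva[d] is zeroed.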
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_allocated because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
		if (flags & METASLAB_HINTBP_AVOID)
			mg = vd->vdev_mg->mg_next;
		else
			mg = vd->vdev_mg;
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong class, just follow the rotor.
	 */
	if (mg->mg_class != mc)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
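			 *
			 * For example (illustrative numbers): if this vdev is
			 * 50% full (vu = 512) while the pool is 25% full
			 * (su = 256), the bias computed below is
			 * (256 - 512) * mg_aliquot / 4096 = -mg_aliquot / 16,
			 * so this vdev receives about 6% less than its normal
			 * share until usage evens out.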
			 */
			if (mc->mc_allocated == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				uint64_t alloc, space;
				int64_t vu, su;

				alloc = spa_get_alloc(spa);
				space = spa_get_space(spa);

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				su = (alloc << 10) / (space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
				 */
				mg->mg_bias = ((su - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);
			}

			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_allocated = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_allocated = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
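 * metaslab_claim_dva() below re-marks each such block as allocated in its
 * metaslab so that the same range will not be handed out again.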
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(&msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	bp->blk_birth = txg;

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}