/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(void)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_rotor = NULL;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
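	 * (Many metaslabs share the same weight, and the AVL tree backing
	 * each metaslab group needs every node to compare distinct, so we
	 * break ties on sm_start, which is unique per metaslab on a vdev.)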
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = 2ULL << 20;	/* XXX -- tweak me */
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
void
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;
	int fm;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);

	msp->ms_smo = smo;

	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
	    &msp->ms_lock);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_create(&msp->ms_allocmap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
		space_map_create(&msp->ms_freemap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * We enforce this by assigning an initial weight of 0 to new space.
	 *
	 * (Transactional allocations for this txg would actually be OK;
	 * it's intent log allocations that cause trouble.  If we wrote
	 * a log block in this txg and lost power, the log replay would be
	 * based on the DVA translations that had been synced in txg - 1.
	 * Those translations would not include this metaslab's vdev.)
	 */
	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);

	if (txg == 0) {
		/*
		 * We're opening the pool.  Make the metaslab's
		 * free space available immediately.
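		 * (vdev_space_update() credits the vdev with this metaslab's
		 * size and current allocation, and metaslab_sync_done() with
		 * txg 0 derives ms_usable_space from the space map object
		 * and sorts the metaslab into its group by that weight.)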
		 */
		vdev_space_update(vd, size, smo->smo_alloc);
		metaslab_sync_done(msp, 0);
	} else {
		/*
		 * We're adding a new metaslab to an already-open pool.
		 * Declare all of the metaslab's space to be free.
		 *
		 * Note that older transaction groups cannot allocate
		 * from this metaslab until its existence is committed,
		 * because we set ms_last_alloc to the current txg.
		 */
		smo->smo_alloc = 0;
		msp->ms_usable_space = size;
		mutex_enter(&msp->ms_lock);
		space_map_add(&msp->ms_map, start, size);
		msp->ms_map_incore = 1;
		mutex_exit(&msp->ms_lock);

		/* XXX -- we'll need a call to picker_init here */
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ADD, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	*mspp = msp;
}

void
metaslab_fini(metaslab_t *msp)
{
	int fm;
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo->smo_alloc);

	metaslab_group_remove(mg, msp);

	/* XXX -- we'll need a call to picker_fini here */

	mutex_enter(&msp->ms_lock);

	space_map_vacate(&msp->ms_map, NULL, NULL);
	msp->ms_map_incore = 0;
	space_map_destroy(&msp->ms_map);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_destroy(&msp->ms_allocmap[fm]);
		space_map_destroy(&msp->ms_freemap[fm]);
	}

	mutex_exit(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *os = spa->spa_meta_objset;
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	uint64_t alloc_delta;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);

	mutex_enter(&msp->ms_lock);

	if (*dirty & MSD_ADD)
		vdev_space_update(vd, msp->ms_map.sm_size, 0);

	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		if (smo->smo_object == 0) {
			ASSERT(smo->smo_objsize == 0);
			ASSERT(smo->smo_alloc == 0);
			smo->smo_object = dmu_object_alloc(os,
			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
			ASSERT(smo->smo_object != 0);
			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
			    sizeof (uint64_t), &smo->smo_object, tx);
		}

		alloc_delta = allocmap->sm_space - freemap->sm_space;
		vdev_space_update(vd, 0, alloc_delta);
		smo->smo_alloc += alloc_delta;

		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
		    (*dirty & MSD_CONDENSE) == 0) {
			space_map_t *sm = &msp->ms_map;
			space_map_t *tsm;
			int i;

			ASSERT(msp->ms_map_incore);

			space_map_merge(freemap, freed_map);
			space_map_vacate(allocmap, NULL, NULL);

			/*
			 * Write out the current state of the allocation
			 * world.  The current metaslab is full, minus
			 * stuff that's been freed this txg (freed_map),
			 * minus allocations from txgs in the future.
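			 * In other words, the map written below is
			 *
			 *	[start, start + size)
			 *	  - allocmap[txg+1 .. txg+TXG_CONCURRENT_STATES-1]
			 *	  - freed_map
			 *
			 * i.e. one condensed picture of the metaslab's free
			 * space, rather than another round of appended
			 * alloc/free deltas (the else clause below).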
			 */
			space_map_add(sm, sm->sm_start, sm->sm_size);
			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
				space_map_iterate(tsm, space_map_remove, sm);
			}
			space_map_iterate(freed_map, space_map_remove, sm);

			space_map_write(sm, smo, os, tx);

			ASSERT(sm->sm_space == 0);
			ASSERT(freemap->sm_space == 0);
			ASSERT(allocmap->sm_space == 0);

			*dirty |= MSD_CONDENSE;
		} else {
			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
			space_map_sync(freemap, freed_map, smo, SM_FREE,
			    os, tx);
		}

		VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);
		ASSERT3U(db->db_size, ==, sizeof (*smo));
		bcopy(smo, db->db_data, db->db_size);
		dmu_buf_rele(db, FTAG);

		dmu_tx_commit(tx);
	}

	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);

	mutex_exit(&msp->ms_lock);

	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	uint64_t weight;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;

	dprintf("%s offset %llx txg %llu\n",
	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);

	mutex_enter(&msp->ms_lock);

	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);

	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
	msp->ms_usable_end = smo->smo_objsize;

	weight = msp->ms_usable_space;

	if (txg != 0) {
		space_map_t *freed_map =
		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];

		/* XXX -- we'll need a call to picker_fini here */

		/* If we're empty, don't bother sticking around */
		if (msp->ms_usable_space == 0) {
			space_map_vacate(&msp->ms_map, NULL, NULL);
			msp->ms_map_incore = 0;
			ASSERT3U(freed_map->sm_space, ==, 0);
			weight = 0;
		} else {
			/* Add the freed blocks to the available space map */
			if (msp->ms_map_incore)
				space_map_merge(freed_map, &msp->ms_map);
			else
				space_map_vacate(freed_map, NULL, NULL);
			weight += msp->ms_map.sm_size;
		}

		if (msp->ms_last_alloc == txg)
			/* Safe to use for allocation now */
			msp->ms_last_alloc = 0;

		*dirty = 0;
	}

	mutex_exit(&msp->ms_lock);

	metaslab_group_sort(msp->ms_group, msp, weight);
}

/*
 * The first-fit block picker.  No picker_init or picker_fini,
 * this is just an experiment to see how it feels to separate out
 * the block selection policy from the map updates.
 * Note: the 'cursor' argument is a form of PPD.
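 *
 * (The cursor makes this behave like next-fit within a metaslab: the caller
 * keeps one cursor per power-of-two alignment class, the search starts at the
 * cursor, advances it past each successful allocation, and wraps back to
 * offset 0 exactly once if nothing suitable is found after the cursor.)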
 */
static uint64_t
metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	space_seg_t *ss, ssearch;
	avl_index_t where;
	int tried_once = 0;

again:
	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/* If we couldn't find a block after cursor, search again */
	if (tried_once == 0) {
		tried_once = 1;
		*cursor = 0;
		goto again;
	}

	return (-1ULL);
}

static uint64_t
metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	vdev_t *vd = msp->ms_group->mg_vd;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_map_incore);
	ASSERT(sm->sm_space != 0);
	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);

	offset = metaslab_pick_block(sm, size,
	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
	if (offset != -1ULL) {
		space_map_remove(sm, offset, size);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}
	return (offset);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
int
metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	objset_t *os = spa->spa_meta_objset;
	vdev_t *vd;
	metaslab_t *msp;
	space_map_t *sm;
	space_map_obj_t *smo;
	int error;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
		return (ENXIO);

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	sm = &msp->ms_map;
	smo = msp->ms_smo;

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (msp->ms_map_incore == 0) {
		error = space_map_load(sm, smo, SM_FREE, os,
		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
		ASSERT(error == 0);
		if (error) {
			mutex_exit(&msp->ms_lock);
			return (error);
		}
		msp->ms_map_incore = 1;
		/* XXX -- we'll need a call to picker_init here */
		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
	}

	space_map_remove(sm, offset, size);
	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ALLOC, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

static int
metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	/*
	 * Enforce segregation across transaction groups.
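	 * A metaslab whose ms_last_alloc is neither 0 nor this txg is
	 * skipped entirely; one already in use by this txg must satisfy
	 * the request from the space map it has loaded in core.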
	 */
	/* XXX -- We should probably not assume we know what ms_weight means */
	if (msp->ms_last_alloc == txg)
		return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);

	if (msp->ms_last_alloc != 0)
		return (0);

	if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
		return (1);

	/* XXX -- the weight test should be in terms of MINFREE */
	return (msp->ms_usable_space >= size && msp->ms_weight >= size);
}

static metaslab_t *
metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
{
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;

	mutex_enter(&mg->mg_lock);
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
		if (metaslab_usable(msp, size, txg))
			break;
	mutex_exit(&mg->mg_lock);

	return (msp);
}

static metaslab_t *
metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
    uint64_t *offp, uint64_t txg)
{
	metaslab_t *msp;
	int error;

	while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
		space_map_obj_t *smo = msp->ms_smo;
		mutex_enter(&msp->ms_lock);
		if (!metaslab_usable(msp, size, txg)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		if (msp->ms_map_incore == 0) {
			error = space_map_load(&msp->ms_map, smo, SM_FREE,
			    spa->spa_meta_objset, msp->ms_usable_end,
			    msp->ms_map.sm_size - msp->ms_usable_space);
			ASSERT(error == 0);
			if (error) {
				mutex_exit(&msp->ms_lock);
				metaslab_group_sort(mg, msp, 0);
				continue;
			}
			msp->ms_map_incore = 1;
			/* XXX -- we'll need a call to picker_init here */
			bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
		}
		*offp = metaslab_getblock(msp, size, txg);
		if (*offp != -1ULL) {
			if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
				vdev_t *vd = mg->mg_vd;
				msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
				msp->ms_last_alloc = txg;
				vdev_dirty(vd, VDD_ALLOC, txg);
				(void) txg_list_add(&vd->vdev_ms_list,
				    msp, txg);
			}
			mutex_exit(&msp->ms_lock);
			return (msp);
		}
		mutex_exit(&msp->ms_lock);
		metaslab_group_sort(msp->ms_group, msp, size - 1);
	}

	return (NULL);
}

/*
 * Allocate a block for the specified i/o.
 */
int
metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	metaslab_group_t *mg, *rotor;
	metaslab_class_t *mc;
	vdev_t *vd;
	uint64_t offset = -1ULL;
	uint64_t asize;

	mc = spa_metaslab_class_select(spa);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_allocated because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 */
	mg = rotor = mc->mc_rotor;
	do {
		vd = mg->mg_vd;
		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
		if (msp != NULL) {
			ASSERT(offset != -1ULL);

			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
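			 *
			 * For example, if this vdev is 30% full while the
			 * pool as a whole is 50% full, then vu is about 307,
			 * su is about 512, and the bias below works out to
			 * (512 - 307) * mg_aliquot / 4096, i.e. roughly +5%
			 * of the aliquot, so this under-used vdev soaks up
			 * a little more than its usual share before the
			 * rotor advances.  The divisor of 4 * 1024 caps the
			 * bias at +/- 25% of the aliquot even when the
			 * usage gap spans the full 0..1024 range.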
			 */
			if (mc->mc_allocated == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				uint64_t alloc, space;
				int64_t vu, su;

				alloc = spa_get_alloc(spa);
				space = spa_get_space(spa);

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				su = (alloc << 10) / (space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
				 */
				mg->mg_bias = ((su - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);

				dprintf("bias = %lld\n", mg->mg_bias);
			}

			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_allocated = 0;
			}

			DVA_SET_VDEV(dva, vd->vdev_id);
			DVA_SET_OFFSET(dva, offset);
			DVA_SET_GANG(dva, 0);
			DVA_SET_ASIZE(dva, asize);

			return (0);
		}
		mc->mc_rotor = mg->mg_next;
		mc->mc_allocated = 0;
	} while ((mg = mg->mg_next) != rotor);

	dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);

	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, 0);
	DVA_SET_GANG(dva, 0);

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
void
metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
		    (u_longlong_t)vdev);
		ASSERT(0);
		return;
	}

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
		    (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
		vdev_dirty(vd, VDD_FREE, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);
}