/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(void)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_rotor = NULL;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
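	 * (The AVL tree requires a strict total order, so two distinct
	 * metaslabs must never compare as equal.)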
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = 2ULL << 20;		/* XXX -- tweak me */
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
void
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
	uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;
	int fm;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);

	msp->ms_smo = smo;

	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
	    &msp->ms_lock);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_create(&msp->ms_allocmap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
		space_map_create(&msp->ms_freemap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * We enforce this by assigning an initial weight of 0 to new space.
	 *
	 * (Transactional allocations for this txg would actually be OK;
	 * it's intent log allocations that cause trouble.  If we wrote
	 * a log block in this txg and lost power, the log replay would be
	 * based on the DVA translations that had been synced in txg - 1.
	 * Those translations would not include this metaslab's vdev.)
	 */
	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);

	if (txg == 0) {
		/*
		 * We're opening the pool.  Make the metaslab's
		 * free space available immediately.
		 */
		vdev_space_update(vd, size, smo->smo_alloc);
		metaslab_sync_done(msp, 0);
	} else {
		/*
		 * We're adding a new metaslab to an already-open pool.
		 * Declare all of the metaslab's space to be free.
		 *
		 * Note that older transaction groups cannot allocate
		 * from this metaslab until its existence is committed,
		 * because we set ms_last_alloc to the current txg.
		 */
		smo->smo_alloc = 0;
		msp->ms_usable_space = size;
		mutex_enter(&msp->ms_lock);
		space_map_add(&msp->ms_map, start, size);
		msp->ms_map_incore = 1;
		mutex_exit(&msp->ms_lock);

		/* XXX -- we'll need a call to picker_init here */
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ADD, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	*mspp = msp;
}

void
metaslab_fini(metaslab_t *msp)
{
	int fm;
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo->smo_alloc);

	metaslab_group_remove(mg, msp);

	/* XXX -- we'll need a call to picker_fini here */

	mutex_enter(&msp->ms_lock);

	space_map_vacate(&msp->ms_map, NULL, NULL);
	msp->ms_map_incore = 0;
	space_map_destroy(&msp->ms_map);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_destroy(&msp->ms_allocmap[fm]);
		space_map_destroy(&msp->ms_freemap[fm]);
	}

	mutex_exit(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *os = spa->spa_meta_objset;
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	uint64_t alloc_delta;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);

	mutex_enter(&msp->ms_lock);

	if (*dirty & MSD_ADD)
		vdev_space_update(vd, msp->ms_map.sm_size, 0);

	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		if (smo->smo_object == 0) {
			ASSERT(smo->smo_objsize == 0);
			ASSERT(smo->smo_alloc == 0);
			smo->smo_object = dmu_object_alloc(os,
			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
			ASSERT(smo->smo_object != 0);
			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
			    sizeof (uint64_t), &smo->smo_object, tx);
		}

		alloc_delta = allocmap->sm_space - freemap->sm_space;
		vdev_space_update(vd, 0, alloc_delta);
		smo->smo_alloc += alloc_delta;

		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
		    (*dirty & MSD_CONDENSE) == 0) {
			space_map_t *sm = &msp->ms_map;
			space_map_t *tsm;
			int i;

			ASSERT(msp->ms_map_incore);

			space_map_merge(freemap, freed_map);
			space_map_vacate(allocmap, NULL, NULL);

			/*
			 * Write out the current state of the allocation
			 * world.  The current metaslab is full, minus
			 * stuff that's been freed this txg (freed_map),
			 * minus allocations from txgs in the future.
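			 *
			 * In other words, the on-disk free map becomes
			 * the entire metaslab range, minus the allocmaps
			 * of all other open txgs, minus freed_map --
			 * which is exactly what the loop below computes.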
			 */
			space_map_add(sm, sm->sm_start, sm->sm_size);
			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
				space_map_iterate(tsm, space_map_remove, sm);
			}
			space_map_iterate(freed_map, space_map_remove, sm);

			space_map_write(sm, smo, os, tx);

			ASSERT(sm->sm_space == 0);
			ASSERT(freemap->sm_space == 0);
			ASSERT(allocmap->sm_space == 0);

			*dirty |= MSD_CONDENSE;
		} else {
			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
			space_map_sync(freemap, freed_map, smo, SM_FREE,
			    os, tx);
		}

		db = dmu_bonus_hold(os, smo->smo_object);
		dmu_buf_will_dirty(db, tx);
		ASSERT3U(db->db_size, ==, sizeof (*smo));
		bcopy(smo, db->db_data, db->db_size);
		dmu_buf_rele(db);

		dmu_tx_commit(tx);
	}

	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);

	mutex_exit(&msp->ms_lock);

	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	uint64_t weight;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;

	dprintf("%s offset %llx txg %llu\n",
	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);

	mutex_enter(&msp->ms_lock);

	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);

	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
	msp->ms_usable_end = smo->smo_objsize;

	weight = msp->ms_usable_space;

	if (txg != 0) {
		space_map_t *freed_map =
		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];

		/* XXX -- we'll need a call to picker_fini here */

		/* If we're empty, don't bother sticking around */
		if (msp->ms_usable_space == 0) {
			space_map_vacate(&msp->ms_map, NULL, NULL);
			msp->ms_map_incore = 0;
			ASSERT3U(freed_map->sm_space, ==, 0);
			weight = 0;
		} else {
			/* Add the freed blocks to the available space map */
			if (msp->ms_map_incore)
				space_map_merge(freed_map, &msp->ms_map);
			else
				space_map_vacate(freed_map, NULL, NULL);
			weight += msp->ms_map.sm_size;
		}

		if (msp->ms_last_alloc == txg)
			/* Safe to use for allocation now */
			msp->ms_last_alloc = 0;

		*dirty = 0;
	}

	mutex_exit(&msp->ms_lock);

	metaslab_group_sort(msp->ms_group, msp, weight);
}

/*
 * The first-fit block picker.  No picker_init or picker_fini,
 * this is just an experiment to see how it feels to separate out
 * the block selection policy from the map updates.
 * Note: the 'cursor' argument is a form of PPD.
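 *
 * The caller passes one cursor per power-of-two alignment class;
 * each cursor remembers where the previous allocation of that size
 * class left off, so consecutive allocations tend to be laid out
 * contiguously.  If nothing fits beyond the cursor, the search
 * wraps back to the start of the map exactly once.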
 */
static uint64_t
metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	space_seg_t *ss, ssearch;
	avl_index_t where;
	int tried_once = 0;

again:
	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/* If we couldn't find a block after cursor, search again */
	if (tried_once == 0) {
		tried_once = 1;
		*cursor = 0;
		goto again;
	}

	return (-1ULL);
}

static uint64_t
metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	vdev_t *vd = msp->ms_group->mg_vd;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_map_incore);
	ASSERT(sm->sm_space != 0);
	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);

	offset = metaslab_pick_block(sm, size,
	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
	if (offset != -1ULL) {
		space_map_remove(sm, offset, size);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}
	return (offset);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
int
metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	objset_t *os = spa->spa_meta_objset;
	vdev_t *vd;
	metaslab_t *msp;
	space_map_t *sm;
	space_map_obj_t *smo;
	int error;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
		return (ENXIO);

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	sm = &msp->ms_map;
	smo = msp->ms_smo;

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (msp->ms_map_incore == 0) {
		error = space_map_load(sm, smo, SM_FREE, os,
		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
		ASSERT(error == 0);
		if (error) {
			mutex_exit(&msp->ms_lock);
			return (error);
		}
		msp->ms_map_incore = 1;
		/* XXX -- we'll need a call to picker_init here */
		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
	}

	space_map_remove(sm, offset, size);
	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ALLOC, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

static int
metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	/*
	 * Enforce segregation across transaction groups.
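	 *
	 * A metaslab claimed by one open transaction group may not be
	 * used by another until that group syncs; metaslab_sync_done()
	 * resets ms_last_alloc to 0 once the metaslab is safe to reuse.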
	 */
	/* XXX -- We should probably not assume we know what ms_weight means */
	if (msp->ms_last_alloc == txg)
		return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);

	if (msp->ms_last_alloc != 0)
		return (0);

	if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
		return (1);

	/* XXX -- the weight test should be in terms of MINFREE */
	return (msp->ms_usable_space >= size && msp->ms_weight >= size);
}

static metaslab_t *
metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
{
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;

	mutex_enter(&mg->mg_lock);
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
		if (metaslab_usable(msp, size, txg))
			break;
	mutex_exit(&mg->mg_lock);

	return (msp);
}

static metaslab_t *
metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
    uint64_t *offp, uint64_t txg)
{
	metaslab_t *msp;
	int error;

	while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
		space_map_obj_t *smo = msp->ms_smo;
		mutex_enter(&msp->ms_lock);
		if (!metaslab_usable(msp, size, txg)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		if (msp->ms_map_incore == 0) {
			error = space_map_load(&msp->ms_map, smo, SM_FREE,
			    spa->spa_meta_objset, msp->ms_usable_end,
			    msp->ms_map.sm_size - msp->ms_usable_space);
			ASSERT(error == 0);
			if (error) {
				mutex_exit(&msp->ms_lock);
				metaslab_group_sort(mg, msp, 0);
				continue;
			}
			msp->ms_map_incore = 1;
			/* XXX -- we'll need a call to picker_init here */
			bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
		}
		*offp = metaslab_getblock(msp, size, txg);
		if (*offp != -1ULL) {
			if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
				vdev_t *vd = mg->mg_vd;
				msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
				msp->ms_last_alloc = txg;
				vdev_dirty(vd, VDD_ALLOC, txg);
				(void) txg_list_add(&vd->vdev_ms_list,
				    msp, txg);
			}
			mutex_exit(&msp->ms_lock);
			return (msp);
		}
		mutex_exit(&msp->ms_lock);
		metaslab_group_sort(msp->ms_group, msp, size - 1);
	}

	return (NULL);
}

/*
 * Allocate a block for the specified i/o.
 */
int
metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	metaslab_group_t *mg, *rotor;
	metaslab_class_t *mc;
	vdev_t *vd;
	uint64_t offset = -1ULL;
	uint64_t asize;

	mc = spa_metaslab_class_select(spa);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_allocated because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 */
	mg = rotor = mc->mc_rotor;
	do {
		vd = mg->mg_vd;
		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
		if (msp != NULL) {
			ASSERT(offset != -1ULL);

			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
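			 *
			 * For example, if this vdev were 75% full while
			 * the pool overall were only 25% full (vu ~= 768,
			 * su ~= 256), the bias below would come out to
			 * (256 - 768) * mg_aliquot / 4096, i.e. -12.5% of
			 * the aliquot, so we'd advance the rotor a little
			 * sooner than usual for this group.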
			 */
			if (mc->mc_allocated == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				uint64_t alloc, space;
				int64_t vu, su;

				alloc = spa_get_alloc(spa);
				space = spa_get_space(spa);

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				su = (alloc << 10) / (space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
				 */
				mg->mg_bias = ((su - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);

				dprintf("bias = %lld\n", mg->mg_bias);
			}

			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_allocated = 0;
			}

			DVA_SET_VDEV(dva, vd->vdev_id);
			DVA_SET_OFFSET(dva, offset);
			DVA_SET_GANG(dva, 0);
			DVA_SET_ASIZE(dva, asize);

			return (0);
		}
		mc->mc_rotor = mg->mg_next;
		mc->mc_allocated = 0;
	} while ((mg = mg->mg_next) != rotor);

	dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);

	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, 0);
	DVA_SET_GANG(dva, 0);

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
void
metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
		    (u_longlong_t)vdev);
		ASSERT(0);
		return;
	}

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
		    (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
		vdev_dirty(vd, VDD_FREE, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);
}