1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu_objset.h> 31 #include <sys/dmu_traverse.h> 32 #include <sys/dsl_dataset.h> 33 #include <sys/dsl_dir.h> 34 #include <sys/dsl_pool.h> 35 #include <sys/dnode.h> 36 #include <sys/spa.h> 37 #include <sys/zio.h> 38 #include <sys/dmu_impl.h> 39 40 #define BP_SPAN_SHIFT(level, width) ((level) * (width)) 41 42 #define BP_EQUAL(b1, b2) \ 43 (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ 44 (b1)->blk_birth == (b2)->blk_birth) 45 46 /* 47 * Compare two bookmarks. 48 * 49 * For ADVANCE_PRE, the visitation order is: 50 * 51 * objset 0, 1, 2, ..., ZB_MAXOBJSET. 52 * object 0, 1, 2, ..., ZB_MAXOBJECT. 53 * blkoff 0, 1, 2, ... 54 * level ZB_MAXLEVEL, ..., 2, 1, 0. 55 * 56 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid 57 * ordering vector is: 58 * 59 * < objset, object, blkoff, -level > 60 * 61 * For ADVANCE_POST, the starting offsets aren't sequential but ending 62 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. 63 * The visitation order is: 64 * 65 * objset 1, 2, ..., ZB_MAXOBJSET, 0. 66 * object 1, 2, ..., ZB_MAXOBJECT, 0. 67 * blkoff 1, 2, ... 68 * level 0, 1, 2, ..., ZB_MAXLEVEL. 69 * 70 * and thus a valid ordering vector is: 71 * 72 * < objset - 1, object - 1, blkoff, level > 73 * 74 * Both orderings can be expressed as: 75 * 76 * < objset + bias, object + bias, blkoff, level ^ bias > 77 * 78 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) 79 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). 80 * 81 * Special case: an objset's osphys is represented as level -1 of object 0. 82 * It is always either the very first or very last block we visit in an objset. 83 * Therefore, if either bookmark's level is -1, level alone determines order. 84 */ 85 static int 86 compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, 87 int advance) 88 { 89 int bias = (advance & ADVANCE_PRE) ? 0 : -1; 90 uint64_t sblkoff, eblkoff; 91 int slevel, elevel, wshift; 92 93 if (szb->zb_objset + bias < ezb->zb_objset + bias) 94 return (-1); 95 96 if (szb->zb_objset + bias > ezb->zb_objset + bias) 97 return (1); 98 99 slevel = szb->zb_level; 100 elevel = ezb->zb_level; 101 102 if ((slevel | elevel) < 0) 103 return ((slevel ^ bias) - (elevel ^ bias)); 104 105 if (szb->zb_object + bias < ezb->zb_object + bias) 106 return (-1); 107 108 if (szb->zb_object + bias > ezb->zb_object + bias) 109 return (1); 110 111 if (dnp == NULL) 112 return (0); 113 114 wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; 115 116 sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); 117 eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); 118 119 if (sblkoff < eblkoff) 120 return (-1); 121 122 if (sblkoff > eblkoff) 123 return (1); 124 125 return ((elevel ^ bias) - (slevel ^ bias)); 126 } 127 128 #define SET_BOOKMARK(zb, objset, object, level, blkid) \ 129 { \ 130 (zb)->zb_objset = objset; \ 131 (zb)->zb_object = object; \ 132 (zb)->zb_level = level; \ 133 (zb)->zb_blkid = blkid; \ 134 } 135 136 #define SET_BOOKMARK_LB(zb, level, blkid) \ 137 { \ 138 (zb)->zb_level = level; \ 139 (zb)->zb_blkid = blkid; \ 140 } 141 142 static int 143 advance_objset(zseg_t *zseg, uint64_t objset, int advance) 144 { 145 zbookmark_t *zb = &zseg->seg_start; 146 147 if (advance & ADVANCE_PRE) { 148 if (objset >= ZB_MAXOBJSET) 149 return (ERANGE); 150 SET_BOOKMARK(zb, objset, 0, -1, 0); 151 } else { 152 if (objset >= ZB_MAXOBJSET) 153 objset = 0; 154 SET_BOOKMARK(zb, objset, 1, 0, 0); 155 } 156 157 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) 158 return (ERANGE); 159 160 return (EAGAIN); 161 } 162 163 static int 164 advance_object(zseg_t *zseg, uint64_t object, int advance) 165 { 166 zbookmark_t *zb = &zseg->seg_start; 167 168 if (advance & ADVANCE_PRE) { 169 if (object >= ZB_MAXOBJECT) { 170 SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); 171 } else { 172 SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); 173 } 174 } else { 175 if (zb->zb_object == 0) { 176 SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); 177 } else { 178 if (object >= ZB_MAXOBJECT) 179 object = 0; 180 SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); 181 } 182 } 183 184 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) 185 return (ERANGE); 186 187 return (EAGAIN); 188 } 189 190 static int 191 advance_from_osphys(zseg_t *zseg, int advance) 192 { 193 zbookmark_t *zb = &zseg->seg_start; 194 195 ASSERT(zb->zb_object == 0); 196 ASSERT(zb->zb_level == -1); 197 ASSERT(zb->zb_blkid == 0); 198 199 if (advance & ADVANCE_PRE) { 200 SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); 201 } else { 202 if (zb->zb_objset == 0) 203 return (ERANGE); 204 SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); 205 } 206 207 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) 208 return (ERANGE); 209 210 return (EAGAIN); 211 } 212 213 static int 214 advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) 215 { 216 zbookmark_t *zb = &zseg->seg_start; 217 int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; 218 int maxlevel = dnp->dn_nlevels - 1; 219 int level = zb->zb_level; 220 uint64_t blkid = zb->zb_blkid; 221 222 if (advance & ADVANCE_PRE) { 223 if (level > 0 && rc == 0) { 224 level--; 225 blkid <<= wshift; 226 } else { 227 blkid++; 228 229 if ((blkid << BP_SPAN_SHIFT(level, wshift)) > 230 dnp->dn_maxblkid) 231 return (ERANGE); 232 233 while (level < maxlevel) { 234 if (P2PHASE(blkid, 1ULL << wshift)) 235 break; 236 blkid >>= wshift; 237 level++; 238 } 239 } 240 } else { 241 if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { 242 blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); 243 level = 0; 244 } else { 245 blkid >>= wshift; 246 level++; 247 } 248 249 while ((blkid << BP_SPAN_SHIFT(level, wshift)) > 250 dnp->dn_maxblkid) { 251 if (level == maxlevel) 252 return (ERANGE); 253 blkid >>= wshift; 254 level++; 255 } 256 } 257 SET_BOOKMARK_LB(zb, level, blkid); 258 259 if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) 260 return (ERANGE); 261 262 return (EAGAIN); 263 } 264 265 static int 266 traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) 267 { 268 /* 269 * Before we issue the callback, prune against maxtxg. 270 * 271 * We prune against mintxg before we get here because it's a big win. 272 * If a given block was born in txg 37, then we know that the entire 273 * subtree below that block must have been born in txg 37 or earlier. 274 * We can therefore lop off huge branches of the tree as we go. 275 * 276 * There's no corresponding optimization for maxtxg because knowing 277 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's 278 * children. In fact, the copy-on-write design of ZFS ensures that 279 * top-level blocks will pretty much always be new. 280 * 281 * Therefore, in the name of simplicity we don't prune against 282 * maxtxg until the last possible moment -- that being right now. 283 */ 284 if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) 285 return (0); 286 287 if (bc->bc_errno == 0) { 288 zbookmark_t *zb = &bc->bc_bookmark; 289 zbookmark_t *szb = &zseg->seg_start; 290 zbookmark_t *ezb = &zseg->seg_end; 291 zbookmark_t *lzb = &th->th_lastcb; 292 dnode_phys_t *dnp = bc->bc_dnode; 293 294 /* 295 * Debugging: verify that the order we visit things 296 * agrees with the order defined by compare_bookmark(). 297 */ 298 ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); 299 ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); 300 ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || 301 lzb->zb_level == ZB_NO_LEVEL); 302 *lzb = *zb; 303 } 304 305 th->th_callbacks++; 306 return (th->th_func(bc, th->th_spa, th->th_arg)); 307 } 308 309 static int 310 traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, 311 dnode_phys_t *dnp) 312 { 313 zbookmark_t *zb = &bc->bc_bookmark; 314 int error; 315 316 th->th_hits++; 317 318 bc->bc_dnode = dnp; 319 bc->bc_errno = 0; 320 321 if (BP_EQUAL(&bc->bc_blkptr, bp)) 322 return (0); 323 324 bc->bc_blkptr = *bp; 325 326 if (bc->bc_data == NULL) 327 return (0); 328 329 if (BP_IS_HOLE(bp)) { 330 ASSERT(th->th_advance & ADVANCE_HOLES); 331 return (0); 332 } 333 334 if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { 335 error = EIO; 336 } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { 337 error = 0; 338 th->th_arc_hits++; 339 } else { 340 error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, 341 BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, 342 th->th_zio_flags | ZIO_FLAG_DONT_CACHE)); 343 344 if (BP_SHOULD_BYTESWAP(bp) && error == 0) 345 (zb->zb_level > 0 ? byteswap_uint64_array : 346 dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, 347 BP_GET_LSIZE(bp)); 348 th->th_reads++; 349 } 350 351 if (error) { 352 bc->bc_errno = error; 353 error = traverse_callback(th, NULL, bc); 354 ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); 355 bc->bc_blkptr.blk_birth = -1ULL; 356 } 357 358 dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", 359 bc - &th->th_cache[0][0], error, 360 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); 361 362 return (error); 363 } 364 365 static int 366 find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) 367 { 368 zbookmark_t *zb = &zseg->seg_start; 369 traverse_blk_cache_t *bc; 370 blkptr_t *bp = dnp->dn_blkptr; 371 int i, first, level; 372 int nbp = dnp->dn_nblkptr; 373 int minlevel = zb->zb_level; 374 int maxlevel = dnp->dn_nlevels - 1; 375 int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; 376 int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); 377 uint64_t blkid = zb->zb_blkid >> bp_shift; 378 int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; 379 int rc; 380 381 if (minlevel > maxlevel || blkid >= nbp) 382 return (ERANGE); 383 384 for (level = maxlevel; level >= minlevel; level--) { 385 first = P2PHASE(blkid, 1ULL << wshift); 386 387 for (i = first; i < nbp; i++) 388 if (bp[i].blk_birth > zseg->seg_mintxg || 389 BP_IS_HOLE(&bp[i]) && do_holes) 390 break; 391 392 if (i != first) { 393 i--; 394 SET_BOOKMARK_LB(zb, level, blkid + (i - first)); 395 return (ENOTBLK); 396 } 397 398 bc = &th->th_cache[depth][level]; 399 400 SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, 401 level, blkid); 402 403 if (rc = traverse_read(th, bc, bp + i, dnp)) { 404 if (rc != EAGAIN) { 405 SET_BOOKMARK_LB(zb, level, blkid); 406 } 407 return (rc); 408 } 409 410 if (BP_IS_HOLE(&bp[i])) { 411 SET_BOOKMARK_LB(zb, level, blkid); 412 th->th_lastcb.zb_level = ZB_NO_LEVEL; 413 return (0); 414 } 415 416 nbp = 1 << wshift; 417 bp = bc->bc_data; 418 bp_shift -= wshift; 419 blkid = zb->zb_blkid >> bp_shift; 420 } 421 422 return (0); 423 } 424 425 static int 426 get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, 427 uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) 428 { 429 zseg_t zseg; 430 zbookmark_t *zb = &zseg.seg_start; 431 uint64_t object = *objectp; 432 int i, rc; 433 434 SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); 435 SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); 436 437 zseg.seg_mintxg = txg; 438 zseg.seg_maxtxg = -1ULL; 439 440 for (;;) { 441 rc = find_block(th, &zseg, mdn, depth); 442 443 if (rc == EAGAIN || rc == EINTR || rc == ERANGE) 444 break; 445 446 if (rc == 0 && zb->zb_level == 0) { 447 dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; 448 for (i = 0; i < DNODES_PER_BLOCK; i++) { 449 object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; 450 if (object >= *objectp && 451 dnp[i].dn_type != DMU_OT_NONE && 452 (type == -1 || dnp[i].dn_type == type)) { 453 *objectp = object; 454 *dnpp = &dnp[i]; 455 return (0); 456 } 457 } 458 } 459 460 rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); 461 462 if (rc == ERANGE) 463 break; 464 } 465 466 if (rc == ERANGE) 467 *objectp = ZB_MAXOBJECT; 468 469 return (rc); 470 } 471 472 static int 473 traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) 474 { 475 zbookmark_t *zb = &zseg->seg_start; 476 traverse_blk_cache_t *bc; 477 dnode_phys_t *dn, *dn_tmp; 478 int worklimit = 1000; 479 int rc; 480 481 dprintf("<%llu, %llu, %d, %llx>\n", 482 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); 483 484 bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; 485 dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; 486 487 SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); 488 489 rc = traverse_read(th, bc, mosbp, dn); 490 491 if (rc) /* If we get ERESTART, we've got nowhere left to go */ 492 return (rc == ERESTART ? EINTR : rc); 493 494 ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); 495 496 if (zb->zb_objset != 0) { 497 uint64_t objset = zb->zb_objset; 498 dsl_dataset_phys_t *dsp; 499 500 rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, 501 DMU_OT_DSL_DATASET, ZB_MOS_CACHE); 502 503 if (objset != zb->zb_objset) 504 rc = advance_objset(zseg, objset, th->th_advance); 505 506 if (rc != 0) 507 return (rc); 508 509 dsp = DN_BONUS(dn_tmp); 510 511 bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; 512 dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; 513 514 SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); 515 516 rc = traverse_read(th, bc, &dsp->ds_bp, dn); 517 518 if (rc != 0) { 519 if (rc == ERESTART) 520 rc = advance_objset(zseg, zb->zb_objset + 1, 521 th->th_advance); 522 return (rc); 523 } 524 525 if (th->th_advance & ADVANCE_PRUNE) 526 zseg->seg_mintxg = 527 MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); 528 } 529 530 if (zb->zb_level == -1) { 531 ASSERT(zb->zb_object == 0); 532 533 if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { 534 rc = traverse_callback(th, zseg, bc); 535 if (rc) { 536 ASSERT(rc == EINTR); 537 return (rc); 538 } 539 } 540 541 return (advance_from_osphys(zseg, th->th_advance)); 542 } 543 544 if (zb->zb_object != 0) { 545 uint64_t object = zb->zb_object; 546 547 rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, 548 zseg->seg_mintxg, -1, ZB_MDN_CACHE); 549 550 if (object != zb->zb_object) 551 rc = advance_object(zseg, object, th->th_advance); 552 553 if (rc != 0) 554 return (rc); 555 556 dn = dn_tmp; 557 } 558 559 if (zb->zb_level == ZB_MAXLEVEL) 560 zb->zb_level = dn->dn_nlevels - 1; 561 562 for (;;) { 563 rc = find_block(th, zseg, dn, ZB_DN_CACHE); 564 565 if (rc == EAGAIN || rc == EINTR || rc == ERANGE) 566 break; 567 568 if (rc == 0) { 569 bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; 570 ASSERT(bc->bc_dnode == dn); 571 ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); 572 rc = traverse_callback(th, zseg, bc); 573 if (rc) { 574 ASSERT(rc == EINTR); 575 return (rc); 576 } 577 if (BP_IS_HOLE(&bc->bc_blkptr)) { 578 ASSERT(th->th_advance & ADVANCE_HOLES); 579 rc = ENOTBLK; 580 } 581 } 582 583 rc = advance_block(zseg, dn, rc, th->th_advance); 584 585 if (rc == ERANGE) 586 break; 587 588 /* 589 * Give spa_sync() a chance to run. 590 */ 591 if (spa_traverse_wanted(th->th_spa)) { 592 th->th_syncs++; 593 return (EAGAIN); 594 } 595 596 if (--worklimit == 0) 597 return (EAGAIN); 598 } 599 600 if (rc == ERANGE) 601 rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); 602 603 return (rc); 604 } 605 606 /* 607 * It is the caller's responsibility to ensure that the dsl_dataset_t 608 * doesn't go away during traversal. 609 */ 610 int 611 traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, 612 blkptr_cb_t func, void *arg) 613 { 614 spa_t *spa = ds->ds_dir->dd_pool->dp_spa; 615 traverse_handle_t *th; 616 int err; 617 618 th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); 619 620 traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); 621 622 while ((err = traverse_more(th)) == EAGAIN) 623 continue; 624 625 traverse_fini(th); 626 return (err); 627 } 628 629 int 630 traverse_more(traverse_handle_t *th) 631 { 632 zseg_t *zseg = list_head(&th->th_seglist); 633 uint64_t save_txg; /* XXX won't be necessary with real itinerary */ 634 krwlock_t *rw = spa_traverse_rwlock(th->th_spa); 635 blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); 636 int rc; 637 638 if (zseg == NULL) 639 return (0); 640 641 th->th_restarts++; 642 643 save_txg = zseg->seg_mintxg; 644 645 if (!(th->th_advance & ADVANCE_NOLOCK)) 646 rw_enter(rw, RW_READER); 647 648 rc = traverse_segment(th, zseg, mosbp); 649 ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); 650 651 if (!(th->th_advance & ADVANCE_NOLOCK)) 652 rw_exit(rw); 653 654 zseg->seg_mintxg = save_txg; 655 656 if (rc == ERANGE) { 657 list_remove(&th->th_seglist, zseg); 658 kmem_free(zseg, sizeof (*zseg)); 659 return (EAGAIN); 660 } 661 662 return (rc); 663 } 664 665 /* 666 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves 667 * are not included. The blocks covered by this segment will all have 668 * mintxg < birth < maxtxg. 669 */ 670 static void 671 traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, 672 uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, 673 uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) 674 { 675 zseg_t *zseg; 676 677 zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); 678 679 zseg->seg_mintxg = mintxg; 680 zseg->seg_maxtxg = maxtxg; 681 682 zseg->seg_start.zb_objset = sobjset; 683 zseg->seg_start.zb_object = sobject; 684 zseg->seg_start.zb_level = slevel; 685 zseg->seg_start.zb_blkid = sblkid; 686 687 zseg->seg_end.zb_objset = eobjset; 688 zseg->seg_end.zb_object = eobject; 689 zseg->seg_end.zb_level = elevel; 690 zseg->seg_end.zb_blkid = eblkid; 691 692 list_insert_tail(&th->th_seglist, zseg); 693 } 694 695 void 696 traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, 697 uint64_t objset, uint64_t object) 698 { 699 if (th->th_advance & ADVANCE_PRE) 700 traverse_add_segment(th, mintxg, maxtxg, 701 objset, object, ZB_MAXLEVEL, 0, 702 objset, object, 0, ZB_MAXBLKID); 703 else 704 traverse_add_segment(th, mintxg, maxtxg, 705 objset, object, 0, 0, 706 objset, object, 0, ZB_MAXBLKID); 707 } 708 709 void 710 traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, 711 uint64_t objset) 712 { 713 if (th->th_advance & ADVANCE_PRE) 714 traverse_add_segment(th, mintxg, maxtxg, 715 objset, 0, -1, 0, 716 objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); 717 else 718 traverse_add_segment(th, mintxg, maxtxg, 719 objset, 1, 0, 0, 720 objset, 0, -1, 0); 721 } 722 723 void 724 traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) 725 { 726 if (th->th_advance & ADVANCE_PRE) 727 traverse_add_segment(th, mintxg, maxtxg, 728 0, 0, -1, 0, 729 ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); 730 else 731 traverse_add_segment(th, mintxg, maxtxg, 732 1, 1, 0, 0, 733 0, 0, -1, 0); 734 } 735 736 traverse_handle_t * 737 traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, 738 int zio_flags) 739 { 740 traverse_handle_t *th; 741 int d, l; 742 743 th = kmem_zalloc(sizeof (*th), KM_SLEEP); 744 745 th->th_spa = spa; 746 th->th_func = func; 747 th->th_arg = arg; 748 th->th_advance = advance; 749 th->th_lastcb.zb_level = ZB_NO_LEVEL; 750 th->th_noread.zb_level = ZB_NO_LEVEL; 751 th->th_zio_flags = zio_flags; 752 753 list_create(&th->th_seglist, sizeof (zseg_t), 754 offsetof(zseg_t, seg_node)); 755 756 for (d = 0; d < ZB_DEPTH; d++) { 757 for (l = 0; l < ZB_MAXLEVEL; l++) { 758 if ((advance & ADVANCE_DATA) || 759 l != 0 || d != ZB_DN_CACHE) 760 th->th_cache[d][l].bc_data = 761 zio_buf_alloc(SPA_MAXBLOCKSIZE); 762 } 763 } 764 765 return (th); 766 } 767 768 void 769 traverse_fini(traverse_handle_t *th) 770 { 771 int d, l; 772 zseg_t *zseg; 773 774 for (d = 0; d < ZB_DEPTH; d++) 775 for (l = 0; l < ZB_MAXLEVEL; l++) 776 if (th->th_cache[d][l].bc_data != NULL) 777 zio_buf_free(th->th_cache[d][l].bc_data, 778 SPA_MAXBLOCKSIZE); 779 780 while ((zseg = list_head(&th->th_seglist)) != NULL) { 781 list_remove(&th->th_seglist, zseg); 782 kmem_free(zseg, sizeof (*zseg)); 783 } 784 785 list_destroy(&th->th_seglist); 786 787 dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", 788 th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, 789 th->th_syncs, th->th_restarts); 790 791 kmem_free(th, sizeof (*th)); 792 } 793