/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>

/*
 * Log2 of the number of level-0 blkids spanned by one block pointer at
 * 'level', where 'width' is log2 of the blkptrs per indirect block
 * (see the blkoff computation in the ordering comment below).
 */
#define	BP_SPAN_SHIFT(level, width)	((level) * (width))

/* Two bps are "equal" if they name the same on-disk block at the same birth. */
#define	BP_EQUAL(b1, b2)					\
	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&		\
	(b1)->blk_birth == (b2)->blk_birth)

/*
 * Compare two bookmarks.
 *
 * For ADVANCE_PRE, the visitation order is:
 *
 *	objset	0, 1, 2, ..., ZB_MAXOBJSET.
 *	object	0, 1, 2, ..., ZB_MAXOBJECT.
 *	blkoff	0, 1, 2, ...
 *	level	ZB_MAXLEVEL, ..., 2, 1, 0.
 *
 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
 * ordering vector is:
 *
 *	< objset, object, blkoff, -level >
 *
 * For ADVANCE_POST, the starting offsets aren't sequential but ending
 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
 * The visitation order is:
 *
 *	objset	1, 2, ..., ZB_MAXOBJSET, 0.
 *	object	1, 2, ..., ZB_MAXOBJECT, 0.
 *	blkoff	1, 2, ...
 *	level	0, 1, 2, ..., ZB_MAXLEVEL.
 *
 * and thus a valid ordering vector is:
 *
 *	< objset - 1, object - 1, blkoff, level >
 *
 * Both orderings can be expressed as:
 *
 *	< objset + bias, object + bias, blkoff, level ^ bias >
 *
 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
 *
 * Special case: an objset's osphys is represented as level -1 of object 0.
 * It is always either the very first or very last block we visit in an objset.
 * Therefore, if either bookmark's level is -1, level alone determines order.
 */
static int
compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
	int advance)
{
	int bias = (advance & ADVANCE_PRE) ? 0 : -1;
	uint64_t sblkoff, eblkoff;
	int slevel, elevel, wshift;

	if (szb->zb_objset + bias < ezb->zb_objset + bias)
		return (-1);

	if (szb->zb_objset + bias > ezb->zb_objset + bias)
		return (1);

	slevel = szb->zb_level;
	elevel = ezb->zb_level;

	/* Special case: either bookmark is an osphys (level -1). */
	if ((slevel | elevel) < 0)
		return ((slevel ^ bias) - (elevel ^ bias));

	if (szb->zb_object + bias < ezb->zb_object + bias)
		return (-1);

	if (szb->zb_object + bias > ezb->zb_object + bias)
		return (1);

	/* Without a dnode we can't compute blkoff; same object is a tie. */
	if (dnp == NULL)
		return (0);

	wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;

	sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
	eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);

	if (sblkoff < eblkoff)
		return (-1);

	if (sblkoff > eblkoff)
		return (1);

	/* Same blkoff: pre-order visits higher levels first (negated bias). */
	return ((elevel ^ bias) - (slevel ^ bias));
}

/* Set all four bookmark coordinates at once. */
#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
{							\
	(zb)->zb_objset = objset;			\
	(zb)->zb_object = object;			\
	(zb)->zb_level = level;				\
	(zb)->zb_blkid = blkid;				\
}

/* Set just the level/blkid coordinates, leaving objset/object alone. */
#define	SET_BOOKMARK_LB(zb, level, blkid)		\
{							\
	(zb)->zb_level = level;				\
	(zb)->zb_blkid = blkid;				\
}

/*
 * Advance the segment's start bookmark to the given objset's first
 * position in visitation order (osphys for pre-order; object 1 level 0
 * for post-order, wrapping past ZB_MAXOBJSET to objset 0).
 * Returns EAGAIN so the caller resumes at the new bookmark, or ERANGE
 * when the new bookmark lies beyond seg_end (segment exhausted).
 */
static int
advance_objset(zseg_t *zseg, uint64_t objset, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;

	if (advance & ADVANCE_PRE) {
		if (objset >= ZB_MAXOBJSET)
			return (ERANGE);
		SET_BOOKMARK(zb, objset, 0, -1, 0);
	} else {
		if (objset >= ZB_MAXOBJSET)
			objset = 0;
		SET_BOOKMARK(zb, objset, 1, 0, 0);
	}

	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}

/*
 * Advance the segment's start bookmark to the given object within the
 * current objset, following the pre/post visitation order described
 * above (object 0 — the osphys — comes first in pre-order and last in
 * post-order).  Returns EAGAIN to continue, or ERANGE past seg_end.
 */
static int
advance_object(zseg_t *zseg, uint64_t object, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;

	if (advance & ADVANCE_PRE) {
		if (object >= ZB_MAXOBJECT) {
			/* Objects exhausted: move to the next objset. */
			SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
		} else {
			SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
		}
	} else {
		if (zb->zb_object == 0) {
			/* Post-order: after object 0, visit the osphys. */
			SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
		} else {
			if (object >= ZB_MAXOBJECT)
				object = 0;
			SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
		}
	}

	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}

/*
 * Advance past an objset's osphys block (level -1 of object 0): it's the
 * first block of an objset in pre-order and the last in post-order.
 * Returns EAGAIN to continue, or ERANGE when there's nowhere left to go.
 */
static int
advance_from_osphys(zseg_t *zseg, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;

	ASSERT(zb->zb_object == 0);
	ASSERT(zb->zb_level == -1);
	ASSERT(zb->zb_blkid == 0);

	if (advance & ADVANCE_PRE) {
		SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
	} else {
		/* Objset 0 is visited last in post-order; we're done. */
		if (zb->zb_objset == 0)
			return (ERANGE);
		SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
	}

	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}

/*
 * Advance the segment's start bookmark to the next block of dnode 'dnp'.
 * 'rc' is the disposition of the block just visited: in pre-order,
 * rc == 0 at an indirect level means descend into it; any nonzero rc
 * (e.g. ENOTBLK from pruning) means skip laterally instead.
 * Returns EAGAIN to continue, or ERANGE once past dn_maxblkid/seg_end.
 */
static int
advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;
	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
	int maxlevel = dnp->dn_nlevels - 1;
	int level = zb->zb_level;
	uint64_t blkid = zb->zb_blkid;

	if (advance & ADVANCE_PRE) {
		if (level > 0 && rc == 0) {
			/* Descend one level into this indirect block. */
			level--;
			blkid <<= wshift;
		} else {
			blkid++;

			if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
			    dnp->dn_maxblkid)
				return (ERANGE);

			/*
			 * Pop back up as long as we're at the first
			 * slot of the parent indirect block.
			 */
			while (level < maxlevel) {
				if (P2PHASE(blkid, 1ULL << wshift))
					break;
				blkid >>= wshift;
				level++;
			}
		}
	} else {
		if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
			/* Next sibling subtree, starting at level 0. */
			blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
			level = 0;
		} else {
			/* Last child of its parent: visit the parent next. */
			blkid >>= wshift;
			level++;
		}

		/* Climb past any levels that are entirely beyond maxblkid. */
		while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
		    dnp->dn_maxblkid) {
			if (level == maxlevel)
				return (ERANGE);
			blkid >>= wshift;
			level++;
		}
	}
	SET_BOOKMARK_LB(zb, level, blkid);

	if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}

/*
 * Issue the user's callback for the cached block 'bc', after pruning
 * against the segment's maxtxg, and (for non-log blocks) verifying the
 * visitation-order invariants.  Returns the callback's return value,
 * or 0 if the block was pruned.
 */
static int
traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
{
	/*
	 * Before we issue the callback, prune against maxtxg.
	 *
	 * We prune against mintxg before we get here because it's a big win.
	 * If a given block was born in txg 37, then we know that the entire
	 * subtree below that block must have been born in txg 37 or earlier.
	 * We can therefore lop off huge branches of the tree as we go.
	 *
	 * There's no corresponding optimization for maxtxg because knowing
	 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
	 * children.  In fact, the copy-on-write design of ZFS ensures that
	 * top-level blocks will pretty much always be new.
	 *
	 * Therefore, in the name of simplicity we don't prune against
	 * maxtxg until the last possible moment -- that being right now.
	 */
	if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
		return (0);

	/*
	 * Debugging: verify that the order we visit things agrees with the
	 * order defined by compare_bookmark().  We don't check this for
	 * log blocks because there's no defined ordering for them; they're
	 * always visited (or not) as part of visiting the objset_phys_t.
	 */
	if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
		zbookmark_t *zb = &bc->bc_bookmark;
		zbookmark_t *szb = &zseg->seg_start;
		zbookmark_t *ezb = &zseg->seg_end;
		zbookmark_t *lzb = &th->th_lastcb;
		dnode_phys_t *dnp = bc->bc_dnode;

		ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
		ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
		ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
		    lzb->zb_level == ZB_NO_LEVEL);
		*lzb = *zb;
	}

	th->th_callbacks++;
	return (th->th_func(bc, th->th_spa, th->th_arg));
}

/*
 * Read the block named by 'bp' into cache entry 'bc' (unless it's
 * already cached per BP_EQUAL, the entry has no data buffer, or the bp
 * is a hole).  On read failure we issue an error callback and poison
 * the cached blk_birth so the entry can't look like a valid hit.
 * Returns 0 on success, else the callback's disposition
 * (EAGAIN/EINTR/ERESTART).
 */
static int
traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
	dnode_phys_t *dnp)
{
	zbookmark_t *zb = &bc->bc_bookmark;
	int error;

	th->th_hits++;

	bc->bc_dnode = dnp;
	bc->bc_errno = 0;

	/* Cache hit: same DVA and birth txg as last time. */
	if (BP_EQUAL(&bc->bc_blkptr, bp))
		return (0);

	bc->bc_blkptr = *bp;

	/* No data buffer was allocated for this slot; bookkeeping only. */
	if (bc->bc_data == NULL)
		return (0);

	if (BP_IS_HOLE(bp)) {
		ASSERT(th->th_advance & ADVANCE_HOLES);
		return (0);
	}

	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
		/* th_noread: bookmark forced to fail — debugging hook. */
		error = EIO;
	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
		error = 0;
		th->th_arc_hits++;
	} else {
		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));

		/*
		 * Indirect blocks are always uint64 arrays; other types
		 * byteswap per their DMU object type.
		 */
		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
			(zb->zb_level > 0 ? byteswap_uint64_array :
			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
			    BP_GET_LSIZE(bp));
		th->th_reads++;
	}

	if (error) {
		bc->bc_errno = error;
		error = traverse_callback(th, NULL, bc);
		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
		/* Poison the cached bp so this entry can never BP_EQUAL. */
		bc->bc_blkptr.blk_birth = -1ULL;
	}

	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
	    bc - &th->th_cache[0][0], error,
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

	return (error);
}

/*
 * Walk down dnode 'dnp's indirect tree from its top level to the level
 * in the segment's start bookmark, filling the cache row for 'depth'.
 * Returns 0 when the target block (or a visitable hole) is reached,
 * ENOTBLK when the subtree is pruned by seg_mintxg, ERANGE when the
 * bookmark is out of range for this dnode, or traverse_read()'s error.
 */
static int
find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
{
	zbookmark_t *zb = &zseg->seg_start;
	traverse_blk_cache_t *bc;
	blkptr_t *bp = dnp->dn_blkptr;
	int i, first, level;
	int nbp = dnp->dn_nblkptr;
	int minlevel = zb->zb_level;
	int maxlevel = dnp->dn_nlevels - 1;
	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
	uint64_t blkid = zb->zb_blkid >> bp_shift;
	/* Holes are only interesting at the dnode cache depth. */
	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
	int rc;

	if (minlevel > maxlevel || blkid >= nbp)
		return (ERANGE);

	for (level = maxlevel; level >= minlevel; level--) {
		first = P2PHASE(blkid, 1ULL << wshift);

		/* Note: && binds tighter than ||, as intended here. */
		for (i = first; i < nbp; i++)
			if (bp[i].blk_birth > zseg->seg_mintxg ||
			    BP_IS_HOLE(&bp[i]) && do_holes)
				break;

		if (i != first) {
			/*
			 * bp[first..i-1] were born at or before seg_mintxg:
			 * bookmark the last pruned blkid and return ENOTBLK
			 * so advance_block() skips past the whole run.
			 */
			i--;
			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
			return (ENOTBLK);
		}

		bc = &th->th_cache[depth][level];

		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
		    level, blkid);

		if (rc = traverse_read(th, bc, bp + i, dnp)) {
			if (rc != EAGAIN) {
				SET_BOOKMARK_LB(zb, level, blkid);
			}
			return (rc);
		}

		if (BP_IS_HOLE(&bp[i])) {
			SET_BOOKMARK_LB(zb, level, blkid);
			/* Holes carry no data, so ordering checks can't apply. */
			th->th_lastcb.zb_level = ZB_NO_LEVEL;
			return (0);
		}

		/* Descend: the indirect block we just read is the new bp array. */
		nbp = 1 << wshift;
		bp = bc->bc_data;
		bp_shift -= wshift;
		blkid = zb->zb_blkid >> bp_shift;
	}

	return (0);
}

/*
 * Starting at *objectp, find the next allocated dnode in 'objset' (of
 * the given 'type', or any type if type == -1), reading dnode blocks
 * through meta-dnode 'mdn' at cache depth 'depth'.  On success returns 0
 * with *objectp/*dnpp set; on exhaustion returns ERANGE and sets
 * *objectp = ZB_MAXOBJECT.  May also return EAGAIN or EINTR from the
 * underlying find_block().
 */
static int
get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
	uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
{
	zseg_t zseg;
	zbookmark_t *zb = &zseg.seg_start;
	uint64_t object = *objectp;
	int i, rc;

	/* Build a private level-0 segment over the meta-dnode's blocks. */
	SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);

	zseg.seg_mintxg = txg;
	zseg.seg_maxtxg = -1ULL;

	for (;;) {
		rc = find_block(th, &zseg, mdn, depth);

		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
			break;

		if (rc == 0 && zb->zb_level == 0) {
			/* Scan the dnode block for a matching dnode. */
			dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
			for (i = 0; i < DNODES_PER_BLOCK; i++) {
				object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
				if (object >= *objectp &&
				    dnp[i].dn_type != DMU_OT_NONE &&
				    (type == -1 || dnp[i].dn_type == type)) {
					*objectp = object;
					*dnpp = &dnp[i];
					return (0);
				}
			}
		}

		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);

		if (rc == ERANGE)
			break;
	}

	if (rc == ERANGE)
		*objectp = ZB_MAXOBJECT;

	return (rc);
}

/*
 * zil_parse() callback for log blocks: visit the block through the
 * shared th_zil_cache when it postdates the segment's mintxg and either
 * the log has been claimed (claim_txg != 0) or the block predates the
 * pool's first txg.  The ZIL chain sequence number serves as the blkid.
 */
/* ARGSUSED */
static void
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	traverse_handle_t *th = arg;
	traverse_blk_cache_t *bc = &th->th_zil_cache;
	zbookmark_t *zb = &bc->bc_bookmark;
	zseg_t *zseg = list_head(&th->th_seglist);

	if (bp->blk_birth <= zseg->seg_mintxg)
		return;

	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
		zb->zb_object = 0;
		/* ZIL chain sequence number doubles as the blkid. */
		zb->zb_blkid = bp->blk_cksum.zc_word[3];
		bc->bc_blkptr = *bp;
		(void) traverse_callback(th, zseg, bc);
	}
}

/*
 * zil_parse() callback for log records: for TX_WRITE records whose
 * data block was claimed (birth >= claim_txg) and postdates mintxg,
 * visit the block through the shared th_zil_cache, bookmarked by the
 * file object and file-offset-derived blkid.
 */
/* ARGSUSED */
static void
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	traverse_handle_t *th = arg;
	traverse_blk_cache_t *bc = &th->th_zil_cache;
	zbookmark_t *zb = &bc->bc_bookmark;
	zseg_t *zseg = list_head(&th->th_seglist);

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;

		if (bp->blk_birth <= zseg->seg_mintxg)
			return;

		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
			zb->zb_object = lr->lr_foid;
			zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
			bc->bc_blkptr = *bp;
			(void) traverse_callback(th, zseg, bc);
		}
	}
}

/*
 * Traverse the intent log of the objset whose objset_phys_t is cached
 * in 'bc', visiting each log block and TX_WRITE data block via the
 * callbacks above.
 */
static void
traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
{
	spa_t *spa = th->th_spa;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_phys_t *osphys = bc->bc_data;
	zil_header_t *zh = &osphys->os_zil_header;
	uint64_t claim_txg = zh->zh_claim_txg;
	zilog_t *zilog;

	ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
	ASSERT(bc->bc_bookmark.zb_level == -1);

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && (spa_mode & FWRITE))
		return;

	th->th_zil_cache.bc_bookmark = bc->bc_bookmark;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
	    claim_txg);

	zil_free(zilog);
}

/*
 * Do one bounded unit of traversal work on segment 'zseg', starting from
 * the MOS root blkptr 'mosbp': resolve the objset and dnode named by the
 * start bookmark, then visit up to 'worklimit' blocks.  Returns EAGAIN
 * when there's more to do on this segment (or spa_sync() wants the
 * traverse lock), ERANGE when the segment is exhausted, or EINTR if the
 * callback asked us to stop.
 */
static int
traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
{
	zbookmark_t *zb = &zseg->seg_start;
	traverse_blk_cache_t *bc;
	dnode_phys_t *dn, *dn_tmp;
	int worklimit = 100;
	int rc;

	dprintf("<%llu, %llu, %d, %llx>\n",
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

	bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
	dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

	SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);

	rc = traverse_read(th, bc, mosbp, dn);

	if (rc) /* If we get ERESTART, we've got nowhere left to go */
		return (rc == ERESTART ? EINTR : rc);

	ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);

	if (zb->zb_objset != 0) {
		/* Look up the dataset's osphys via its DSL dataset dnode. */
		uint64_t objset = zb->zb_objset;
		dsl_dataset_phys_t *dsp;

		rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
		    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);

		if (objset != zb->zb_objset)
			rc = advance_objset(zseg, objset, th->th_advance);

		if (rc != 0)
			return (rc);

		dsp = DN_BONUS(dn_tmp);

		bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
		dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);

		/*
		 * If we're traversing an open snapshot, we know that it
		 * can't be deleted (because it's open) and it can't change
		 * (because it's a snapshot).  Therefore, once we've gotten
		 * from the uberblock down to the snapshot's objset_phys_t,
		 * we no longer need to synchronize with spa_sync(); we're
		 * traversing a completely static block tree from here on.
		 */
		if (th->th_advance & ADVANCE_NOLOCK) {
			ASSERT(th->th_locked);
			rw_exit(spa_traverse_rwlock(th->th_spa));
			th->th_locked = 0;
		}

		rc = traverse_read(th, bc, &dsp->ds_bp, dn);

		if (rc != 0) {
			if (rc == ERESTART)
				rc = advance_objset(zseg, zb->zb_objset + 1,
				    th->th_advance);
			return (rc);
		}

		if (th->th_advance & ADVANCE_PRUNE)
			zseg->seg_mintxg =
			    MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
	}

	if (zb->zb_level == -1) {
		/* Visit the osphys itself, plus its ZIL if requested. */
		ASSERT(zb->zb_object == 0);
		ASSERT(zb->zb_blkid == 0);
		ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);

		if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
			rc = traverse_callback(th, zseg, bc);
			if (rc) {
				ASSERT(rc == EINTR);
				return (rc);
			}
			if ((th->th_advance & ADVANCE_ZIL) &&
			    zb->zb_objset != 0)
				traverse_zil(th, bc);
		}

		return (advance_from_osphys(zseg, th->th_advance));
	}

	if (zb->zb_object != 0) {
		uint64_t object = zb->zb_object;

		rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
		    zseg->seg_mintxg, -1, ZB_MDN_CACHE);

		if (object != zb->zb_object)
			rc = advance_object(zseg, object, th->th_advance);

		if (rc != 0)
			return (rc);

		dn = dn_tmp;
	}

	/* Clamp the sentinel start level to this dnode's actual top level. */
	if (zb->zb_level == ZB_MAXLEVEL)
		zb->zb_level = dn->dn_nlevels - 1;

	for (;;) {
		rc = find_block(th, zseg, dn, ZB_DN_CACHE);

		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
			break;

		if (rc == 0) {
			bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
			ASSERT(bc->bc_dnode == dn);
			ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
			rc = traverse_callback(th, zseg, bc);
			if (rc) {
				ASSERT(rc == EINTR);
				return (rc);
			}
			if (BP_IS_HOLE(&bc->bc_blkptr)) {
				ASSERT(th->th_advance & ADVANCE_HOLES);
				rc = ENOTBLK;
			}
		}

		rc = advance_block(zseg, dn, rc, th->th_advance);

		if (rc == ERANGE)
			break;

		/*
		 * Give spa_sync() a chance to run.
		 */
		if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
			th->th_syncs++;
			return (EAGAIN);
		}

		if (--worklimit == 0)
			return (EAGAIN);
	}

	if (rc == ERANGE)
		rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);

	return (rc);
}

/*
 * It is the caller's responsibility to ensure that the dsl_dataset_t
 * doesn't go away during traversal.
 */
int
traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
	blkptr_cb_t func, void *arg)
{
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	traverse_handle_t *th;
	int err;

	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);

	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);

	/* Drive the traversal to completion synchronously. */
	while ((err = traverse_more(th)) == EAGAIN)
		continue;

	traverse_fini(th);
	return (err);
}

/*
 * Perform one batch of traversal work on the first pending segment,
 * holding the spa traverse lock (reader) so spa_sync() can't change the
 * block tree underneath us.  Returns EAGAIN if there's more to do,
 * EINTR if the callback interrupted us, or 0 once the segment list is
 * empty.  Exhausted segments (ERANGE) are freed here.
 */
int
traverse_more(traverse_handle_t *th)
{
	zseg_t *zseg = list_head(&th->th_seglist);
	uint64_t save_txg; /* XXX won't be necessary with real itinerary */
	krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
	blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
	int rc;

	if (zseg == NULL)
		return (0);

	th->th_restarts++;

	save_txg = zseg->seg_mintxg;

	rw_enter(rw, RW_READER);
	th->th_locked = 1;

	rc = traverse_segment(th, zseg, mosbp);
	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);

	/* traverse_segment() may have dropped the lock (ADVANCE_NOLOCK). */
	if (th->th_locked)
		rw_exit(rw);
	th->th_locked = 0;

	/* Undo any ADVANCE_PRUNE tightening of mintxg for the next pass. */
	zseg->seg_mintxg = save_txg;

	if (rc == ERANGE) {
		list_remove(&th->th_seglist, zseg);
		kmem_free(zseg, sizeof (*zseg));
		return (EAGAIN);
	}

	return (rc);
}

/*
 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
 * are not included.  The blocks covered by this segment will all have
 * mintxg < birth < maxtxg.
 */
static void
traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
	uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
	uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
{
	zseg_t *zseg;

	zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);

	zseg->seg_mintxg = mintxg;
	zseg->seg_maxtxg = maxtxg;

	zseg->seg_start.zb_objset = sobjset;
	zseg->seg_start.zb_object = sobject;
	zseg->seg_start.zb_level = slevel;
	zseg->seg_start.zb_blkid = sblkid;

	zseg->seg_end.zb_objset = eobjset;
	zseg->seg_end.zb_object = eobject;
	zseg->seg_end.zb_level = elevel;
	zseg->seg_end.zb_blkid = eblkid;

	list_insert_tail(&th->th_seglist, zseg);
}

/*
 * Queue a segment covering a single dnode's block tree; start/end
 * bookmarks differ per pre/post visitation order.
 */
void
traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
	uint64_t objset, uint64_t object)
{
	if (th->th_advance & ADVANCE_PRE)
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, object, ZB_MAXLEVEL, 0,
		    objset, object, 0, ZB_MAXBLKID);
	else
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, object, 0, 0,
		    objset, object, 0, ZB_MAXBLKID);
}

/*
 * Queue a segment covering one whole objset (osphys first in pre-order,
 * last in post-order).
 */
void
traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
	uint64_t objset)
{
	if (th->th_advance & ADVANCE_PRE)
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, 0, -1, 0,
		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
	else
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, 1, 0, 0,
		    objset, 0, -1, 0);
}

/*
 * Queue a segment covering the entire pool (all objsets, all objects).
 */
void
traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
{
	if (th->th_advance & ADVANCE_PRE)
		traverse_add_segment(th, mintxg, maxtxg,
		    0, 0, -1, 0,
		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
	else
		traverse_add_segment(th, mintxg, maxtxg,
		    1, 1, 0, 0,
		    0, 0, -1, 0);
}

/*
 * Allocate and initialize a traverse handle.  Cache data buffers are
 * allocated for every (depth, level) slot except level 0 of the dnode
 * cache, which only gets one when ADVANCE_DATA is set (i.e. data blocks
 * are read only on request).
 */
traverse_handle_t *
traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
	int zio_flags)
{
	traverse_handle_t *th;
	int d, l;

	th = kmem_zalloc(sizeof (*th), KM_SLEEP);

	th->th_spa = spa;
	th->th_func = func;
	th->th_arg = arg;
	th->th_advance = advance;
	th->th_lastcb.zb_level = ZB_NO_LEVEL;
	th->th_noread.zb_level = ZB_NO_LEVEL;
	th->th_zio_flags = zio_flags;

	list_create(&th->th_seglist, sizeof (zseg_t),
	    offsetof(zseg_t, seg_node));

	for (d = 0; d < ZB_DEPTH; d++) {
		for (l = 0; l < ZB_MAXLEVEL; l++) {
			if ((advance & ADVANCE_DATA) ||
			    l != 0 || d != ZB_DN_CACHE)
				th->th_cache[d][l].bc_data =
				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
		}
	}

	return (th);
}

/*
 * Free all cache buffers, any remaining (unfinished) segments, and the
 * handle itself, logging traversal statistics on the way out.
 */
void
traverse_fini(traverse_handle_t *th)
{
	int d, l;
	zseg_t *zseg;

	for (d = 0; d < ZB_DEPTH; d++)
		for (l = 0; l < ZB_MAXLEVEL; l++)
			if (th->th_cache[d][l].bc_data != NULL)
				zio_buf_free(th->th_cache[d][l].bc_data,
				    SPA_MAXBLOCKSIZE);

	while ((zseg = list_head(&th->th_seglist)) != NULL) {
		list_remove(&th->th_seglist, zseg);
		kmem_free(zseg, sizeof (*zseg));
	}

	list_destroy(&th->th_seglist);

	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
	    th->th_syncs, th->th_restarts);

	kmem_free(th, sizeof (*th));
}