1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu_objset.h> 30 #include <sys/dmu_traverse.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/dsl_pool.h> 34 #include <sys/dnode.h> 35 #include <sys/spa.h> 36 #include <sys/zio.h> 37 #include <sys/dmu_impl.h> 38 #include <sys/zvol.h> 39 40 #define BP_SPAN_SHIFT(level, width) ((level) * (width)) 41 42 #define BP_EQUAL(b1, b2) \ 43 (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ 44 (b1)->blk_birth == (b2)->blk_birth) 45 46 /* 47 * Compare two bookmarks. 48 * 49 * For ADVANCE_PRE, the visitation order is: 50 * 51 * objset 0, 1, 2, ..., ZB_MAXOBJSET. 52 * object 0, 1, 2, ..., ZB_MAXOBJECT. 53 * blkoff 0, 1, 2, ... 54 * level ZB_MAXLEVEL, ..., 2, 1, 0. 55 * 56 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid 57 * ordering vector is: 58 * 59 * < objset, object, blkoff, -level > 60 * 61 * For ADVANCE_POST, the starting offsets aren't sequential but ending 62 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. 63 * The visitation order is: 64 * 65 * objset 1, 2, ..., ZB_MAXOBJSET, 0. 66 * object 1, 2, ..., ZB_MAXOBJECT, 0. 67 * blkoff 1, 2, ... 68 * level 0, 1, 2, ..., ZB_MAXLEVEL. 69 * 70 * and thus a valid ordering vector is: 71 * 72 * < objset - 1, object - 1, blkoff, level > 73 * 74 * Both orderings can be expressed as: 75 * 76 * < objset + bias, object + bias, blkoff, level ^ bias > 77 * 78 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) 79 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). 80 * 81 * Special case: an objset's osphys is represented as level -1 of object 0. 82 * It is always either the very first or very last block we visit in an objset. 83 * Therefore, if either bookmark's level is -1, level alone determines order. 84 */ 85 static int 86 compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, 87 int advance) 88 { 89 int bias = (advance & ADVANCE_PRE) ? 0 : -1; 90 uint64_t sblkoff, eblkoff; 91 int slevel, elevel, wshift; 92 93 if (szb->zb_objset + bias < ezb->zb_objset + bias) 94 return (-1); 95 96 if (szb->zb_objset + bias > ezb->zb_objset + bias) 97 return (1); 98 99 slevel = szb->zb_level; 100 elevel = ezb->zb_level; 101 102 if ((slevel | elevel) < 0) 103 return ((slevel ^ bias) - (elevel ^ bias)); 104 105 if (szb->zb_object + bias < ezb->zb_object + bias) 106 return (-1); 107 108 if (szb->zb_object + bias > ezb->zb_object + bias) 109 return (1); 110 111 if (dnp == NULL) 112 return (0); 113 114 wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; 115 116 sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); 117 eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); 118 119 if (sblkoff < eblkoff) 120 return (-1); 121 122 if (sblkoff > eblkoff) 123 return (1); 124 125 return ((elevel ^ bias) - (slevel ^ bias)); 126 } 127 128 #define SET_BOOKMARK(zb, objset, object, level, blkid) \ 129 { \ 130 (zb)->zb_objset = objset; \ 131 (zb)->zb_object = object; \ 132 (zb)->zb_level = level; \ 133 (zb)->zb_blkid = blkid; \ 134 } 135 136 #define SET_BOOKMARK_LB(zb, level, blkid) \ 137 { \ 138 (zb)->zb_level = level; \ 139 (zb)->zb_blkid = blkid; \ 140 } 141 142 static int 143 advance_objset(zseg_t *zseg, uint64_t objset, int advance) 144 { 145 zbookmark_t *zb = &zseg->seg_start; 146 147 if (advance & ADVANCE_PRE) { 148 if (objset >= ZB_MAXOBJSET) 149 return (ERANGE); 150 SET_BOOKMARK(zb, objset, 0, -1, 0); 151 } else { 152 if (objset >= ZB_MAXOBJSET) 153 objset = 0; 154 SET_BOOKMARK(zb, objset, 1, 0, 0); 155 } 156 157 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) 158 return (ERANGE); 159 160 return (EAGAIN); 161 } 162 163 static int 164 advance_object(zseg_t *zseg, uint64_t object, int advance) 165 { 166 zbookmark_t *zb = &zseg->seg_start; 167 168 if (advance & ADVANCE_PRE) { 169 if (object >= ZB_MAXOBJECT) { 170 SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); 171 } else { 172 SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); 173 } 174 } else { 175 if (zb->zb_object == 0) { 176 SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); 177 } else { 178 if (object >= ZB_MAXOBJECT) 179 object = 0; 180 SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); 181 } 182 } 183 184 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) 185 return (ERANGE); 186 187 return (EAGAIN); 188 } 189 190 static int 191 advance_from_osphys(zseg_t *zseg, int advance) 192 { 193 zbookmark_t *zb = &zseg->seg_start; 194 195 ASSERT(zb->zb_object == 0); 196 ASSERT(zb->zb_level == -1); 197 ASSERT(zb->zb_blkid == 0); 198 199 if (advance & ADVANCE_PRE) { 200 SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); 201 } else { 202 if (zb->zb_objset == 0) 203 return (ERANGE); 204 SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); 205 } 206 207 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) 208 return (ERANGE); 209 210 return (EAGAIN); 211 } 212 213 static int 214 advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) 215 { 216 zbookmark_t *zb = &zseg->seg_start; 217 int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; 218 int maxlevel = dnp->dn_nlevels - 1; 219 int level = zb->zb_level; 220 uint64_t blkid = zb->zb_blkid; 221 222 if (advance & ADVANCE_PRE) { 223 if (level > 0 && rc == 0) { 224 level--; 225 blkid <<= wshift; 226 } else { 227 blkid++; 228 229 if ((blkid << BP_SPAN_SHIFT(level, wshift)) > 230 dnp->dn_maxblkid) 231 return (ERANGE); 232 233 while (level < maxlevel) { 234 if (P2PHASE(blkid, 1ULL << wshift)) 235 break; 236 blkid >>= wshift; 237 level++; 238 } 239 } 240 } else { 241 if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { 242 blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); 243 level = 0; 244 } else { 245 blkid >>= wshift; 246 level++; 247 } 248 249 while ((blkid << BP_SPAN_SHIFT(level, wshift)) > 250 dnp->dn_maxblkid) { 251 if (level == maxlevel) 252 return (ERANGE); 253 blkid >>= wshift; 254 level++; 255 } 256 } 257 SET_BOOKMARK_LB(zb, level, blkid); 258 259 if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) 260 return (ERANGE); 261 262 return (EAGAIN); 263 } 264 265 /* 266 * The traverse_callback function will call the function specified in th_func. 267 * In the event of an error the callee, specified by th_func, must return 268 * one of the following errors: 269 * 270 * EINTR - Indicates that the callee wants the traversal to 271 * abort immediately. 272 * ERESTART - The callee has acknowledged the error and would 273 * like to continue. 274 */ 275 static int 276 traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) 277 { 278 /* 279 * Before we issue the callback, prune against maxtxg. 280 * 281 * We prune against mintxg before we get here because it's a big win. 282 * If a given block was born in txg 37, then we know that the entire 283 * subtree below that block must have been born in txg 37 or earlier. 284 * We can therefore lop off huge branches of the tree as we go. 285 * 286 * There's no corresponding optimization for maxtxg because knowing 287 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's 288 * children. In fact, the copy-on-write design of ZFS ensures that 289 * top-level blocks will pretty much always be new. 290 * 291 * Therefore, in the name of simplicity we don't prune against 292 * maxtxg until the last possible moment -- that being right now. 293 */ 294 if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) 295 return (0); 296 297 /* 298 * Debugging: verify that the order we visit things agrees with the 299 * order defined by compare_bookmark(). We don't check this for 300 * log blocks because there's no defined ordering for them; they're 301 * always visited (or not) as part of visiting the objset_phys_t. 302 */ 303 if (bc->bc_errno == 0 && bc != &th->th_zil_cache) { 304 zbookmark_t *zb = &bc->bc_bookmark; 305 zbookmark_t *szb = &zseg->seg_start; 306 zbookmark_t *ezb = &zseg->seg_end; 307 zbookmark_t *lzb = &th->th_lastcb; 308 dnode_phys_t *dnp = bc->bc_dnode; 309 310 ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); 311 ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); 312 ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || 313 lzb->zb_level == ZB_NO_LEVEL); 314 *lzb = *zb; 315 } 316 317 th->th_callbacks++; 318 return (th->th_func(bc, th->th_spa, th->th_arg)); 319 } 320 321 static int 322 traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, 323 dnode_phys_t *dnp) 324 { 325 zbookmark_t *zb = &bc->bc_bookmark; 326 int error; 327 328 th->th_hits++; 329 330 bc->bc_dnode = dnp; 331 bc->bc_errno = 0; 332 333 if (BP_EQUAL(&bc->bc_blkptr, bp)) 334 return (0); 335 336 bc->bc_blkptr = *bp; 337 338 if (bc->bc_data == NULL) 339 return (0); 340 341 if (BP_IS_HOLE(bp)) { 342 ASSERT(th->th_advance & ADVANCE_HOLES); 343 return (0); 344 } 345 346 if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { 347 error = EIO; 348 } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { 349 error = 0; 350 th->th_arc_hits++; 351 } else { 352 error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, 353 BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, 354 th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); 355 356 if (BP_SHOULD_BYTESWAP(bp) && error == 0) 357 (zb->zb_level > 0 ? byteswap_uint64_array : 358 dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, 359 BP_GET_LSIZE(bp)); 360 th->th_reads++; 361 } 362 363 if (error) { 364 bc->bc_errno = error; 365 error = traverse_callback(th, NULL, bc); 366 ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); 367 bc->bc_blkptr.blk_birth = -1ULL; 368 } 369 370 dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", 371 bc - &th->th_cache[0][0], error, 372 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); 373 374 return (error); 375 } 376 377 static int 378 find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) 379 { 380 zbookmark_t *zb = &zseg->seg_start; 381 traverse_blk_cache_t *bc; 382 blkptr_t *bp = dnp->dn_blkptr; 383 int i, first, level; 384 int nbp = dnp->dn_nblkptr; 385 int minlevel = zb->zb_level; 386 int maxlevel = dnp->dn_nlevels - 1; 387 int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; 388 int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); 389 uint64_t blkid = zb->zb_blkid >> bp_shift; 390 int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; 391 int rc; 392 393 if (minlevel > maxlevel || blkid >= nbp) 394 return (ERANGE); 395 396 for (level = maxlevel; level >= minlevel; level--) { 397 first = P2PHASE(blkid, 1ULL << wshift); 398 399 for (i = first; i < nbp; i++) 400 if (bp[i].blk_birth > zseg->seg_mintxg || 401 BP_IS_HOLE(&bp[i]) && do_holes) 402 break; 403 404 if (i != first) { 405 i--; 406 SET_BOOKMARK_LB(zb, level, blkid + (i - first)); 407 return (ENOTBLK); 408 } 409 410 bc = &th->th_cache[depth][level]; 411 412 SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, 413 level, blkid); 414 415 if (rc = traverse_read(th, bc, bp + i, dnp)) { 416 if (rc != EAGAIN) { 417 SET_BOOKMARK_LB(zb, level, blkid); 418 } 419 return (rc); 420 } 421 422 if (BP_IS_HOLE(&bp[i])) { 423 SET_BOOKMARK_LB(zb, level, blkid); 424 th->th_lastcb.zb_level = ZB_NO_LEVEL; 425 return (0); 426 } 427 428 nbp = 1 << wshift; 429 bp = bc->bc_data; 430 bp_shift -= wshift; 431 blkid = zb->zb_blkid >> bp_shift; 432 } 433 434 return (0); 435 } 436 437 static int 438 get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, 439 uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) 440 { 441 zseg_t zseg; 442 zbookmark_t *zb = &zseg.seg_start; 443 uint64_t object = *objectp; 444 int i, rc; 445 446 SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); 447 SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); 448 449 zseg.seg_mintxg = txg; 450 zseg.seg_maxtxg = -1ULL; 451 452 for (;;) { 453 rc = find_block(th, &zseg, mdn, depth); 454 455 if (rc == EAGAIN || rc == EINTR || rc == ERANGE) 456 break; 457 458 if (rc == 0 && zb->zb_level == 0) { 459 dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; 460 for (i = 0; i < DNODES_PER_BLOCK; i++) { 461 object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; 462 if (object >= *objectp && 463 dnp[i].dn_type != DMU_OT_NONE && 464 (type == -1 || dnp[i].dn_type == type)) { 465 *objectp = object; 466 *dnpp = &dnp[i]; 467 return (0); 468 } 469 } 470 } 471 472 rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); 473 474 if (rc == ERANGE) 475 break; 476 } 477 478 if (rc == ERANGE) 479 *objectp = ZB_MAXOBJECT; 480 481 return (rc); 482 } 483 484 /* ARGSUSED */ 485 static void 486 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 487 { 488 traverse_handle_t *th = arg; 489 traverse_blk_cache_t *bc = &th->th_zil_cache; 490 zbookmark_t *zb = &bc->bc_bookmark; 491 zseg_t *zseg = list_head(&th->th_seglist); 492 493 if (bp->blk_birth <= zseg->seg_mintxg) 494 return; 495 496 if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { 497 zb->zb_object = 0; 498 zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; 499 bc->bc_blkptr = *bp; 500 (void) traverse_callback(th, zseg, bc); 501 } 502 } 503 504 /* ARGSUSED */ 505 static void 506 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 507 { 508 traverse_handle_t *th = arg; 509 traverse_blk_cache_t *bc = &th->th_zil_cache; 510 zbookmark_t *zb = &bc->bc_bookmark; 511 zseg_t *zseg = list_head(&th->th_seglist); 512 513 if (lrc->lrc_txtype == TX_WRITE) { 514 lr_write_t *lr = (lr_write_t *)lrc; 515 blkptr_t *bp = &lr->lr_blkptr; 516 517 if (bp->blk_birth <= zseg->seg_mintxg) 518 return; 519 520 if (claim_txg != 0 && bp->blk_birth >= claim_txg) { 521 zb->zb_object = lr->lr_foid; 522 zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); 523 bc->bc_blkptr = *bp; 524 (void) traverse_callback(th, zseg, bc); 525 } 526 } 527 } 528 529 static void 530 traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc) 531 { 532 spa_t *spa = th->th_spa; 533 dsl_pool_t *dp = spa_get_dsl(spa); 534 objset_phys_t *osphys = bc->bc_data; 535 zil_header_t *zh = &osphys->os_zil_header; 536 uint64_t claim_txg = zh->zh_claim_txg; 537 zilog_t *zilog; 538 539 ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); 540 ASSERT(bc->bc_bookmark.zb_level == -1); 541 542 /* 543 * We only want to visit blocks that have been claimed but not yet 544 * replayed (or, in read-only mode, blocks that *would* be claimed). 545 */ 546 if (claim_txg == 0 && (spa_mode & FWRITE)) 547 return; 548 549 th->th_zil_cache.bc_bookmark = bc->bc_bookmark; 550 551 zilog = zil_alloc(dp->dp_meta_objset, zh); 552 553 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, 554 claim_txg); 555 556 zil_free(zilog); 557 } 558 559 static int 560 traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) 561 { 562 zbookmark_t *zb = &zseg->seg_start; 563 traverse_blk_cache_t *bc; 564 dnode_phys_t *dn, *dn_tmp; 565 int worklimit = 100; 566 int rc; 567 568 dprintf("<%llu, %llu, %d, %llx>\n", 569 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); 570 571 bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; 572 dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; 573 574 SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); 575 576 rc = traverse_read(th, bc, mosbp, dn); 577 578 if (rc) /* If we get ERESTART, we've got nowhere left to go */ 579 return (rc == ERESTART ? EINTR : rc); 580 581 ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); 582 583 if (zb->zb_objset != 0) { 584 uint64_t objset = zb->zb_objset; 585 dsl_dataset_phys_t *dsp; 586 587 rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, 588 DMU_OT_DSL_DATASET, ZB_MOS_CACHE); 589 590 if (objset != zb->zb_objset) 591 rc = advance_objset(zseg, objset, th->th_advance); 592 593 if (rc != 0) 594 return (rc); 595 596 dsp = DN_BONUS(dn_tmp); 597 598 bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; 599 dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; 600 601 SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); 602 603 /* 604 * If we're traversing an open snapshot, we know that it 605 * can't be deleted (because it's open) and it can't change 606 * (because it's a snapshot). Therefore, once we've gotten 607 * from the uberblock down to the snapshot's objset_phys_t, 608 * we no longer need to synchronize with spa_sync(); we're 609 * traversing a completely static block tree from here on. 610 */ 611 if (th->th_advance & ADVANCE_NOLOCK) { 612 ASSERT(th->th_locked); 613 rw_exit(spa_traverse_rwlock(th->th_spa)); 614 th->th_locked = 0; 615 } 616 617 rc = traverse_read(th, bc, &dsp->ds_bp, dn); 618 619 if (rc != 0) { 620 if (rc == ERESTART) 621 rc = advance_objset(zseg, zb->zb_objset + 1, 622 th->th_advance); 623 return (rc); 624 } 625 626 if (th->th_advance & ADVANCE_PRUNE) 627 zseg->seg_mintxg = 628 MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); 629 } 630 631 if (zb->zb_level == -1) { 632 ASSERT(zb->zb_object == 0); 633 ASSERT(zb->zb_blkid == 0); 634 ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); 635 636 if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { 637 rc = traverse_callback(th, zseg, bc); 638 if (rc) { 639 ASSERT(rc == EINTR); 640 return (rc); 641 } 642 if ((th->th_advance & ADVANCE_ZIL) && 643 zb->zb_objset != 0) 644 traverse_zil(th, bc); 645 } 646 647 return (advance_from_osphys(zseg, th->th_advance)); 648 } 649 650 if (zb->zb_object != 0) { 651 uint64_t object = zb->zb_object; 652 653 rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, 654 zseg->seg_mintxg, -1, ZB_MDN_CACHE); 655 656 if (object != zb->zb_object) 657 rc = advance_object(zseg, object, th->th_advance); 658 659 if (rc != 0) 660 return (rc); 661 662 dn = dn_tmp; 663 } 664 665 if (zb->zb_level == ZB_MAXLEVEL) 666 zb->zb_level = dn->dn_nlevels - 1; 667 668 for (;;) { 669 rc = find_block(th, zseg, dn, ZB_DN_CACHE); 670 671 if (rc == EAGAIN || rc == EINTR || rc == ERANGE) 672 break; 673 674 if (rc == 0) { 675 bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; 676 ASSERT(bc->bc_dnode == dn); 677 ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); 678 rc = traverse_callback(th, zseg, bc); 679 if (rc) { 680 ASSERT(rc == EINTR); 681 return (rc); 682 } 683 if (BP_IS_HOLE(&bc->bc_blkptr)) { 684 ASSERT(th->th_advance & ADVANCE_HOLES); 685 rc = ENOTBLK; 686 } 687 } 688 689 rc = advance_block(zseg, dn, rc, th->th_advance); 690 691 if (rc == ERANGE) 692 break; 693 694 /* 695 * Give spa_sync() a chance to run. 696 */ 697 if (th->th_locked && spa_traverse_wanted(th->th_spa)) { 698 th->th_syncs++; 699 return (EAGAIN); 700 } 701 702 if (--worklimit == 0) 703 return (EAGAIN); 704 } 705 706 if (rc == ERANGE) 707 rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); 708 709 return (rc); 710 } 711 712 /* 713 * It is the caller's responsibility to ensure that the dsl_dataset_t 714 * doesn't go away during traversal. 715 */ 716 int 717 traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, 718 blkptr_cb_t func, void *arg) 719 { 720 spa_t *spa = ds->ds_dir->dd_pool->dp_spa; 721 traverse_handle_t *th; 722 int err; 723 724 th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); 725 726 traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); 727 728 while ((err = traverse_more(th)) == EAGAIN) 729 continue; 730 731 traverse_fini(th); 732 return (err); 733 } 734 735 int 736 traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg) 737 { 738 spa_t *spa = dmu_objset_spa(os); 739 traverse_handle_t *th; 740 int err; 741 742 th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL); 743 744 traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ); 745 746 while ((err = traverse_more(th)) == EAGAIN) 747 continue; 748 749 traverse_fini(th); 750 return (err); 751 } 752 753 int 754 traverse_more(traverse_handle_t *th) 755 { 756 zseg_t *zseg = list_head(&th->th_seglist); 757 uint64_t save_txg; /* XXX won't be necessary with real itinerary */ 758 krwlock_t *rw = spa_traverse_rwlock(th->th_spa); 759 blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); 760 int rc; 761 762 if (zseg == NULL) 763 return (0); 764 765 th->th_restarts++; 766 767 save_txg = zseg->seg_mintxg; 768 769 rw_enter(rw, RW_READER); 770 th->th_locked = 1; 771 772 rc = traverse_segment(th, zseg, mosbp); 773 ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); 774 775 if (th->th_locked) 776 rw_exit(rw); 777 th->th_locked = 0; 778 779 zseg->seg_mintxg = save_txg; 780 781 if (rc == ERANGE) { 782 list_remove(&th->th_seglist, zseg); 783 kmem_free(zseg, sizeof (*zseg)); 784 return (EAGAIN); 785 } 786 787 return (rc); 788 } 789 790 /* 791 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves 792 * are not included. The blocks covered by this segment will all have 793 * mintxg < birth < maxtxg. 794 */ 795 static void 796 traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, 797 uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, 798 uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) 799 { 800 zseg_t *zseg; 801 802 zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); 803 804 zseg->seg_mintxg = mintxg; 805 zseg->seg_maxtxg = maxtxg; 806 807 zseg->seg_start.zb_objset = sobjset; 808 zseg->seg_start.zb_object = sobject; 809 zseg->seg_start.zb_level = slevel; 810 zseg->seg_start.zb_blkid = sblkid; 811 812 zseg->seg_end.zb_objset = eobjset; 813 zseg->seg_end.zb_object = eobject; 814 zseg->seg_end.zb_level = elevel; 815 zseg->seg_end.zb_blkid = eblkid; 816 817 list_insert_tail(&th->th_seglist, zseg); 818 } 819 820 void 821 traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, 822 uint64_t objset, uint64_t object) 823 { 824 if (th->th_advance & ADVANCE_PRE) 825 traverse_add_segment(th, mintxg, maxtxg, 826 objset, object, ZB_MAXLEVEL, 0, 827 objset, object, 0, ZB_MAXBLKID); 828 else 829 traverse_add_segment(th, mintxg, maxtxg, 830 objset, object, 0, 0, 831 objset, object, 0, ZB_MAXBLKID); 832 } 833 834 void 835 traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, 836 uint64_t objset) 837 { 838 if (th->th_advance & ADVANCE_PRE) 839 traverse_add_segment(th, mintxg, maxtxg, 840 objset, 0, -1, 0, 841 objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); 842 else 843 traverse_add_segment(th, mintxg, maxtxg, 844 objset, 1, 0, 0, 845 objset, 0, -1, 0); 846 } 847 848 void 849 traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) 850 { 851 if (th->th_advance & ADVANCE_PRE) 852 traverse_add_segment(th, mintxg, maxtxg, 853 0, 0, -1, 0, 854 ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); 855 else 856 traverse_add_segment(th, mintxg, maxtxg, 857 1, 1, 0, 0, 858 0, 0, -1, 0); 859 } 860 861 traverse_handle_t * 862 traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, 863 int zio_flags) 864 { 865 traverse_handle_t *th; 866 int d, l; 867 868 th = kmem_zalloc(sizeof (*th), KM_SLEEP); 869 870 th->th_spa = spa; 871 th->th_func = func; 872 th->th_arg = arg; 873 th->th_advance = advance; 874 th->th_lastcb.zb_level = ZB_NO_LEVEL; 875 th->th_noread.zb_level = ZB_NO_LEVEL; 876 th->th_zio_flags = zio_flags; 877 878 list_create(&th->th_seglist, sizeof (zseg_t), 879 offsetof(zseg_t, seg_node)); 880 881 for (d = 0; d < ZB_DEPTH; d++) { 882 for (l = 0; l < ZB_MAXLEVEL; l++) { 883 if ((advance & ADVANCE_DATA) || 884 l != 0 || d != ZB_DN_CACHE) 885 th->th_cache[d][l].bc_data = 886 zio_buf_alloc(SPA_MAXBLOCKSIZE); 887 } 888 } 889 890 return (th); 891 } 892 893 void 894 traverse_fini(traverse_handle_t *th) 895 { 896 int d, l; 897 zseg_t *zseg; 898 899 for (d = 0; d < ZB_DEPTH; d++) 900 for (l = 0; l < ZB_MAXLEVEL; l++) 901 if (th->th_cache[d][l].bc_data != NULL) 902 zio_buf_free(th->th_cache[d][l].bc_data, 903 SPA_MAXBLOCKSIZE); 904 905 while ((zseg = list_head(&th->th_seglist)) != NULL) { 906 list_remove(&th->th_seglist, zseg); 907 kmem_free(zseg, sizeof (*zseg)); 908 } 909 910 list_destroy(&th->th_seglist); 911 912 dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", 913 th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, 914 th->th_syncs, th->th_restarts); 915 916 kmem_free(th, sizeof (*th)); 917 } 918