/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #define BP_SPAN_SHIFT(level, width) ((level) * (width)) #define BP_EQUAL(b1, b2) \ (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ (b1)->blk_birth == (b2)->blk_birth) /* * Compare two bookmarks. * * For ADVANCE_PRE, the visitation order is: * * objset 0, 1, 2, ..., ZB_MAXOBJSET. * object 0, 1, 2, ..., ZB_MAXOBJECT. * blkoff 0, 1, 2, ... * level ZB_MAXLEVEL, ..., 2, 1, 0. * * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid * ordering vector is: * * < objset, object, blkoff, -level > * * For ADVANCE_POST, the starting offsets aren't sequential but ending * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. * The visitation order is: * * objset 1, 2, ..., ZB_MAXOBJSET, 0. * object 1, 2, ..., ZB_MAXOBJECT, 0. * blkoff 1, 2, ... * level 0, 1, 2, ..., ZB_MAXLEVEL. * * and thus a valid ordering vector is: * * < objset - 1, object - 1, blkoff, level > * * Both orderings can be expressed as: * * < objset + bias, object + bias, blkoff, level ^ bias > * * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). * * Special case: an objset's osphys is represented as level -1 of object 0. * It is always either the very first or very last block we visit in an objset. * Therefore, if either bookmark's level is -1, level alone determines order. */ static int compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, int advance) { int bias = (advance & ADVANCE_PRE) ? 0 : -1; uint64_t sblkoff, eblkoff; int slevel, elevel, wshift; if (szb->zb_objset + bias < ezb->zb_objset + bias) return (-1); if (szb->zb_objset + bias > ezb->zb_objset + bias) return (1); slevel = szb->zb_level; elevel = ezb->zb_level; if ((slevel | elevel) < 0) return ((slevel ^ bias) - (elevel ^ bias)); if (szb->zb_object + bias < ezb->zb_object + bias) return (-1); if (szb->zb_object + bias > ezb->zb_object + bias) return (1); if (dnp == NULL) return (0); wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); if (sblkoff < eblkoff) return (-1); if (sblkoff > eblkoff) return (1); return ((elevel ^ bias) - (slevel ^ bias)); } #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define SET_BOOKMARK_LB(zb, level, blkid) \ { \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } static int advance_objset(zseg_t *zseg, uint64_t objset, int advance) { zbookmark_t *zb = &zseg->seg_start; if (advance & ADVANCE_PRE) { if (objset >= ZB_MAXOBJSET) return (ERANGE); SET_BOOKMARK(zb, objset, 0, -1, 0); } else { if (objset >= ZB_MAXOBJSET) objset = 0; SET_BOOKMARK(zb, objset, 1, 0, 0); } if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) return (ERANGE); return (EAGAIN); } static int advance_object(zseg_t *zseg, uint64_t object, int advance) { zbookmark_t *zb = &zseg->seg_start; if (advance & ADVANCE_PRE) { if (object >= ZB_MAXOBJECT) { SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); } else { SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); } } else { if (zb->zb_object == 0) { SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); } else { if (object >= ZB_MAXOBJECT) object = 0; SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); } } if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) return (ERANGE); return (EAGAIN); } static int advance_from_osphys(zseg_t *zseg, int advance) { zbookmark_t *zb = &zseg->seg_start; ASSERT(zb->zb_object == 0); ASSERT(zb->zb_level == -1); ASSERT(zb->zb_blkid == 0); if (advance & ADVANCE_PRE) { SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); } else { if (zb->zb_objset == 0) return (ERANGE); SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); } if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) return (ERANGE); return (EAGAIN); } static int advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) { zbookmark_t *zb = &zseg->seg_start; int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; int maxlevel = dnp->dn_nlevels - 1; int level = zb->zb_level; uint64_t blkid = zb->zb_blkid; if (advance & ADVANCE_PRE) { if (level > 0 && rc == 0) { level--; blkid <<= wshift; } else { blkid++; if ((blkid << BP_SPAN_SHIFT(level, wshift)) > dnp->dn_maxblkid) return (ERANGE); while (level < maxlevel) { if (P2PHASE(blkid, 1ULL << wshift)) break; blkid >>= wshift; level++; } } } else { if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); level = 0; } else { blkid >>= wshift; level++; } while ((blkid << BP_SPAN_SHIFT(level, wshift)) > dnp->dn_maxblkid) { if (level == maxlevel) return (ERANGE); blkid >>= wshift; level++; } } SET_BOOKMARK_LB(zb, level, blkid); if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) return (ERANGE); return (EAGAIN); } static int traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) { /* * Before we issue the callback, prune against maxtxg. * * We prune against mintxg before we get here because it's a big win. * If a given block was born in txg 37, then we know that the entire * subtree below that block must have been born in txg 37 or earlier. * We can therefore lop off huge branches of the tree as we go. * * There's no corresponding optimization for maxtxg because knowing * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's * children. In fact, the copy-on-write design of ZFS ensures that * top-level blocks will pretty much always be new. * * Therefore, in the name of simplicity we don't prune against * maxtxg until the last possible moment -- that being right now. */ if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) return (0); if (bc->bc_errno == 0) { zbookmark_t *zb = &bc->bc_bookmark; zbookmark_t *szb = &zseg->seg_start; zbookmark_t *ezb = &zseg->seg_end; zbookmark_t *lzb = &th->th_lastcb; dnode_phys_t *dnp = bc->bc_dnode; /* * Debugging: verify that the order we visit things * agrees with the order defined by compare_bookmark(). */ ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || lzb->zb_level == ZB_NO_LEVEL); *lzb = *zb; } th->th_callbacks++; return (th->th_func(bc, th->th_spa, th->th_arg)); } static int traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, dnode_phys_t *dnp) { zbookmark_t *zb = &bc->bc_bookmark; int error; th->th_hits++; bc->bc_dnode = dnp; bc->bc_errno = 0; if (BP_EQUAL(&bc->bc_blkptr, bp)) return (0); bc->bc_blkptr = *bp; if (bc->bc_data == NULL) return (0); if (BP_IS_HOLE(bp)) { ASSERT(th->th_advance & ADVANCE_HOLES); return (0); } if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { error = EIO; } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { error = 0; th->th_arc_hits++; } else { error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, th->th_zio_flags | ZIO_FLAG_DONT_CACHE)); if (BP_SHOULD_BYTESWAP(bp) && error == 0) (zb->zb_level > 0 ? byteswap_uint64_array : dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, BP_GET_LSIZE(bp)); th->th_reads++; } if (error) { bc->bc_errno = error; error = traverse_callback(th, NULL, bc); ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); bc->bc_blkptr.blk_birth = -1ULL; } dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", bc - &th->th_cache[0][0], error, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); return (error); } static int find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) { zbookmark_t *zb = &zseg->seg_start; traverse_blk_cache_t *bc; blkptr_t *bp = dnp->dn_blkptr; int i, first, level; int nbp = dnp->dn_nblkptr; int minlevel = zb->zb_level; int maxlevel = dnp->dn_nlevels - 1; int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); uint64_t blkid = zb->zb_blkid >> bp_shift; int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; int rc; if (minlevel > maxlevel || blkid >= nbp) return (ERANGE); for (level = maxlevel; level >= minlevel; level--) { first = P2PHASE(blkid, 1ULL << wshift); for (i = first; i < nbp; i++) if (bp[i].blk_birth > zseg->seg_mintxg || BP_IS_HOLE(&bp[i]) && do_holes) break; if (i != first) { i--; SET_BOOKMARK_LB(zb, level, blkid + (i - first)); return (ENOTBLK); } bc = &th->th_cache[depth][level]; SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, level, blkid); if (rc = traverse_read(th, bc, bp + i, dnp)) { if (rc != EAGAIN) { SET_BOOKMARK_LB(zb, level, blkid); } return (rc); } if (BP_IS_HOLE(&bp[i])) { SET_BOOKMARK_LB(zb, level, blkid); th->th_lastcb.zb_level = ZB_NO_LEVEL; return (0); } nbp = 1 << wshift; bp = bc->bc_data; bp_shift -= wshift; blkid = zb->zb_blkid >> bp_shift; } return (0); } static int get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) { zseg_t zseg; zbookmark_t *zb = &zseg.seg_start; uint64_t object = *objectp; int i, rc; SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); zseg.seg_mintxg = txg; zseg.seg_maxtxg = -1ULL; for (;;) { rc = find_block(th, &zseg, mdn, depth); if (rc == EAGAIN || rc == EINTR || rc == ERANGE) break; if (rc == 0 && zb->zb_level == 0) { dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; for (i = 0; i < DNODES_PER_BLOCK; i++) { object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; if (object >= *objectp && dnp[i].dn_type != DMU_OT_NONE && (type == -1 || dnp[i].dn_type == type)) { *objectp = object; *dnpp = &dnp[i]; return (0); } } } rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); if (rc == ERANGE) break; } if (rc == ERANGE) *objectp = ZB_MAXOBJECT; return (rc); } static int traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) { zbookmark_t *zb = &zseg->seg_start; traverse_blk_cache_t *bc; dnode_phys_t *dn, *dn_tmp; int worklimit = 1000; int rc; dprintf("<%llu, %llu, %d, %llx>\n", zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); rc = traverse_read(th, bc, mosbp, dn); if (rc) /* If we get ERESTART, we've got nowhere left to go */ return (rc == ERESTART ? EINTR : rc); ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); if (zb->zb_objset != 0) { uint64_t objset = zb->zb_objset; dsl_dataset_phys_t *dsp; rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, DMU_OT_DSL_OBJSET, ZB_MOS_CACHE); if (objset != zb->zb_objset) rc = advance_objset(zseg, objset, th->th_advance); if (rc != 0) return (rc); dsp = DN_BONUS(dn_tmp); bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); rc = traverse_read(th, bc, &dsp->ds_bp, dn); if (rc != 0) { if (rc == ERESTART) rc = advance_objset(zseg, zb->zb_objset + 1, th->th_advance); return (rc); } if (th->th_advance & ADVANCE_PRUNE) zseg->seg_mintxg = MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); } if (zb->zb_level == -1) { ASSERT(zb->zb_object == 0); if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { rc = traverse_callback(th, zseg, bc); if (rc) { ASSERT(rc == EINTR); return (rc); } } return (advance_from_osphys(zseg, th->th_advance)); } if (zb->zb_object != 0) { uint64_t object = zb->zb_object; rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, zseg->seg_mintxg, -1, ZB_MDN_CACHE); if (object != zb->zb_object) rc = advance_object(zseg, object, th->th_advance); if (rc != 0) return (rc); dn = dn_tmp; } if (zb->zb_level == ZB_MAXLEVEL) zb->zb_level = dn->dn_nlevels - 1; for (;;) { rc = find_block(th, zseg, dn, ZB_DN_CACHE); if (rc == EAGAIN || rc == EINTR || rc == ERANGE) break; if (rc == 0) { bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; ASSERT(bc->bc_dnode == dn); ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); rc = traverse_callback(th, zseg, bc); if (rc) { ASSERT(rc == EINTR); return (rc); } if (BP_IS_HOLE(&bc->bc_blkptr)) { ASSERT(th->th_advance & ADVANCE_HOLES); rc = ENOTBLK; } } rc = advance_block(zseg, dn, rc, th->th_advance); if (rc == ERANGE) break; /* * Give spa_sync() a chance to run. */ if (spa_traverse_wanted(th->th_spa)) { th->th_syncs++; return (EAGAIN); } if (--worklimit == 0) return (EAGAIN); } if (rc == ERANGE) rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); return (rc); } /* * It is the caller's responsibility to ensure that the dsl_dataset_t * doesn't go away during traversal. */ int traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, blkptr_cb_t func, void *arg) { spa_t *spa = ds->ds_dir->dd_pool->dp_spa; traverse_handle_t *th; int err; th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); while ((err = traverse_more(th)) == EAGAIN) continue; traverse_fini(th); return (err); } int traverse_more(traverse_handle_t *th) { zseg_t *zseg = list_head(&th->th_seglist); uint64_t save_txg; /* XXX won't be necessary with real itinerary */ krwlock_t *rw = spa_traverse_rwlock(th->th_spa); blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); int rc; if (zseg == NULL) return (0); th->th_restarts++; save_txg = zseg->seg_mintxg; if (!(th->th_advance & ADVANCE_NOLOCK)) rw_enter(rw, RW_READER); rc = traverse_segment(th, zseg, mosbp); ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); if (!(th->th_advance & ADVANCE_NOLOCK)) rw_exit(rw); zseg->seg_mintxg = save_txg; if (rc == ERANGE) { list_remove(&th->th_seglist, zseg); kmem_free(zseg, sizeof (*zseg)); return (EAGAIN); } return (rc); } /* * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves * are not included. The blocks covered by this segment will all have * mintxg < birth < maxtxg. */ static void traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) { zseg_t *zseg; zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); zseg->seg_mintxg = mintxg; zseg->seg_maxtxg = maxtxg; zseg->seg_start.zb_objset = sobjset; zseg->seg_start.zb_object = sobject; zseg->seg_start.zb_level = slevel; zseg->seg_start.zb_blkid = sblkid; zseg->seg_end.zb_objset = eobjset; zseg->seg_end.zb_object = eobject; zseg->seg_end.zb_level = elevel; zseg->seg_end.zb_blkid = eblkid; list_insert_tail(&th->th_seglist, zseg); } void traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object) { if (th->th_advance & ADVANCE_PRE) traverse_add_segment(th, mintxg, maxtxg, objset, object, ZB_MAXLEVEL, 0, objset, object, 0, ZB_MAXBLKID); else traverse_add_segment(th, mintxg, maxtxg, objset, object, 0, 0, objset, object, 0, ZB_MAXBLKID); } void traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, uint64_t objset) { if (th->th_advance & ADVANCE_PRE) traverse_add_segment(th, mintxg, maxtxg, objset, 0, -1, 0, objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); else traverse_add_segment(th, mintxg, maxtxg, objset, 1, 0, 0, objset, 0, -1, 0); } void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) { if (th->th_advance & ADVANCE_PRE) traverse_add_segment(th, mintxg, maxtxg, 0, 0, -1, 0, ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); else traverse_add_segment(th, mintxg, maxtxg, 1, 1, 0, 0, 0, 0, -1, 0); } traverse_handle_t * traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, int zio_flags) { traverse_handle_t *th; int d, l; th = kmem_zalloc(sizeof (*th), KM_SLEEP); th->th_spa = spa; th->th_func = func; th->th_arg = arg; th->th_advance = advance; th->th_lastcb.zb_level = ZB_NO_LEVEL; th->th_noread.zb_level = ZB_NO_LEVEL; th->th_zio_flags = zio_flags; list_create(&th->th_seglist, sizeof (zseg_t), offsetof(zseg_t, seg_node)); for (d = 0; d < ZB_DEPTH; d++) { for (l = 0; l < ZB_MAXLEVEL; l++) { if ((advance & ADVANCE_DATA) || l != 0 || d != ZB_DN_CACHE) th->th_cache[d][l].bc_data = zio_buf_alloc(SPA_MAXBLOCKSIZE); } } return (th); } void traverse_fini(traverse_handle_t *th) { int d, l; zseg_t *zseg; for (d = 0; d < ZB_DEPTH; d++) for (l = 0; l < ZB_MAXLEVEL; l++) if (th->th_cache[d][l].bc_data != NULL) zio_buf_free(th->th_cache[d][l].bc_data, SPA_MAXBLOCKSIZE); while ((zseg = list_head(&th->th_seglist)) != NULL) { list_remove(&th->th_seglist, zseg); kmem_free(zseg, sizeof (*zseg)); } list_destroy(&th->th_seglist); dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, th->th_syncs, th->th_restarts); kmem_free(th, sizeof (*th)); }