xref: /illumos-gate/usr/src/uts/common/fs/zfs/dmu_traverse.c (revision 240e56fe6d839850e4681b02d57a1bb22e08eb86)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/dmu_objset.h>
30 #include <sys/dmu_traverse.h>
31 #include <sys/dsl_dataset.h>
32 #include <sys/dsl_dir.h>
33 #include <sys/dsl_pool.h>
34 #include <sys/dnode.h>
35 #include <sys/spa.h>
36 #include <sys/zio.h>
37 #include <sys/dmu_impl.h>
38 
39 #define	BP_SPAN_SHIFT(level, width)	((level) * (width))
40 
41 #define	BP_EQUAL(b1, b2)				\
42 	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&	\
43 	(b1)->blk_birth == (b2)->blk_birth)
44 
45 /*
46  * Compare two bookmarks.
47  *
48  * For ADVANCE_PRE, the visitation order is:
49  *
50  *	objset 0, 1, 2, ..., ZB_MAXOBJSET.
51  *	object 0, 1, 2, ..., ZB_MAXOBJECT.
52  *	blkoff 0, 1, 2, ...
53  *	level ZB_MAXLEVEL, ..., 2, 1, 0.
54  *
55  * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
56  * ordering vector is:
57  *
58  *	< objset, object, blkoff, -level >
59  *
60  * For ADVANCE_POST, the starting offsets aren't sequential but ending
61  * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
62  * The visitation order is:
63  *
64  *	objset 1, 2, ..., ZB_MAXOBJSET, 0.
65  *	object 1, 2, ..., ZB_MAXOBJECT, 0.
66  *	blkoff 1, 2, ...
67  *	level 0, 1, 2, ..., ZB_MAXLEVEL.
68  *
69  * and thus a valid ordering vector is:
70  *
71  *	< objset - 1, object - 1, blkoff, level >
72  *
73  * Both orderings can be expressed as:
74  *
75  *	< objset + bias, object + bias, blkoff, level ^ bias >
76  *
77  * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
78  * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
79  *
80  * Special case: an objset's osphys is represented as level -1 of object 0.
81  * It is always either the very first or very last block we visit in an objset.
82  * Therefore, if either bookmark's level is -1, level alone determines order.
83  */
84 static int
85 compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
86     int advance)
87 {
88 	int bias = (advance & ADVANCE_PRE) ? 0 : -1;
89 	uint64_t sblkoff, eblkoff;
90 	int slevel, elevel, wshift;
91 
92 	if (szb->zb_objset + bias < ezb->zb_objset + bias)
93 		return (-1);
94 
95 	if (szb->zb_objset + bias > ezb->zb_objset + bias)
96 		return (1);
97 
98 	slevel = szb->zb_level;
99 	elevel = ezb->zb_level;
100 
101 	if ((slevel | elevel) < 0)
102 		return ((slevel ^ bias) - (elevel ^ bias));
103 
104 	if (szb->zb_object + bias < ezb->zb_object + bias)
105 		return (-1);
106 
107 	if (szb->zb_object + bias > ezb->zb_object + bias)
108 		return (1);
109 
110 	if (dnp == NULL)
111 		return (0);
112 
113 	wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
114 
115 	sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
116 	eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
117 
118 	if (sblkoff < eblkoff)
119 		return (-1);
120 
121 	if (sblkoff > eblkoff)
122 		return (1);
123 
124 	return ((elevel ^ bias) - (slevel ^ bias));
125 }
126 
127 #define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
128 {							\
129 	(zb)->zb_objset = objset;			\
130 	(zb)->zb_object = object;			\
131 	(zb)->zb_level = level;				\
132 	(zb)->zb_blkid = blkid;				\
133 }
134 
135 #define	SET_BOOKMARK_LB(zb, level, blkid)		\
136 {							\
137 	(zb)->zb_level = level;				\
138 	(zb)->zb_blkid = blkid;				\
139 }
140 
141 static int
142 advance_objset(zseg_t *zseg, uint64_t objset, int advance)
143 {
144 	zbookmark_t *zb = &zseg->seg_start;
145 
146 	if (advance & ADVANCE_PRE) {
147 		if (objset >= ZB_MAXOBJSET)
148 			return (ERANGE);
149 		SET_BOOKMARK(zb, objset, 0, -1, 0);
150 	} else {
151 		if (objset >= ZB_MAXOBJSET)
152 			objset = 0;
153 		SET_BOOKMARK(zb, objset, 1, 0, 0);
154 	}
155 
156 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
157 		return (ERANGE);
158 
159 	return (EAGAIN);
160 }
161 
162 static int
163 advance_object(zseg_t *zseg, uint64_t object, int advance)
164 {
165 	zbookmark_t *zb = &zseg->seg_start;
166 
167 	if (advance & ADVANCE_PRE) {
168 		if (object >= ZB_MAXOBJECT) {
169 			SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
170 		} else {
171 			SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
172 		}
173 	} else {
174 		if (zb->zb_object == 0) {
175 			SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
176 		} else {
177 			if (object >= ZB_MAXOBJECT)
178 				object = 0;
179 			SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
180 		}
181 	}
182 
183 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
184 		return (ERANGE);
185 
186 	return (EAGAIN);
187 }
188 
189 static int
190 advance_from_osphys(zseg_t *zseg, int advance)
191 {
192 	zbookmark_t *zb = &zseg->seg_start;
193 
194 	ASSERT(zb->zb_object == 0);
195 	ASSERT(zb->zb_level == -1);
196 	ASSERT(zb->zb_blkid == 0);
197 
198 	if (advance & ADVANCE_PRE) {
199 		SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
200 	} else {
201 		if (zb->zb_objset == 0)
202 			return (ERANGE);
203 		SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
204 	}
205 
206 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
207 		return (ERANGE);
208 
209 	return (EAGAIN);
210 }
211 
212 static int
213 advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
214 {
215 	zbookmark_t *zb = &zseg->seg_start;
216 	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
217 	int maxlevel = dnp->dn_nlevels - 1;
218 	int level = zb->zb_level;
219 	uint64_t blkid = zb->zb_blkid;
220 
221 	if (advance & ADVANCE_PRE) {
222 		if (level > 0 && rc == 0) {
223 			level--;
224 			blkid <<= wshift;
225 		} else {
226 			blkid++;
227 
228 			if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
229 			    dnp->dn_maxblkid)
230 				return (ERANGE);
231 
232 			while (level < maxlevel) {
233 				if (P2PHASE(blkid, 1ULL << wshift))
234 					break;
235 				blkid >>= wshift;
236 				level++;
237 			}
238 		}
239 	} else {
240 		if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
241 			blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
242 			level = 0;
243 		} else {
244 			blkid >>= wshift;
245 			level++;
246 		}
247 
248 		while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
249 		    dnp->dn_maxblkid) {
250 			if (level == maxlevel)
251 				return (ERANGE);
252 			blkid >>= wshift;
253 			level++;
254 		}
255 	}
256 	SET_BOOKMARK_LB(zb, level, blkid);
257 
258 	if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
259 		return (ERANGE);
260 
261 	return (EAGAIN);
262 }
263 
264 static int
265 traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
266 {
267 	/*
268 	 * Before we issue the callback, prune against maxtxg.
269 	 *
270 	 * We prune against mintxg before we get here because it's a big win.
271 	 * If a given block was born in txg 37, then we know that the entire
272 	 * subtree below that block must have been born in txg 37 or earlier.
273 	 * We can therefore lop off huge branches of the tree as we go.
274 	 *
275 	 * There's no corresponding optimization for maxtxg because knowing
276 	 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
277 	 * children.  In fact, the copy-on-write design of ZFS ensures that
278 	 * top-level blocks will pretty much always be new.
279 	 *
280 	 * Therefore, in the name of simplicity we don't prune against
281 	 * maxtxg until the last possible moment -- that being right now.
282 	 */
283 	if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
284 		return (0);
285 
286 	/*
287 	 * Debugging: verify that the order we visit things agrees with the
288 	 * order defined by compare_bookmark().  We don't check this for
289 	 * log blocks because there's no defined ordering for them; they're
290 	 * always visited (or not) as part of visiting the objset_phys_t.
291 	 */
292 	if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
293 		zbookmark_t *zb = &bc->bc_bookmark;
294 		zbookmark_t *szb = &zseg->seg_start;
295 		zbookmark_t *ezb = &zseg->seg_end;
296 		zbookmark_t *lzb = &th->th_lastcb;
297 		dnode_phys_t *dnp = bc->bc_dnode;
298 
299 		ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
300 		ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
301 		ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
302 		    lzb->zb_level == ZB_NO_LEVEL);
303 		*lzb = *zb;
304 	}
305 
306 	th->th_callbacks++;
307 	return (th->th_func(bc, th->th_spa, th->th_arg));
308 }
309 
310 static int
311 traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
312 	dnode_phys_t *dnp)
313 {
314 	zbookmark_t *zb = &bc->bc_bookmark;
315 	int error;
316 
317 	th->th_hits++;
318 
319 	bc->bc_dnode = dnp;
320 	bc->bc_errno = 0;
321 
322 	if (BP_EQUAL(&bc->bc_blkptr, bp))
323 		return (0);
324 
325 	bc->bc_blkptr = *bp;
326 
327 	if (bc->bc_data == NULL)
328 		return (0);
329 
330 	if (BP_IS_HOLE(bp)) {
331 		ASSERT(th->th_advance & ADVANCE_HOLES);
332 		return (0);
333 	}
334 
335 	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
336 		error = EIO;
337 	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
338 		error = 0;
339 		th->th_arc_hits++;
340 	} else {
341 		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
342 		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
343 		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
344 
345 		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
346 			(zb->zb_level > 0 ? byteswap_uint64_array :
347 			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
348 			    BP_GET_LSIZE(bp));
349 		th->th_reads++;
350 	}
351 
352 	if (error) {
353 		bc->bc_errno = error;
354 		error = traverse_callback(th, NULL, bc);
355 		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
356 		bc->bc_blkptr.blk_birth = -1ULL;
357 	}
358 
359 	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
360 	    bc - &th->th_cache[0][0], error,
361 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
362 
363 	return (error);
364 }
365 
366 static int
367 find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
368 {
369 	zbookmark_t *zb = &zseg->seg_start;
370 	traverse_blk_cache_t *bc;
371 	blkptr_t *bp = dnp->dn_blkptr;
372 	int i, first, level;
373 	int nbp = dnp->dn_nblkptr;
374 	int minlevel = zb->zb_level;
375 	int maxlevel = dnp->dn_nlevels - 1;
376 	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
377 	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
378 	uint64_t blkid = zb->zb_blkid >> bp_shift;
379 	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
380 	int rc;
381 
382 	if (minlevel > maxlevel || blkid >= nbp)
383 		return (ERANGE);
384 
385 	for (level = maxlevel; level >= minlevel; level--) {
386 		first = P2PHASE(blkid, 1ULL << wshift);
387 
388 		for (i = first; i < nbp; i++)
389 			if (bp[i].blk_birth > zseg->seg_mintxg ||
390 			    BP_IS_HOLE(&bp[i]) && do_holes)
391 				break;
392 
393 		if (i != first) {
394 			i--;
395 			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
396 			return (ENOTBLK);
397 		}
398 
399 		bc = &th->th_cache[depth][level];
400 
401 		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
402 		    level, blkid);
403 
404 		if (rc = traverse_read(th, bc, bp + i, dnp)) {
405 			if (rc != EAGAIN) {
406 				SET_BOOKMARK_LB(zb, level, blkid);
407 			}
408 			return (rc);
409 		}
410 
411 		if (BP_IS_HOLE(&bp[i])) {
412 			SET_BOOKMARK_LB(zb, level, blkid);
413 			th->th_lastcb.zb_level = ZB_NO_LEVEL;
414 			return (0);
415 		}
416 
417 		nbp = 1 << wshift;
418 		bp = bc->bc_data;
419 		bp_shift -= wshift;
420 		blkid = zb->zb_blkid >> bp_shift;
421 	}
422 
423 	return (0);
424 }
425 
426 static int
427 get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
428     uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
429 {
430 	zseg_t zseg;
431 	zbookmark_t *zb = &zseg.seg_start;
432 	uint64_t object = *objectp;
433 	int i, rc;
434 
435 	SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
436 	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
437 
438 	zseg.seg_mintxg = txg;
439 	zseg.seg_maxtxg = -1ULL;
440 
441 	for (;;) {
442 		rc = find_block(th, &zseg, mdn, depth);
443 
444 		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
445 			break;
446 
447 		if (rc == 0 && zb->zb_level == 0) {
448 			dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
449 			for (i = 0; i < DNODES_PER_BLOCK; i++) {
450 				object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
451 				if (object >= *objectp &&
452 				    dnp[i].dn_type != DMU_OT_NONE &&
453 				    (type == -1 || dnp[i].dn_type == type)) {
454 					*objectp = object;
455 					*dnpp = &dnp[i];
456 					return (0);
457 				}
458 			}
459 		}
460 
461 		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
462 
463 		if (rc == ERANGE)
464 			break;
465 	}
466 
467 	if (rc == ERANGE)
468 		*objectp = ZB_MAXOBJECT;
469 
470 	return (rc);
471 }
472 
473 /* ARGSUSED */
474 static void
475 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
476 {
477 	traverse_handle_t *th = arg;
478 	traverse_blk_cache_t *bc = &th->th_zil_cache;
479 	zbookmark_t *zb = &bc->bc_bookmark;
480 	zseg_t *zseg = list_head(&th->th_seglist);
481 
482 	if (bp->blk_birth <= zseg->seg_mintxg)
483 		return;
484 
485 	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
486 		zb->zb_object = 0;
487 		zb->zb_blkid = bp->blk_cksum.zc_word[3];
488 		bc->bc_blkptr = *bp;
489 		(void) traverse_callback(th, zseg, bc);
490 	}
491 }
492 
493 /* ARGSUSED */
494 static void
495 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
496 {
497 	traverse_handle_t *th = arg;
498 	traverse_blk_cache_t *bc = &th->th_zil_cache;
499 	zbookmark_t *zb = &bc->bc_bookmark;
500 	zseg_t *zseg = list_head(&th->th_seglist);
501 
502 	if (lrc->lrc_txtype == TX_WRITE) {
503 		lr_write_t *lr = (lr_write_t *)lrc;
504 		blkptr_t *bp = &lr->lr_blkptr;
505 
506 		if (bp->blk_birth <= zseg->seg_mintxg)
507 			return;
508 
509 		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
510 			zb->zb_object = lr->lr_foid;
511 			zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
512 			bc->bc_blkptr = *bp;
513 			(void) traverse_callback(th, zseg, bc);
514 		}
515 	}
516 }
517 
518 static void
519 traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
520 {
521 	spa_t *spa = th->th_spa;
522 	dsl_pool_t *dp = spa_get_dsl(spa);
523 	objset_phys_t *osphys = bc->bc_data;
524 	zil_header_t *zh = &osphys->os_zil_header;
525 	uint64_t claim_txg = zh->zh_claim_txg;
526 	zilog_t *zilog;
527 
528 	ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
529 	ASSERT(bc->bc_bookmark.zb_level == -1);
530 
531 	/*
532 	 * We only want to visit blocks that have been claimed but not yet
533 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
534 	 */
535 	if (claim_txg == 0 && (spa_mode & FWRITE))
536 		return;
537 
538 	th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
539 
540 	zilog = zil_alloc(dp->dp_meta_objset, zh);
541 
542 	zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
543 	    claim_txg);
544 
545 	zil_free(zilog);
546 }
547 
548 static int
549 traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
550 {
551 	zbookmark_t *zb = &zseg->seg_start;
552 	traverse_blk_cache_t *bc;
553 	dnode_phys_t *dn, *dn_tmp;
554 	int worklimit = 100;
555 	int rc;
556 
557 	dprintf("<%llu, %llu, %d, %llx>\n",
558 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
559 
560 	bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
561 	dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
562 
563 	SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
564 
565 	rc = traverse_read(th, bc, mosbp, dn);
566 
567 	if (rc)		/* If we get ERESTART, we've got nowhere left to go */
568 		return (rc == ERESTART ? EINTR : rc);
569 
570 	ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
571 
572 	if (zb->zb_objset != 0) {
573 		uint64_t objset = zb->zb_objset;
574 		dsl_dataset_phys_t *dsp;
575 
576 		rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
577 		    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
578 
579 		if (objset != zb->zb_objset)
580 			rc = advance_objset(zseg, objset, th->th_advance);
581 
582 		if (rc != 0)
583 			return (rc);
584 
585 		dsp = DN_BONUS(dn_tmp);
586 
587 		bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
588 		dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
589 
590 		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
591 
592 		/*
593 		 * If we're traversing an open snapshot, we know that it
594 		 * can't be deleted (because it's open) and it can't change
595 		 * (because it's a snapshot).  Therefore, once we've gotten
596 		 * from the uberblock down to the snapshot's objset_phys_t,
597 		 * we no longer need to synchronize with spa_sync(); we're
598 		 * traversing a completely static block tree from here on.
599 		 */
600 		if (th->th_advance & ADVANCE_NOLOCK) {
601 			ASSERT(th->th_locked);
602 			rw_exit(spa_traverse_rwlock(th->th_spa));
603 			th->th_locked = 0;
604 		}
605 
606 		rc = traverse_read(th, bc, &dsp->ds_bp, dn);
607 
608 		if (rc != 0) {
609 			if (rc == ERESTART)
610 				rc = advance_objset(zseg, zb->zb_objset + 1,
611 				    th->th_advance);
612 			return (rc);
613 		}
614 
615 		if (th->th_advance & ADVANCE_PRUNE)
616 			zseg->seg_mintxg =
617 			    MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
618 	}
619 
620 	if (zb->zb_level == -1) {
621 		ASSERT(zb->zb_object == 0);
622 		ASSERT(zb->zb_blkid == 0);
623 		ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
624 
625 		if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
626 			rc = traverse_callback(th, zseg, bc);
627 			if (rc) {
628 				ASSERT(rc == EINTR);
629 				return (rc);
630 			}
631 			if ((th->th_advance & ADVANCE_ZIL) &&
632 			    zb->zb_objset != 0)
633 				traverse_zil(th, bc);
634 		}
635 
636 		return (advance_from_osphys(zseg, th->th_advance));
637 	}
638 
639 	if (zb->zb_object != 0) {
640 		uint64_t object = zb->zb_object;
641 
642 		rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
643 		    zseg->seg_mintxg, -1, ZB_MDN_CACHE);
644 
645 		if (object != zb->zb_object)
646 			rc = advance_object(zseg, object, th->th_advance);
647 
648 		if (rc != 0)
649 			return (rc);
650 
651 		dn = dn_tmp;
652 	}
653 
654 	if (zb->zb_level == ZB_MAXLEVEL)
655 		zb->zb_level = dn->dn_nlevels - 1;
656 
657 	for (;;) {
658 		rc = find_block(th, zseg, dn, ZB_DN_CACHE);
659 
660 		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
661 			break;
662 
663 		if (rc == 0) {
664 			bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
665 			ASSERT(bc->bc_dnode == dn);
666 			ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
667 			rc = traverse_callback(th, zseg, bc);
668 			if (rc) {
669 				ASSERT(rc == EINTR);
670 				return (rc);
671 			}
672 			if (BP_IS_HOLE(&bc->bc_blkptr)) {
673 				ASSERT(th->th_advance & ADVANCE_HOLES);
674 				rc = ENOTBLK;
675 			}
676 		}
677 
678 		rc = advance_block(zseg, dn, rc, th->th_advance);
679 
680 		if (rc == ERANGE)
681 			break;
682 
683 		/*
684 		 * Give spa_sync() a chance to run.
685 		 */
686 		if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
687 			th->th_syncs++;
688 			return (EAGAIN);
689 		}
690 
691 		if (--worklimit == 0)
692 			return (EAGAIN);
693 	}
694 
695 	if (rc == ERANGE)
696 		rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
697 
698 	return (rc);
699 }
700 
701 /*
702  * It is the caller's responsibility to ensure that the dsl_dataset_t
703  * doesn't go away during traversal.
704  */
705 int
706 traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
707     blkptr_cb_t func, void *arg)
708 {
709 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
710 	traverse_handle_t *th;
711 	int err;
712 
713 	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
714 
715 	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
716 
717 	while ((err = traverse_more(th)) == EAGAIN)
718 		continue;
719 
720 	traverse_fini(th);
721 	return (err);
722 }
723 
724 int
725 traverse_more(traverse_handle_t *th)
726 {
727 	zseg_t *zseg = list_head(&th->th_seglist);
728 	uint64_t save_txg;	/* XXX won't be necessary with real itinerary */
729 	krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
730 	blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
731 	int rc;
732 
733 	if (zseg == NULL)
734 		return (0);
735 
736 	th->th_restarts++;
737 
738 	save_txg = zseg->seg_mintxg;
739 
740 	rw_enter(rw, RW_READER);
741 	th->th_locked = 1;
742 
743 	rc = traverse_segment(th, zseg, mosbp);
744 	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
745 
746 	if (th->th_locked)
747 		rw_exit(rw);
748 	th->th_locked = 0;
749 
750 	zseg->seg_mintxg = save_txg;
751 
752 	if (rc == ERANGE) {
753 		list_remove(&th->th_seglist, zseg);
754 		kmem_free(zseg, sizeof (*zseg));
755 		return (EAGAIN);
756 	}
757 
758 	return (rc);
759 }
760 
761 /*
762  * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
763  * are not included.  The blocks covered by this segment will all have
764  * mintxg < birth < maxtxg.
765  */
766 static void
767 traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
768     uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
769     uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
770 {
771 	zseg_t *zseg;
772 
773 	zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
774 
775 	zseg->seg_mintxg = mintxg;
776 	zseg->seg_maxtxg = maxtxg;
777 
778 	zseg->seg_start.zb_objset = sobjset;
779 	zseg->seg_start.zb_object = sobject;
780 	zseg->seg_start.zb_level = slevel;
781 	zseg->seg_start.zb_blkid = sblkid;
782 
783 	zseg->seg_end.zb_objset = eobjset;
784 	zseg->seg_end.zb_object = eobject;
785 	zseg->seg_end.zb_level = elevel;
786 	zseg->seg_end.zb_blkid = eblkid;
787 
788 	list_insert_tail(&th->th_seglist, zseg);
789 }
790 
791 void
792 traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
793     uint64_t objset, uint64_t object)
794 {
795 	if (th->th_advance & ADVANCE_PRE)
796 		traverse_add_segment(th, mintxg, maxtxg,
797 		    objset, object, ZB_MAXLEVEL, 0,
798 		    objset, object, 0, ZB_MAXBLKID);
799 	else
800 		traverse_add_segment(th, mintxg, maxtxg,
801 		    objset, object, 0, 0,
802 		    objset, object, 0, ZB_MAXBLKID);
803 }
804 
805 void
806 traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
807     uint64_t objset)
808 {
809 	if (th->th_advance & ADVANCE_PRE)
810 		traverse_add_segment(th, mintxg, maxtxg,
811 		    objset, 0, -1, 0,
812 		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
813 	else
814 		traverse_add_segment(th, mintxg, maxtxg,
815 		    objset, 1, 0, 0,
816 		    objset, 0, -1, 0);
817 }
818 
819 void
820 traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
821 {
822 	if (th->th_advance & ADVANCE_PRE)
823 		traverse_add_segment(th, mintxg, maxtxg,
824 		    0, 0, -1, 0,
825 		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
826 	else
827 		traverse_add_segment(th, mintxg, maxtxg,
828 		    1, 1, 0, 0,
829 		    0, 0, -1, 0);
830 }
831 
832 traverse_handle_t *
833 traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
834     int zio_flags)
835 {
836 	traverse_handle_t *th;
837 	int d, l;
838 
839 	th = kmem_zalloc(sizeof (*th), KM_SLEEP);
840 
841 	th->th_spa = spa;
842 	th->th_func = func;
843 	th->th_arg = arg;
844 	th->th_advance = advance;
845 	th->th_lastcb.zb_level = ZB_NO_LEVEL;
846 	th->th_noread.zb_level = ZB_NO_LEVEL;
847 	th->th_zio_flags = zio_flags;
848 
849 	list_create(&th->th_seglist, sizeof (zseg_t),
850 	    offsetof(zseg_t, seg_node));
851 
852 	for (d = 0; d < ZB_DEPTH; d++) {
853 		for (l = 0; l < ZB_MAXLEVEL; l++) {
854 			if ((advance & ADVANCE_DATA) ||
855 			    l != 0 || d != ZB_DN_CACHE)
856 				th->th_cache[d][l].bc_data =
857 				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
858 		}
859 	}
860 
861 	return (th);
862 }
863 
864 void
865 traverse_fini(traverse_handle_t *th)
866 {
867 	int d, l;
868 	zseg_t *zseg;
869 
870 	for (d = 0; d < ZB_DEPTH; d++)
871 		for (l = 0; l < ZB_MAXLEVEL; l++)
872 			if (th->th_cache[d][l].bc_data != NULL)
873 				zio_buf_free(th->th_cache[d][l].bc_data,
874 				    SPA_MAXBLOCKSIZE);
875 
876 	while ((zseg = list_head(&th->th_seglist)) != NULL) {
877 		list_remove(&th->th_seglist, zseg);
878 		kmem_free(zseg, sizeof (*zseg));
879 	}
880 
881 	list_destroy(&th->th_seglist);
882 
883 	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
884 	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
885 	    th->th_syncs, th->th_restarts);
886 
887 	kmem_free(th, sizeof (*th));
888 }
889