/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/zvol.h>

#define	BP_SPAN_SHIFT(level, width)	((level) * (width))
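
/*
 * BP_SPAN_SHIFT() is the shift between a block id at 'level' and the
 * level-0 block ids it spans.  Illustrative arithmetic (assuming the
 * common 16K indirect block size): a blkptr_t is 128 bytes, so
 * SPA_BLKPTRSHIFT is 7 and wshift = dn_indblkshift - SPA_BLKPTRSHIFT =
 * 14 - 7 = 7, i.e. 128 block pointers per indirect block; a single
 * level-2 pointer thus spans 1 << BP_SPAN_SHIFT(2, 7) = 16384 level-0
 * blocks.
 */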

#define	BP_EQUAL(b1, b2)				\
	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&	\
	(b1)->blk_birth == (b2)->blk_birth)

/*
 * Compare two bookmarks.
 *
 * For ADVANCE_PRE, the visitation order is:
 *
 *	objset 0, 1, 2, ..., ZB_MAXOBJSET.
 *	object 0, 1, 2, ..., ZB_MAXOBJECT.
 *	blkoff 0, 1, 2, ...
 *	level ZB_MAXLEVEL, ..., 2, 1, 0.
 *
 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
 * ordering vector is:
 *
 *	< objset, object, blkoff, -level >
 *
 * For ADVANCE_POST, the starting offsets aren't sequential but ending
 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
 * The visitation order is:
 *
 *	objset 1, 2, ..., ZB_MAXOBJSET, 0.
 *	object 1, 2, ..., ZB_MAXOBJECT, 0.
 *	blkoff 1, 2, ...
 *	level 0, 1, 2, ..., ZB_MAXLEVEL.
 *
 * and thus a valid ordering vector is:
 *
 *	< objset - 1, object - 1, blkoff, level >
 *
 * Both orderings can be expressed as:
 *
 *	< objset + bias, object + bias, blkoff, level ^ bias >
 *
 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
 *
 * Special case: an objset's osphys is represented as level -1 of object 0.
 * It is always either the very first or very last block we visit in an objset.
 * Therefore, if either bookmark's level is -1, level alone determines order.
 */
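
/*
 * A worked instance of the bias trick above: for ADVANCE_POST, bias is
 * -1 (all ones), so (level ^ bias) == ~level, which reverses the level
 * comparison, and the unsigned sum (objset + bias) maps objset 0 to
 * UINT64_MAX so that it sorts after every other objset -- exactly the
 * "0 comes last" post-order rule.  For ADVANCE_PRE, bias is 0 and both
 * expressions degenerate to the plain values.
 */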
static int
compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
    int advance)
{
	int bias = (advance & ADVANCE_PRE) ? 0 : -1;
	uint64_t sblkoff, eblkoff;
	int slevel, elevel, wshift;

	if (szb->zb_objset + bias < ezb->zb_objset + bias)
		return (-1);

	if (szb->zb_objset + bias > ezb->zb_objset + bias)
		return (1);

	slevel = szb->zb_level;
	elevel = ezb->zb_level;

	if ((slevel | elevel) < 0)
		return ((slevel ^ bias) - (elevel ^ bias));

	if (szb->zb_object + bias < ezb->zb_object + bias)
		return (-1);

	if (szb->zb_object + bias > ezb->zb_object + bias)
		return (1);

	if (dnp == NULL)
		return (0);

	wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;

	sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
	eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);

	if (sblkoff < eblkoff)
		return (-1);

	if (sblkoff > eblkoff)
		return (1);

	return ((elevel ^ bias) - (slevel ^ bias));
}

#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
{							\
	(zb)->zb_objset = objset;			\
	(zb)->zb_object = object;			\
	(zb)->zb_level = level;				\
	(zb)->zb_blkid = blkid;				\
}

#define	SET_BOOKMARK_LB(zb, level, blkid)		\
{							\
	(zb)->zb_level = level;				\
	(zb)->zb_blkid = blkid;				\
}

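/*
 * The advance_* functions below move zseg->seg_start to the next
 * bookmark in visitation order.  Each returns EAGAIN when the new
 * position is still within the segment (keep going), or ERANGE when
 * it would pass seg_end (the segment is exhausted).
 */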
static int
advance_objset(zseg_t *zseg, uint64_t objset, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;

	if (advance & ADVANCE_PRE) {
		if (objset >= ZB_MAXOBJSET)
			return (ERANGE);
		SET_BOOKMARK(zb, objset, 0, -1, 0);
	} else {
		if (objset >= ZB_MAXOBJSET)
			objset = 0;
		SET_BOOKMARK(zb, objset, 1, 0, 0);
	}

	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}

static int
advance_object(zseg_t *zseg, uint64_t object, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;

	if (advance & ADVANCE_PRE) {
		if (object >= ZB_MAXOBJECT) {
			SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
		} else {
			SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
		}
	} else {
		if (zb->zb_object == 0) {
			SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
		} else {
			if (object >= ZB_MAXOBJECT)
				object = 0;
			SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
		}
	}

	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}

static int
advance_from_osphys(zseg_t *zseg, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;

	ASSERT(zb->zb_object == 0);
	ASSERT(zb->zb_level == -1);
	ASSERT(zb->zb_blkid == 0);

	if (advance & ADVANCE_PRE) {
		SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
	} else {
		if (zb->zb_objset == 0)
			return (ERANGE);
		SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
	}

	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}

static int
advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
{
	zbookmark_t *zb = &zseg->seg_start;
	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
	int maxlevel = dnp->dn_nlevels - 1;
	int level = zb->zb_level;
	uint64_t blkid = zb->zb_blkid;

	if (advance & ADVANCE_PRE) {
		if (level > 0 && rc == 0) {
			level--;
			blkid <<= wshift;
		} else {
			blkid++;

			if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
			    dnp->dn_maxblkid)
				return (ERANGE);

			while (level < maxlevel) {
				if (P2PHASE(blkid, 1ULL << wshift))
					break;
				blkid >>= wshift;
				level++;
			}
		}
	} else {
		if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
			blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
			level = 0;
		} else {
			blkid >>= wshift;
			level++;
		}

		while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
		    dnp->dn_maxblkid) {
			if (level == maxlevel)
				return (ERANGE);
			blkid >>= wshift;
			level++;
		}
	}
	SET_BOOKMARK_LB(zb, level, blkid);

	if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
		return (ERANGE);

	return (EAGAIN);
}
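
/*
 * A worked pre-order example (assuming wshift = 7, i.e. 128 block
 * pointers per indirect block): after successfully visiting level 1,
 * blkid 3 (rc == 0), we descend to its first child: level 0, blkid
 * 3 << 7 = 384.  After visiting the last leaf under level-1 blkid 2
 * (level 0, blkid 383), blkid++ yields 384, which is 128-aligned, so
 * we pop up and visit level 1, blkid 3 next -- each indirect block is
 * visited before the blocks beneath it.
 */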

/*
 * traverse_callback() invokes the callback specified in th_func.
 * In the event of an error, the callee must return one of the following
 * errors:
 *
 *	EINTR		- Indicates that the callee wants the traversal to
 *			  abort immediately.
 *	ERESTART	- The callee has acknowledged the error and would
 *			  like the traversal to continue.
 */
static int
traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
{
	/*
	 * Before we issue the callback, prune against maxtxg.
	 *
	 * We prune against mintxg before we get here because it's a big win.
	 * If a given block was born in txg 37, then we know that the entire
	 * subtree below that block must have been born in txg 37 or earlier.
	 * We can therefore lop off huge branches of the tree as we go.
	 *
	 * There's no corresponding optimization for maxtxg because knowing
	 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
	 * children.  In fact, the copy-on-write design of ZFS ensures that
	 * top-level blocks will pretty much always be new.
	 *
	 * Therefore, in the name of simplicity we don't prune against
	 * maxtxg until the last possible moment -- that being right now.
	 */
	if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
		return (0);

	/*
	 * Debugging: verify that the order we visit things agrees with the
	 * order defined by compare_bookmark().  We don't check this for
	 * log blocks because there's no defined ordering for them; they're
	 * always visited (or not) as part of visiting the objset_phys_t.
	 */
	if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
		zbookmark_t *zb = &bc->bc_bookmark;
		zbookmark_t *szb = &zseg->seg_start;
		zbookmark_t *ezb = &zseg->seg_end;
		zbookmark_t *lzb = &th->th_lastcb;
		dnode_phys_t *dnp = bc->bc_dnode;

		ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
		ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
		ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
		    lzb->zb_level == ZB_NO_LEVEL);
		*lzb = *zb;
	}

	th->th_callbacks++;
	return (th->th_func(bc, th->th_spa, th->th_arg));
}
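
/*
 * Illustrative callee honoring the contract above (not part of the
 * original source; my_abort_wanted() is a hypothetical consumer
 * predicate):
 *
 *	static int
 *	my_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
 *	{
 *		if (bc->bc_errno != 0)
 *			return (my_abort_wanted(arg) ? EINTR : ERESTART);
 *		... examine bc->bc_bookmark, bc->bc_blkptr, bc->bc_data ...
 *		return (0);
 *	}
 */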

static int
traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
	dnode_phys_t *dnp)
{
	zbookmark_t *zb = &bc->bc_bookmark;
	int error;

	th->th_hits++;

	bc->bc_dnode = dnp;
	bc->bc_errno = 0;

	if (BP_EQUAL(&bc->bc_blkptr, bp))
		return (0);

	bc->bc_blkptr = *bp;

	if (bc->bc_data == NULL)
		return (0);

	if (BP_IS_HOLE(bp)) {
		ASSERT(th->th_advance & ADVANCE_HOLES);
		return (0);
	}

	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
		error = EIO;
	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
		error = 0;
		th->th_arc_hits++;
	} else {
		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));

		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
			(zb->zb_level > 0 ? byteswap_uint64_array :
			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
			    BP_GET_LSIZE(bp));
		th->th_reads++;
	}

	if (error) {
		bc->bc_errno = error;
		error = traverse_callback(th, NULL, bc);
		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
		bc->bc_blkptr.blk_birth = -1ULL;
	}

	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
	    bc - &th->th_cache[0][0], error,
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

	return (error);
}

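/*
 * Descend from the dnode's top-level block pointers toward
 * zseg->seg_start, reading each indirect block through the per-level
 * cache at th_cache[depth].  Returns 0 once the target block (or a
 * hole, when ADVANCE_HOLES applies) is in the cache; ENOTBLK after
 * skipping a run of blocks born at or before seg_mintxg, with the
 * bookmark left on the last skipped block so the caller can advance
 * past it; ERANGE if the position lies beyond the dnode's data; or
 * the error returned by traverse_read().
 */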
static int
find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
{
	zbookmark_t *zb = &zseg->seg_start;
	traverse_blk_cache_t *bc;
	blkptr_t *bp = dnp->dn_blkptr;
	int i, first, level;
	int nbp = dnp->dn_nblkptr;
	int minlevel = zb->zb_level;
	int maxlevel = dnp->dn_nlevels - 1;
	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
	uint64_t blkid = zb->zb_blkid >> bp_shift;
	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
	int rc;

	if (minlevel > maxlevel || blkid >= nbp)
		return (ERANGE);

	for (level = maxlevel; level >= minlevel; level--) {
		first = P2PHASE(blkid, 1ULL << wshift);

		for (i = first; i < nbp; i++)
			if (bp[i].blk_birth > zseg->seg_mintxg ||
			    (BP_IS_HOLE(&bp[i]) && do_holes))
				break;

		if (i != first) {
			i--;
			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
			return (ENOTBLK);
		}

		bc = &th->th_cache[depth][level];

		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
		    level, blkid);

		if ((rc = traverse_read(th, bc, bp + i, dnp)) != 0) {
			if (rc != EAGAIN) {
				SET_BOOKMARK_LB(zb, level, blkid);
			}
			return (rc);
		}

		if (BP_IS_HOLE(&bp[i])) {
			SET_BOOKMARK_LB(zb, level, blkid);
			th->th_lastcb.zb_level = ZB_NO_LEVEL;
			return (0);
		}

		nbp = 1 << wshift;
		bp = bc->bc_data;
		bp_shift -= wshift;
		blkid = zb->zb_blkid >> bp_shift;
	}

	return (0);
}

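/*
 * Scan the meta-dnode 'mdn' for the next allocated dnode at or after
 * *objectp, optionally restricted to dnodes of the given 'type'
 * (type == -1 matches any type).  On success, *objectp and *dnpp
 * identify the dnode found; on ERANGE the scan ran off the end of
 * the meta-dnode and *objectp is set to ZB_MAXOBJECT.
 */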
static int
get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
    uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
{
	zseg_t zseg;
	zbookmark_t *zb = &zseg.seg_start;
	uint64_t object = *objectp;
	int i, rc;

	SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);

	zseg.seg_mintxg = txg;
	zseg.seg_maxtxg = -1ULL;

	for (;;) {
		rc = find_block(th, &zseg, mdn, depth);

		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
			break;

		if (rc == 0 && zb->zb_level == 0) {
			dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
			for (i = 0; i < DNODES_PER_BLOCK; i++) {
				object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
				if (object >= *objectp &&
				    dnp[i].dn_type != DMU_OT_NONE &&
				    (type == -1 || dnp[i].dn_type == type)) {
					*objectp = object;
					*dnpp = &dnp[i];
					return (0);
				}
			}
		}

		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);

		if (rc == ERANGE)
			break;
	}

	if (rc == ERANGE)
		*objectp = ZB_MAXOBJECT;

	return (rc);
}

/* ARGSUSED */
static void
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	traverse_handle_t *th = arg;
	traverse_blk_cache_t *bc = &th->th_zil_cache;
	zbookmark_t *zb = &bc->bc_bookmark;
	zseg_t *zseg = list_head(&th->th_seglist);

	if (bp->blk_birth <= zseg->seg_mintxg)
		return;

	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
		zb->zb_object = 0;
		zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
		bc->bc_blkptr = *bp;
		(void) traverse_callback(th, zseg, bc);
	}
}

/* ARGSUSED */
static void
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	traverse_handle_t *th = arg;
	traverse_blk_cache_t *bc = &th->th_zil_cache;
	zbookmark_t *zb = &bc->bc_bookmark;
	zseg_t *zseg = list_head(&th->th_seglist);

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;

		if (bp->blk_birth <= zseg->seg_mintxg)
			return;

		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
			zb->zb_object = lr->lr_foid;
			zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
			bc->bc_blkptr = *bp;
			(void) traverse_callback(th, zseg, bc);
		}
	}
}

static void
traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
{
	spa_t *spa = th->th_spa;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_phys_t *osphys = bc->bc_data;
	zil_header_t *zh = &osphys->os_zil_header;
	uint64_t claim_txg = zh->zh_claim_txg;
	zilog_t *zilog;

	ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
	ASSERT(bc->bc_bookmark.zb_level == -1);

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && (spa_mode & FWRITE))
		return;

	th->th_zil_cache.bc_bookmark = bc->bc_bookmark;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
	    claim_txg);

	zil_free(zilog);
}

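/*
 * Advance the traversal within one segment: read through the MOS to
 * find the objset, through the objset's meta-dnode to find the object,
 * and through the object's indirect blocks to the next block to visit,
 * issuing callbacks along the way.  Returns EAGAIN if there is more
 * work to do (or to let spa_sync() run), EINTR if a callback aborted
 * the traversal, or ERANGE when the segment is exhausted.
 */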
static int
traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
{
	zbookmark_t *zb = &zseg->seg_start;
	traverse_blk_cache_t *bc;
	dnode_phys_t *dn, *dn_tmp;
	int worklimit = 100;
	int rc;

	dprintf("<%llu, %llu, %d, %llx>\n",
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

	bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
	dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

	SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);

	rc = traverse_read(th, bc, mosbp, dn);

	if (rc)		/* If we get ERESTART, we've got nowhere left to go */
		return (rc == ERESTART ? EINTR : rc);

	ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);

	if (zb->zb_objset != 0) {
		uint64_t objset = zb->zb_objset;
		dsl_dataset_phys_t *dsp;

		rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
		    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);

		if (objset != zb->zb_objset)
			rc = advance_objset(zseg, objset, th->th_advance);

		if (rc != 0)
			return (rc);

		dsp = DN_BONUS(dn_tmp);

		bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
		dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;

		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);

		/*
		 * If we're traversing an open snapshot, we know that it
		 * can't be deleted (because it's open) and it can't change
		 * (because it's a snapshot).  Therefore, once we've gotten
		 * from the uberblock down to the snapshot's objset_phys_t,
		 * we no longer need to synchronize with spa_sync(); we're
		 * traversing a completely static block tree from here on.
		 */
		if (th->th_advance & ADVANCE_NOLOCK) {
			ASSERT(th->th_locked);
			rw_exit(spa_traverse_rwlock(th->th_spa));
			th->th_locked = 0;
		}

		if (BP_IS_HOLE(&dsp->ds_bp))
			rc = ERESTART;
		else
			rc = traverse_read(th, bc, &dsp->ds_bp, dn);

		if (rc != 0) {
			if (rc == ERESTART)
				rc = advance_objset(zseg, zb->zb_objset + 1,
				    th->th_advance);
			return (rc);
		}

		if (th->th_advance & ADVANCE_PRUNE)
			zseg->seg_mintxg =
			    MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
	}

	if (zb->zb_level == -1) {
		ASSERT(zb->zb_object == 0);
		ASSERT(zb->zb_blkid == 0);
		ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);

		if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
			rc = traverse_callback(th, zseg, bc);
			if (rc) {
				ASSERT(rc == EINTR);
				return (rc);
			}
			if ((th->th_advance & ADVANCE_ZIL) &&
			    zb->zb_objset != 0)
				traverse_zil(th, bc);
		}

		return (advance_from_osphys(zseg, th->th_advance));
	}

	if (zb->zb_object != 0) {
		uint64_t object = zb->zb_object;

		rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
		    zseg->seg_mintxg, -1, ZB_MDN_CACHE);

		if (object != zb->zb_object)
			rc = advance_object(zseg, object, th->th_advance);

		if (rc != 0)
			return (rc);

		dn = dn_tmp;
	}

	if (zb->zb_level == ZB_MAXLEVEL)
		zb->zb_level = dn->dn_nlevels - 1;

	for (;;) {
		rc = find_block(th, zseg, dn, ZB_DN_CACHE);

		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
			break;

		if (rc == 0) {
			bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
			ASSERT(bc->bc_dnode == dn);
			ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
			rc = traverse_callback(th, zseg, bc);
			if (rc) {
				ASSERT(rc == EINTR);
				return (rc);
			}
			if (BP_IS_HOLE(&bc->bc_blkptr)) {
				ASSERT(th->th_advance & ADVANCE_HOLES);
				rc = ENOTBLK;
			}
		}

		rc = advance_block(zseg, dn, rc, th->th_advance);

		if (rc == ERANGE)
			break;

		/*
		 * Give spa_sync() a chance to run.
		 */
		if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
			th->th_syncs++;
			return (EAGAIN);
		}

		if (--worklimit == 0)
			return (EAGAIN);
	}

	if (rc == ERANGE)
		rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);

	return (rc);
}

/*
 * It is the caller's responsibility to ensure that the dsl_dataset_t
 * doesn't go away during traversal.
 */
int
traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
    blkptr_cb_t func, void *arg)
{
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	traverse_handle_t *th;
	int err;

	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);

	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);

	while ((err = traverse_more(th)) == EAGAIN)
		continue;

	traverse_fini(th);
	return (err);
}

int
traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg)
{
	spa_t *spa = dmu_objset_spa(os);
	traverse_handle_t *th;
	int err;

	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);

	traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);

	while ((err = traverse_more(th)) == EAGAIN)
		continue;

	traverse_fini(th);
	return (err);
}

int
traverse_more(traverse_handle_t *th)
{
	zseg_t *zseg = list_head(&th->th_seglist);
	uint64_t save_txg;	/* XXX won't be necessary with real itinerary */
	krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
	blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
	int rc;

	if (zseg == NULL)
		return (0);

	th->th_restarts++;

	save_txg = zseg->seg_mintxg;

	rw_enter(rw, RW_READER);
	th->th_locked = 1;

	rc = traverse_segment(th, zseg, mosbp);
	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);

	if (th->th_locked)
		rw_exit(rw);
	th->th_locked = 0;

	zseg->seg_mintxg = save_txg;

	if (rc == ERANGE) {
		list_remove(&th->th_seglist, zseg);
		kmem_free(zseg, sizeof (*zseg));
		return (EAGAIN);
	}

	return (rc);
}

/*
 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
 * are not included.  The blocks covered by this segment will all have
 * mintxg < birth < maxtxg -- e.g., a (5, 10) segment covers only blocks
 * born in txgs 6 through 9.
 */
static void
traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
    uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
{
	zseg_t *zseg;

	zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);

	zseg->seg_mintxg = mintxg;
	zseg->seg_maxtxg = maxtxg;

	zseg->seg_start.zb_objset = sobjset;
	zseg->seg_start.zb_object = sobject;
	zseg->seg_start.zb_level = slevel;
	zseg->seg_start.zb_blkid = sblkid;

	zseg->seg_end.zb_objset = eobjset;
	zseg->seg_end.zb_object = eobject;
	zseg->seg_end.zb_level = elevel;
	zseg->seg_end.zb_blkid = eblkid;

	list_insert_tail(&th->th_seglist, zseg);
}

void
traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t objset, uint64_t object)
{
	if (th->th_advance & ADVANCE_PRE)
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, object, ZB_MAXLEVEL, 0,
		    objset, object, 0, ZB_MAXBLKID);
	else
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, object, 0, 0,
		    objset, object, 0, ZB_MAXBLKID);
}

void
traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    uint64_t objset)
{
	if (th->th_advance & ADVANCE_PRE)
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, 0, -1, 0,
		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
	else
		traverse_add_segment(th, mintxg, maxtxg,
		    objset, 1, 0, 0,
		    objset, 0, -1, 0);
}

void
traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
{
	if (th->th_advance & ADVANCE_PRE)
		traverse_add_segment(th, mintxg, maxtxg,
		    0, 0, -1, 0,
		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
	else
		traverse_add_segment(th, mintxg, maxtxg,
		    1, 1, 0, 0,
		    0, 0, -1, 0);
}
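
/*
 * Illustrative usage sketch (not part of the original source): a full
 * pre-order scan of every block in the pool born after 'txg' follows
 * the same pattern as traverse_dsl_dataset() above, with my_cb and
 * my_arg standing in for the consumer's callback and argument:
 *
 *	th = traverse_init(spa, my_cb, my_arg, ADVANCE_PRE | ADVANCE_PRUNE,
 *	    ZIO_FLAG_CANFAIL);
 *	traverse_add_pool(th, txg, -1ULL);
 *	while ((err = traverse_more(th)) == EAGAIN)
 *		continue;
 *	traverse_fini(th);
 */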

traverse_handle_t *
traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
    int zio_flags)
{
	traverse_handle_t *th;
	int d, l;

	th = kmem_zalloc(sizeof (*th), KM_SLEEP);

	th->th_spa = spa;
	th->th_func = func;
	th->th_arg = arg;
	th->th_advance = advance;
	th->th_lastcb.zb_level = ZB_NO_LEVEL;
	th->th_noread.zb_level = ZB_NO_LEVEL;
	th->th_zio_flags = zio_flags;

	list_create(&th->th_seglist, sizeof (zseg_t),
	    offsetof(zseg_t, seg_node));

	for (d = 0; d < ZB_DEPTH; d++) {
		for (l = 0; l < ZB_MAXLEVEL; l++) {
			if ((advance & ADVANCE_DATA) ||
			    l != 0 || d != ZB_DN_CACHE)
				th->th_cache[d][l].bc_data =
				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
		}
	}

	return (th);
}

void
traverse_fini(traverse_handle_t *th)
{
	int d, l;
	zseg_t *zseg;

	for (d = 0; d < ZB_DEPTH; d++)
		for (l = 0; l < ZB_MAXLEVEL; l++)
			if (th->th_cache[d][l].bc_data != NULL)
				zio_buf_free(th->th_cache[d][l].bc_data,
				    SPA_MAXBLOCKSIZE);

	while ((zseg = list_head(&th->th_seglist)) != NULL) {
		list_remove(&th->th_seglist, zseg);
		kmem_free(zseg, sizeof (*zseg));
	}

	list_destroy(&th->th_seglist);

	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
	    th->th_syncs, th->th_restarts);

	kmem_free(th, sizeof (*th));
}
921