xref: /titanic_41/usr/src/uts/common/fs/zfs/dmu_objset.c (revision afd1ac7b1c9a8cdf273c865aa5e9a14620341443)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/dmu_objset.h>
31 #include <sys/dsl_dir.h>
32 #include <sys/dsl_dataset.h>
33 #include <sys/dsl_prop.h>
34 #include <sys/dsl_pool.h>
35 #include <sys/dnode.h>
36 #include <sys/dbuf.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/zap.h>
40 #include <sys/zil.h>
41 #include <sys/dmu_impl.h>
42 
43 
44 spa_t *
45 dmu_objset_spa(objset_t *os)
46 {
47 	return (os->os->os_spa);
48 }
49 
50 zilog_t *
51 dmu_objset_zil(objset_t *os)
52 {
53 	return (os->os->os_zil);
54 }
55 
56 dsl_pool_t *
57 dmu_objset_pool(objset_t *os)
58 {
59 	dsl_dataset_t *ds;
60 
61 	if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
62 		return (ds->ds_dir->dd_pool);
63 	else
64 		return (spa_get_dsl(os->os->os_spa));
65 }
66 
67 dsl_dataset_t *
68 dmu_objset_ds(objset_t *os)
69 {
70 	return (os->os->os_dsl_dataset);
71 }
72 
73 dmu_objset_type_t
74 dmu_objset_type(objset_t *os)
75 {
76 	return (os->os->os_phys->os_type);
77 }
78 
79 void
80 dmu_objset_name(objset_t *os, char *buf)
81 {
82 	dsl_dataset_name(os->os->os_dsl_dataset, buf);
83 }
84 
85 uint64_t
86 dmu_objset_id(objset_t *os)
87 {
88 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
89 
90 	return (ds ? ds->ds_object : 0);
91 }
92 
93 static void
94 checksum_changed_cb(void *arg, uint64_t newval)
95 {
96 	objset_impl_t *osi = arg;
97 
98 	/*
99 	 * Inheritance should have been done by now.
100 	 */
101 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
102 
103 	osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
104 }
105 
106 static void
107 compression_changed_cb(void *arg, uint64_t newval)
108 {
109 	objset_impl_t *osi = arg;
110 
111 	/*
112 	 * Inheritance and range checking should have been done by now.
113 	 */
114 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
115 
116 	osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
117 }
118 
119 void
120 dmu_objset_byteswap(void *buf, size_t size)
121 {
122 	objset_phys_t *osp = buf;
123 
124 	ASSERT(size == sizeof (objset_phys_t));
125 	dnode_byteswap(&osp->os_meta_dnode);
126 	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
127 	osp->os_type = BSWAP_64(osp->os_type);
128 }
129 
130 objset_impl_t *
131 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
132 {
133 	objset_impl_t *winner, *osi;
134 	int i, err, checksum;
135 
136 	osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
137 	osi->os.os = osi;
138 	osi->os_dsl_dataset = ds;
139 	osi->os_spa = spa;
140 	if (bp)
141 		osi->os_rootbp = *bp;
142 	osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
143 	if (!BP_IS_HOLE(&osi->os_rootbp)) {
144 		dprintf_bp(&osi->os_rootbp, "reading %s", "");
145 		(void) arc_read(NULL, spa, &osi->os_rootbp,
146 		    dmu_ot[DMU_OT_OBJSET].ot_byteswap,
147 		    arc_bcopy_func, osi->os_phys,
148 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
149 	} else {
150 		bzero(osi->os_phys, sizeof (objset_phys_t));
151 	}
152 	osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
153 
154 	/*
155 	 * Note: the changed_cb will be called once before the register
156 	 * func returns, thus changing the checksum/compression from the
157 	 * default (fletcher2/off).
158 	 */
159 	if (ds) {
160 		err = dsl_prop_register(ds, "checksum",
161 		    checksum_changed_cb, osi);
162 		ASSERT(err == 0);
163 
164 		err = dsl_prop_register(ds, "compression",
165 		    compression_changed_cb, osi);
166 		ASSERT(err == 0);
167 	} else {
168 		/* It's the meta-objset. */
169 		/* XXX - turn off metadata compression temporarily */
170 		osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
171 		osi->os_compress = ZIO_COMPRESS_OFF;
172 	}
173 
174 	/*
175 	 * Metadata always gets compressed and checksummed.
176 	 * If the data checksum is multi-bit correctable, and it's not
177 	 * a ZBT-style checksum, then it's suitable for metadata as well.
178 	 * Otherwise, the metadata checksum defaults to fletcher4.
179 	 */
180 	checksum = osi->os_checksum;
181 
182 	if (zio_checksum_table[checksum].ci_correctable &&
183 	    !zio_checksum_table[checksum].ci_zbt)
184 		osi->os_md_checksum = checksum;
185 	else
186 		osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
187 
188 	/* XXX - turn off metadata compression temporarily */
189 	osi->os_md_compress = ZIO_COMPRESS_OFF;
190 
191 	for (i = 0; i < TXG_SIZE; i++) {
192 		list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
193 		    offsetof(dnode_t, dn_dirty_link[i]));
194 		list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
195 		    offsetof(dnode_t, dn_dirty_link[i]));
196 	}
197 	list_create(&osi->os_dnodes, sizeof (dnode_t),
198 	    offsetof(dnode_t, dn_link));
199 	list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
200 	    offsetof(dmu_buf_impl_t, db_link));
201 
202 	osi->os_meta_dnode = dnode_special_open(osi,
203 	    &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
204 
205 	if (ds != NULL) {
206 		winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
207 		if (winner) {
208 			dmu_objset_evict(ds, osi);
209 			osi = winner;
210 		}
211 	}
212 
213 	return (osi);
214 }
215 
216 /* called from zpl */
217 int
218 dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
219     objset_t **osp)
220 {
221 	dsl_dataset_t *ds;
222 	int err;
223 	objset_t *os;
224 	objset_impl_t *osi;
225 
226 	os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
227 	err = dsl_dataset_open(name, mode, os, &ds);
228 	if (err) {
229 		kmem_free(os, sizeof (objset_t));
230 		return (err);
231 	}
232 
233 	osi = dsl_dataset_get_user_ptr(ds);
234 	if (osi == NULL) {
235 		blkptr_t bp;
236 
237 		dsl_dataset_get_blkptr(ds, &bp);
238 		osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp);
239 	}
240 
241 	os->os = osi;
242 	os->os_mode = mode;
243 
244 	if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
245 		dmu_objset_close(os);
246 		return (EINVAL);
247 	}
248 	*osp = os;
249 	return (0);
250 }
251 
252 void
253 dmu_objset_close(objset_t *os)
254 {
255 	dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
256 	kmem_free(os, sizeof (objset_t));
257 }
258 
259 void
260 dmu_objset_evict(dsl_dataset_t *ds, void *arg)
261 {
262 	objset_impl_t *osi = arg;
263 	int err, i;
264 
265 	for (i = 0; i < TXG_SIZE; i++) {
266 		ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
267 		ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
268 	}
269 
270 	if (ds) {
271 		err = dsl_prop_unregister(ds, "checksum",
272 		    checksum_changed_cb, osi);
273 		ASSERT(err == 0);
274 
275 		err = dsl_prop_unregister(ds, "compression",
276 		    compression_changed_cb, osi);
277 		ASSERT(err == 0);
278 	}
279 
280 	ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
281 	ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
282 	ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
283 
284 	dnode_special_close(osi->os_meta_dnode);
285 	zil_free(osi->os_zil);
286 
287 	zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
288 	kmem_free(osi, sizeof (objset_impl_t));
289 }
290 
291 /* called from dsl for meta-objset */
292 objset_impl_t *
293 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
294     dmu_tx_t *tx)
295 {
296 	objset_impl_t *osi;
297 	dnode_t *mdn;
298 
299 	ASSERT(dmu_tx_is_syncing(tx));
300 	osi = dmu_objset_open_impl(spa, ds, NULL);
301 	mdn = osi->os_meta_dnode;
302 
303 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
304 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
305 
306 	/*
307 	 * We don't want to have to increase the meta-dnode's nlevels
308 	 * later, because then we could do it in quescing context while
309 	 * we are also accessing it in open context.
310 	 *
311 	 * This precaution is not necessary for the MOS (ds == NULL),
312 	 * because the MOS is only updated in syncing context.
313 	 * This is most fortunate: the MOS is the only objset that
314 	 * needs to be synced multiple times as spa_sync() iterates
315 	 * to convergence, so minimizing its dn_nlevels matters.
316 	 */
317 	if (ds != NULL)
318 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
319 		    mdn->dn_nlevels = DN_META_DNODE_LEVELS;
320 
321 	ASSERT(type != DMU_OST_NONE);
322 	ASSERT(type != DMU_OST_ANY);
323 	ASSERT(type < DMU_OST_NUMTYPES);
324 	osi->os_phys->os_type = type;
325 
326 	dsl_dataset_dirty(ds, tx);
327 
328 	return (osi);
329 }
330 
331 struct oscarg {
332 	void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
333 	void *userarg;
334 	dsl_dataset_t *clone_parent;
335 	const char *fullname;
336 	const char *lastname;
337 	dmu_objset_type_t type;
338 };
339 
340 static int
341 dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
342 {
343 	struct oscarg *oa = arg;
344 	dsl_dataset_t *ds;
345 	int err;
346 	blkptr_t bp;
347 
348 	ASSERT(dmu_tx_is_syncing(tx));
349 
350 	err = dsl_dataset_create_sync(dd, oa->fullname, oa->lastname,
351 	    oa->clone_parent, tx);
352 	dprintf_dd(dd, "fn=%s ln=%s err=%d\n",
353 	    oa->fullname, oa->lastname, err);
354 	if (err)
355 		return (err);
356 
357 	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
358 	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
359 	ASSERT3U(err, ==, 0);
360 	dsl_dataset_get_blkptr(ds, &bp);
361 	if (BP_IS_HOLE(&bp)) {
362 		objset_impl_t *osi;
363 
364 		/* This is an empty dmu_objset; not a clone. */
365 		osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
366 		    ds, oa->type, tx);
367 
368 		if (oa->userfunc)
369 			oa->userfunc(&osi->os, oa->userarg, tx);
370 	}
371 	dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
372 
373 	return (0);
374 }
375 
376 int
377 dmu_objset_create(const char *name, dmu_objset_type_t type,
378     objset_t *clone_parent,
379     void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
380 {
381 	dsl_dir_t *pds;
382 	const char *tail;
383 	int err = 0;
384 
385 	pds = dsl_dir_open(name, FTAG, &tail);
386 	if (pds == NULL)
387 		return (ENOENT);
388 	if (tail == NULL) {
389 		dsl_dir_close(pds, FTAG);
390 		return (EEXIST);
391 	}
392 
393 	dprintf("name=%s\n", name);
394 
395 	if (tail[0] == '@') {
396 		/*
397 		 * If we're creating a snapshot, make sure everything
398 		 * they might want is on disk.  XXX Sketchy to know
399 		 * about snapshots here, better to put in DSL.
400 		 */
401 		objset_t *os;
402 		size_t plen = strchr(name, '@') - name + 1;
403 		char *pbuf = kmem_alloc(plen, KM_SLEEP);
404 		bcopy(name, pbuf, plen - 1);
405 		pbuf[plen - 1] = '\0';
406 
407 		err = dmu_objset_open(pbuf, DMU_OST_ANY, DS_MODE_STANDARD, &os);
408 		if (err == 0) {
409 			err = zil_suspend(dmu_objset_zil(os));
410 			if (err == 0) {
411 				err = dsl_dir_sync_task(pds,
412 				    dsl_dataset_snapshot_sync,
413 				    (void*)(tail+1), 16*1024);
414 				zil_resume(dmu_objset_zil(os));
415 			}
416 			dmu_objset_close(os);
417 		}
418 		kmem_free(pbuf, plen);
419 	} else {
420 		struct oscarg oa = { 0 };
421 		oa.userfunc = func;
422 		oa.userarg = arg;
423 		oa.fullname = name;
424 		oa.lastname = tail;
425 		oa.type = type;
426 		if (clone_parent != NULL) {
427 			/*
428 			 * You can't clone to a different type.
429 			 */
430 			if (clone_parent->os->os_phys->os_type != type) {
431 				dsl_dir_close(pds, FTAG);
432 				return (EINVAL);
433 			}
434 			oa.clone_parent = clone_parent->os->os_dsl_dataset;
435 		}
436 		err = dsl_dir_sync_task(pds, dmu_objset_create_sync, &oa,
437 		    256*1024);
438 	}
439 	dsl_dir_close(pds, FTAG);
440 	return (err);
441 }
442 
443 int
444 dmu_objset_destroy(const char *name)
445 {
446 	objset_t *os;
447 	int error;
448 
449 	/*
450 	 * If it looks like we'll be able to destroy it, and there's
451 	 * an unplayed replay log sitting around, destroy the log.
452 	 * It would be nicer to do this in dsl_dataset_destroy_sync(),
453 	 * but the replay log objset is modified in open context.
454 	 */
455 	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
456 	if (error == 0) {
457 		zil_destroy(dmu_objset_zil(os));
458 		dmu_objset_close(os);
459 	}
460 
461 	/* XXX uncache everything? */
462 	return (dsl_dataset_destroy(name));
463 }
464 
465 int
466 dmu_objset_rollback(const char *name)
467 {
468 	int err;
469 	objset_t *os;
470 
471 	err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
472 	if (err == 0) {
473 		err = zil_suspend(dmu_objset_zil(os));
474 		if (err == 0)
475 			zil_resume(dmu_objset_zil(os));
476 		dmu_objset_close(os);
477 		if (err == 0) {
478 			/* XXX uncache everything? */
479 			err = dsl_dataset_rollback(name);
480 		}
481 	}
482 	return (err);
483 }
484 
485 static void
486 dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx)
487 {
488 	dnode_t *dn = list_head(list);
489 	int level, err;
490 
491 	for (level = 0; dn = list_head(list); level++) {
492 		zio_t *zio;
493 		zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
494 
495 		ASSERT3U(level, <=, DN_MAX_LEVELS);
496 
497 		while (dn) {
498 			dnode_t *next = list_next(list, dn);
499 
500 			list_remove(list, dn);
501 			if (dnode_sync(dn, level, zio, tx) == 0) {
502 				/*
503 				 * This dnode requires syncing at higher
504 				 * levels; put it back onto the list.
505 				 */
506 				if (next)
507 					list_insert_before(list, next, dn);
508 				else
509 					list_insert_tail(list, dn);
510 			}
511 			dn = next;
512 		}
513 		err = zio_wait(zio);
514 		ASSERT(err == 0);
515 	}
516 }
517 
518 /* ARGSUSED */
519 static void
520 killer(zio_t *zio, arc_buf_t *abuf, void *arg)
521 {
522 	objset_impl_t *os = arg;
523 	objset_phys_t *osphys = zio->io_data;
524 	dnode_phys_t *dnp = &osphys->os_meta_dnode;
525 	int i;
526 
527 	ASSERT3U(zio->io_error, ==, 0);
528 
529 	/*
530 	 * Update rootbp fill count.
531 	 */
532 	os->os_rootbp.blk_fill = 1;	/* count the meta-dnode */
533 	for (i = 0; i < dnp->dn_nblkptr; i++)
534 		os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill;
535 
536 	BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
537 	BP_SET_LEVEL(zio->io_bp, 0);
538 
539 	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
540 	    BP_IDENTITY(&zio->io_bp_orig))) {
541 		dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig,
542 		    os->os_synctx);
543 		dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
544 		    os->os_synctx);
545 	}
546 }
547 
548 
549 /* called from dsl */
550 void
551 dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
552 {
553 	extern taskq_t *dbuf_tq;
554 	int txgoff;
555 	list_t *dirty_list;
556 	int err;
557 	arc_buf_t *abuf =
558 	    arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
559 
560 	ASSERT(dmu_tx_is_syncing(tx));
561 	ASSERT(os->os_synctx == NULL);
562 	/* XXX the write_done callback should really give us the tx... */
563 	os->os_synctx = tx;
564 
565 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
566 
567 	txgoff = tx->tx_txg & TXG_MASK;
568 
569 	dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx);
570 	dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx);
571 
572 	/*
573 	 * Free intent log blocks up to this tx.
574 	 */
575 	zil_sync(os->os_zil, tx);
576 
577 	/*
578 	 * Sync meta-dnode
579 	 */
580 	dirty_list = &os->os_dirty_dnodes[txgoff];
581 	ASSERT(list_head(dirty_list) == NULL);
582 	list_insert_tail(dirty_list, os->os_meta_dnode);
583 	dmu_objset_sync_dnodes(os, dirty_list, tx);
584 
585 	/*
586 	 * Sync the root block.
587 	 */
588 	bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
589 	err = arc_write(NULL, os->os_spa, os->os_md_checksum,
590 	    os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
591 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
592 	ASSERT(err == 0);
593 	arc_buf_free(abuf, FTAG);
594 
595 	dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
596 
597 	ASSERT3P(os->os_synctx, ==, tx);
598 	taskq_wait(dbuf_tq);
599 	os->os_synctx = NULL;
600 }
601 
602 void
603 dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds)
604 {
605 	if (os->os->os_dsl_dataset != NULL) {
606 		dsl_dataset_stats(os->os->os_dsl_dataset, dds);
607 	} else {
608 		ASSERT(os->os->os_phys->os_type == DMU_OST_META);
609 		bzero(dds, sizeof (*dds));
610 	}
611 	dds->dds_type = os->os->os_phys->os_type;
612 }
613 
614 int
615 dmu_objset_is_snapshot(objset_t *os)
616 {
617 	if (os->os->os_dsl_dataset != NULL)
618 		return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
619 	else
620 		return (B_FALSE);
621 }
622 
623 int
624 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
625     uint64_t *idp, uint64_t *offp)
626 {
627 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
628 	zap_cursor_t cursor;
629 	zap_attribute_t attr;
630 
631 	if (ds->ds_phys->ds_snapnames_zapobj == 0)
632 		return (ENOENT);
633 
634 	zap_cursor_init_serialized(&cursor,
635 	    ds->ds_dir->dd_pool->dp_meta_objset,
636 	    ds->ds_phys->ds_snapnames_zapobj, *offp);
637 
638 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
639 		zap_cursor_fini(&cursor);
640 		return (ENOENT);
641 	}
642 
643 	if (strlen(attr.za_name) + 1 > namelen) {
644 		zap_cursor_fini(&cursor);
645 		return (ENAMETOOLONG);
646 	}
647 
648 	(void) strcpy(name, attr.za_name);
649 	if (idp)
650 		*idp = attr.za_first_integer;
651 	zap_cursor_advance(&cursor);
652 	*offp = zap_cursor_serialize(&cursor);
653 	zap_cursor_fini(&cursor);
654 
655 	return (0);
656 }
657 
658 int
659 dmu_dir_list_next(objset_t *os, int namelen, char *name,
660     uint64_t *idp, uint64_t *offp)
661 {
662 	dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
663 	zap_cursor_t cursor;
664 	zap_attribute_t attr;
665 
666 	if (dd->dd_phys->dd_child_dir_zapobj == 0)
667 		return (ENOENT);
668 
669 	/* there is no next dir on a snapshot! */
670 	if (os->os->os_dsl_dataset->ds_object !=
671 	    dd->dd_phys->dd_head_dataset_obj)
672 		return (ENOENT);
673 
674 	zap_cursor_init_serialized(&cursor,
675 	    dd->dd_pool->dp_meta_objset,
676 	    dd->dd_phys->dd_child_dir_zapobj, *offp);
677 
678 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
679 		zap_cursor_fini(&cursor);
680 		return (ENOENT);
681 	}
682 
683 	if (strlen(attr.za_name) + 1 > namelen) {
684 		zap_cursor_fini(&cursor);
685 		return (ENAMETOOLONG);
686 	}
687 
688 	(void) strcpy(name, attr.za_name);
689 	if (idp)
690 		*idp = attr.za_first_integer;
691 	zap_cursor_advance(&cursor);
692 	*offp = zap_cursor_serialize(&cursor);
693 	zap_cursor_fini(&cursor);
694 
695 	return (0);
696 }
697 
698 /*
699  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
700  */
701 void
702 dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags)
703 {
704 	dsl_dir_t *dd;
705 	objset_t *os;
706 	uint64_t snapobj;
707 	zap_cursor_t zc;
708 	zap_attribute_t attr;
709 	char *child;
710 	int do_self;
711 
712 	dd = dsl_dir_open(name, FTAG, NULL);
713 	if (dd == NULL)
714 		return;
715 
716 	do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
717 
718 	/*
719 	 * Iterate over all children.
720 	 */
721 	if (dd->dd_phys->dd_child_dir_zapobj != 0) {
722 		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
723 		    dd->dd_phys->dd_child_dir_zapobj);
724 		    zap_cursor_retrieve(&zc, &attr) == 0;
725 		    (void) zap_cursor_advance(&zc)) {
726 			ASSERT(attr.za_integer_length == sizeof (uint64_t));
727 			ASSERT(attr.za_num_integers == 1);
728 
729 			/*
730 			 * No separating '/' because parent's name ends in /.
731 			 */
732 			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
733 			/* XXX could probably just use name here */
734 			dsl_dir_name(dd, child);
735 			(void) strcat(child, "/");
736 			(void) strcat(child, attr.za_name);
737 			dmu_objset_find(child, func, arg, flags);
738 			kmem_free(child, MAXPATHLEN);
739 		}
740 		zap_cursor_fini(&zc);
741 	}
742 
743 	/*
744 	 * Iterate over all snapshots.
745 	 */
746 	if ((flags & DS_FIND_SNAPSHOTS) &&
747 	    dmu_objset_open(name, DMU_OST_ANY,
748 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
749 
750 		snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
751 		dmu_objset_close(os);
752 
753 		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
754 		    zap_cursor_retrieve(&zc, &attr) == 0;
755 		    (void) zap_cursor_advance(&zc)) {
756 			ASSERT(attr.za_integer_length == sizeof (uint64_t));
757 			ASSERT(attr.za_num_integers == 1);
758 
759 			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
760 			/* XXX could probably just use name here */
761 			dsl_dir_name(dd, child);
762 			(void) strcat(child, "@");
763 			(void) strcat(child, attr.za_name);
764 			func(child, arg);
765 			kmem_free(child, MAXPATHLEN);
766 		}
767 		zap_cursor_fini(&zc);
768 	}
769 
770 	dsl_dir_close(dd, FTAG);
771 
772 	/*
773 	 * Apply to self if appropriate.
774 	 */
775 	if (do_self)
776 		func(name, arg);
777 }
778